<a href="https://colab.research.google.com/github/SRARNAB7/HDS_5320_Group_Project/blob/main/Practice_Project%20Arnab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder # To work with the categorical Features in the dataset
from imblearn.over_sampling import SMOTE # To do sampling when required for class imbalace
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle # for saving the models


In [2]:
#Reading the csv file
df = pd.read_csv("train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               800 non-null    int64  
 1   A1_Score         800 non-null    int64  
 2   A2_Score         800 non-null    int64  
 3   A3_Score         800 non-null    int64  
 4   A4_Score         800 non-null    int64  
 5   A5_Score         800 non-null    int64  
 6   A6_Score         800 non-null    int64  
 7   A7_Score         800 non-null    int64  
 8   A8_Score         800 non-null    int64  
 9   A9_Score         800 non-null    int64  
 10  A10_Score        800 non-null    int64  
 11  age              800 non-null    float64
 12  gender           800 non-null    object 
 13  ethnicity        800 non-null    object 
 14  jaundice         800 non-null    object 
 15  austim           800 non-null    object 
 16  contry_of_res    800 non-null    object 
 17  used_app_before 

In [4]:
#convert Age column to int
df["age"] = df["age"].astype(int)

In [5]:
for col in df.columns:
  numerical_features = ["ID","age","result"]
  if col not in numerical_features:
    print(col, df[col].unique())
    print("_"*50)


A1_Score [1 0]
__________________________________________________
A2_Score [0 1]
__________________________________________________
A3_Score [1 0]
__________________________________________________
A4_Score [0 1]
__________________________________________________
A5_Score [1 0]
__________________________________________________
A6_Score [0 1]
__________________________________________________
A7_Score [1 0]
__________________________________________________
A8_Score [0 1]
__________________________________________________
A9_Score [1 0]
__________________________________________________
A10_Score [1 0]
__________________________________________________
gender ['f' 'm']
__________________________________________________
ethnicity ['?' 'White-European' 'Middle Eastern ' 'Pasifika' 'Black' 'Others'
 'Hispanic' 'Asian' 'Turkish' 'South Asian' 'Latino' 'others']
__________________________________________________
jaundice ['no' 'yes']
__________________________________________________
austim

In [6]:
#dropping ID and age_desc columns
df = df.drop(columns = ["ID","age_desc"])


In [7]:
#define the mapping dictionary for country names

mapping = {
    "Viet Nam" : "Vietnam",
    "AmericanSamoa" : "United States",
    "Hong Kong" : "China"
}

df["contry_of_res"] = df["contry_of_res"].replace(mapping)

In [8]:
#Target Class Distribution
df["Class/ASD"].value_counts()

Unnamed: 0_level_0,count
Class/ASD,Unnamed: 1_level_1
0,639
1,161


In [9]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,relation,Class/ASD
0,1,0,1,0,1,0,1,0,1,1,38,f,?,no,no,Austria,no,6.351166,Self,0
1,0,0,0,0,0,0,0,0,0,0,47,m,?,no,no,India,no,2.255185,Self,0
2,1,1,1,1,1,1,1,1,1,1,7,m,White-European,no,yes,United States,no,14.851484,Self,1
3,0,0,0,0,0,0,0,0,0,0,23,f,?,no,no,United States,no,2.276617,Self,0
4,0,0,0,0,0,0,0,0,0,0,43,m,?,no,no,South Africa,no,-4.777286,Self,0


In [10]:
df["ethnicity"] = df["ethnicity"].replace({"?": "Others", "others": "Others"})

In [11]:
df["relation"] = df["relation"].replace({"?": "Others",
                                          "Relative": "Others",
                                          "Parent": "Others",
                                          "Health care professional" : "Others"})

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         800 non-null    int64  
 1   A2_Score         800 non-null    int64  
 2   A3_Score         800 non-null    int64  
 3   A4_Score         800 non-null    int64  
 4   A5_Score         800 non-null    int64  
 5   A6_Score         800 non-null    int64  
 6   A7_Score         800 non-null    int64  
 7   A8_Score         800 non-null    int64  
 8   A9_Score         800 non-null    int64  
 9   A10_Score        800 non-null    int64  
 10  age              800 non-null    int64  
 11  gender           800 non-null    object 
 12  ethnicity        800 non-null    object 
 13  jaundice         800 non-null    object 
 14  austim           800 non-null    object 
 15  contry_of_res    800 non-null    object 
 16  used_app_before  800 non-null    object 
 17  result          

In [13]:
df["Class/ASD"].isnull().any()

np.False_

In [14]:
#Identify columns with "object" data type
object_columns = df.select_dtypes(include = ["object"]).columns
object_columns

Index(['gender', 'ethnicity', 'jaundice', 'austim', 'contry_of_res',
       'used_app_before', 'relation'],
      dtype='object')

In [15]:
#Initialize a dictionary  to store the encoders
encoders = {}
#Apply label encoding and store the encoders
for column in object_columns:
  label_encoder = LabelEncoder()
  df[column]= label_encoder.fit_transform(df[column])
  encoders[column] = label_encoder

 # save the encoders as a pickle file
  with open("encoders.pkl", "wb") as f:
   pickle.dump(encoders,f)

FEATURE SELECTION

In [16]:
import pandas as pd
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from numpy import set_printoptions

y = df["Class/ASD"]

# ----- Features -----
features = ['age','gender','ethnicity','jaundice','austim',
            'contry_of_res','used_app_before','result','relation']

X = df[features]

# Split numeric continuous vs categorical
categorical_cols = ['gender','ethnicity','jaundice','austim',
                    'contry_of_res','used_app_before','relation']
continuous_cols = ['age','result']

# --- Chi2 for categorical-like features ---
X_cat = X[categorical_cols].clip(lower =0) # chi2 needs non-negative
chi2_selector = SelectKBest(score_func = chi2 , k = 'all')
chi2_fit = chi2_selector.fit(X_cat, y)

chi2_scores = pd.DataFrame({
    'Feature' : categorical_cols,
    'Chi2 Score' : chi2_fit.scores_
}).sort_values(by = 'Chi2 Score', ascending = False)

print("\nChi2 scores(Categorical Features:\n", chi2_scores)

# --- ANOVA F-test for continuous features ---

X_cont = X[continuous_cols]
f_selector = SelectKBest(score_func= f_classif, k = 'all')
f_fit = f_selector.fit(X_cont,y)

f_scores = pd.DataFrame({
    'Feature': continuous_cols,
    'F-score': f_fit.scores_
}).sort_values(by = 'F-score', ascending = False)

print("\nANOVA F-Scores (continuous features):\n", f_scores)




Chi2 scores(Categorical Features:
            Feature  Chi2 Score
4    contry_of_res  292.171721
1        ethnicity  160.243609
3           austim   86.325667
2         jaundice   11.845468
5  used_app_before    1.073591
6         relation    0.163274
0           gender    0.005152

ANOVA F-Scores (continuous features):
   Feature     F-score
1  result  112.790540
0     age    9.741231


In [17]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

y = df['Class/ASD']
X = df.drop(columns = ['Class/ASD'])

# One-hot encode categorical variables so Logistic Regression can handle them
X = pd.get_dummies(X, drop_first = True)

# Define base model
model = LogisticRegression(max_iter = 1000,solver = 'liblinear')

# Apply RFE
rfe = RFE(model, n_features_to_select= 8)
fit = rfe.fit(X,y)

# Rankings: 1 = selected feature, higher = less important
ranking = pd.DataFrame({
    'Feature': X.columns,
    'Ranking': fit.ranking_
}).sort_values(by= 'Ranking')

print("\n Feature Ranking:\:", ranking)

# Show selected features
selected = X.columns[fit.support_]
print("\n Selected top 3 Features:\n", selected.tolist())


 Feature Ranking:\:             Feature  Ranking
3          A4_Score        1
2          A3_Score        1
5          A6_Score        1
4          A5_Score        1
6          A7_Score        1
7          A8_Score        1
8          A9_Score        1
18         relation        1
1          A2_Score        2
14           austim        3
0          A1_Score        4
11           gender        5
13         jaundice        6
9         A10_Score        7
12        ethnicity        8
16  used_app_before        9
17           result       10
10              age       11
15    contry_of_res       12

 Selected top 3 Features:
 ['A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'relation']


  print("\n Feature Ranking:\:", ranking)


In [19]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

y = df['Class/ASD']
X = df.drop(columns= ['Class/ASD'])

model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_)

[0.02279495 0.02062564 0.04901418 0.09166061 0.06483354 0.12129751
 0.0459287  0.04024718 0.05954977 0.01598367 0.09817717 0.03194289
 0.06276771 0.03065162 0.03512113 0.0800464  0.01076841 0.09933575
 0.01925318]


FITTING THE MODELS

In [30]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

#Feartures and Target
X = df[['result','austim','jaundice']]
y = df['Class/ASD']

#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

# Define and fit the model
logr= linear_model.LogisticRegression(max_iter=1000)
logr.fit(X_train,y_train)

# Confusion matrix and accuracy
y_pred = logr.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy:", acc)


Confusion Matrix:
 [[180   6]
 [ 37  17]]
Accuracy: 0.8208333333333333


In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

X = df[['result','austim','jaundice']]
y = df["Class/ASD"]

# Define and fit logistic regression model
model = LogisticRegression(solver = 'liblinear', C= 10, random_state = 0, max_iter=1000)
model.fit(X,y)

# Predictions
y_pred = model.predict(X)

accuracy = model.score(X,y)
conf_m = confusion_matrix(y, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_m)


Accuracy: 0.8225
Confusion Matrix:
 [[604  35]
 [107  54]]


In [33]:
# Import everything required for KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

knn = Pipeline([("scaler", StandardScaler()),
                ("clf", KNeighborsClassifier(n_neighbors = 5))])

knn.fit(X_train, y_train)


