# Problem:
Autism detection based on parameters like Age, Ethnicity, AQ_Score, Country, etc.

## Data Collection

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore') # Setting to ignore warnings

In [2]:
# Load dataset
data = pd.read_csv(r"C:\Users\sukhd\OneDrive\Desktop\Summer Training\Capstone Project\Autism_dataset_cleaned.csv")
data.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,Age,Gender,Ethnicity,Jaundice,Autism,Country_of_res,Used_app_before,Result,Relation,Class/ASD
0,1,0,1,0,1,0,1,0,1,1,38.1727,f,Unknown,no,no,Austria,no,6.35117,Self,0
1,0,0,0,0,0,0,0,0,0,0,47.7505,m,Unknown,no,no,India,no,2.25519,Self,0
2,1,1,1,1,1,1,1,1,1,1,7.38037,m,White-European,no,yes,United States,no,14.8515,Self,1
3,0,0,0,0,0,0,0,0,0,0,23.5619,f,Unknown,no,no,United States,no,2.27662,Self,0
4,0,0,0,0,0,0,0,0,0,0,43.2058,m,Unknown,no,no,South Africa,no,-4.77729,Self,0


In [3]:
# Creating data back-up
df = data.copy()

# Data exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 798 entries, 0 to 797
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         798 non-null    int64  
 1   A2_Score         798 non-null    int64  
 2   A3_Score         798 non-null    int64  
 3   A4_Score         798 non-null    int64  
 4   A5_Score         798 non-null    int64  
 5   A6_Score         798 non-null    int64  
 6   A7_Score         798 non-null    int64  
 7   A8_Score         798 non-null    int64  
 8   A9_Score         798 non-null    int64  
 9   A10_Score        798 non-null    int64  
 10  Age              798 non-null    float64
 11  Gender           798 non-null    object 
 12  Ethnicity        798 non-null    object 
 13  Jaundice         798 non-null    object 
 14  Autism           798 non-null    object 
 15  Country_of_res   798 non-null    object 
 16  Used_app_before  798 non-null    object 
 17  Result          

## Data Scaling

In [4]:
# Separating target column
X = df.drop(columns = ['Class/ASD'], axis = 1)
y = df['Class/ASD']

In [5]:
# Categorical features
cat_features = df.select_dtypes(include = ['object']).columns.to_list()
print("Categorical features are: ", cat_features)

Categorical features are:  ['Gender', 'Ethnicity', 'Jaundice', 'Autism', 'Country_of_res', 'Used_app_before', 'Relation']


In [6]:
# Numeric features which don't need scaling
num_features = df.select_dtypes(include = ['int']).columns.to_list()
print("Numeric features which are complete: ", num_features)

Numeric features which are complete:  ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'Class/ASD']


In [7]:
# Numeric features which need scaling
num_features_scale = df.select_dtypes(include = ['float']).columns.to_list()
print("Numeric features which need scaling: ", num_features_scale)

Numeric features which need scaling:  ['Age', 'Result']


In [8]:
# Scale columns
scaler = StandardScaler()
scaled_num = scaler.fit_transform(X[num_features_scale])
df_scaled_features = pd.DataFrame(scaled_num, columns = num_features_scale)
df_scaled_features.head()

Unnamed: 0,Age,Result
0,0.594888,-0.454135
1,1.181964,-1.30623
2,-1.292542,1.314206
3,-0.300688,-1.301772
4,0.903394,-2.769211


## Data Encoding

In [9]:
# Encoding categorical columns
df_cat_features = X[cat_features]
for col in cat_features:
    df_cat_features[col], t = pd.factorize(df[col])
df_cat_features.head()

Unnamed: 0,Gender,Ethnicity,Jaundice,Autism,Country_of_res,Used_app_before,Relation
0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0
2,1,1,0,1,2,0,0
3,0,0,0,0,2,0,0
4,1,0,0,0,3,0,0


In [10]:
# Combine dataframes
X = pd.concat([X[[f'A{i}_Score' for i in range(1,11)]].reset_index(drop = True), df_scaled_features.reset_index(drop = True), df_cat_features.reset_index(drop = True)], axis = 1)
X.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,Age,Result,Gender,Ethnicity,Jaundice,Autism,Country_of_res,Used_app_before,Relation
0,1,0,1,0,1,0,1,0,1,1,0.594888,-0.454135,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1.181964,-1.30623,1,0,0,0,1,0,0
2,1,1,1,1,1,1,1,1,1,1,-1.292542,1.314206,1,1,0,1,2,0,0
3,0,0,0,0,0,0,0,0,0,0,-0.300688,-1.301772,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,0.903394,-2.769211,1,0,0,0,3,0,0


## Model Building

In [11]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
# Define models
# Model list
model = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boost': GradientBoostingClassifier()
}

# Hyperparameters search params
search_params = {
    'fit_intercept': [True, False],
    'positive': [False, True],
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [200, 500, 1000],
    'class_weight': [None, 'balanced'],
    'l1_ratio': [0.1, 0.5, 0.9],
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [10, 20, 30, 50],
    'min_samples_leaf': [5, 10, 15, 20],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

# Function to filter valid params for each model
def filter_hyperparameter(model, space):
    valid_keys = model.get_params().keys()
    param_grid = {k: v for k, v in space.items() if k in valid_keys}

    if isinstance(model, GradientBoostingClassifier):
        param_grid['criterion'] = ['friedman_mse', 'squared_error']
    return param_grid

In [13]:
# Run grid search for each model
results = []

for name, model in model.items():
    print(f"Tuning {name}...")

    param_grid = filter_hyperparameter(model, search_params)
    grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    best_params = grid.best_params_

    if(name != 'Linear Regression'):
        report = metrics.classification_report(y_test, y_pred, output_dict=True)
        print(f"Best Params for {name}: {best_params}")
        results.append({
            'Model_Name': name,
            'Best_Parameter': best_params,
            'Accuracy': f'{round(metrics.accuracy_score(y_test, y_pred), 4)*100}%',
            'F1_Score': f'{round(report['weighted avg']['f1-score'], 4)*100}%'
        })
    else:
        mse = metrics.mean_squared_error(y_test, y_pred)
        r2 = metrics.r2_score(y_test, y_pred)
        print(f"Best Params for {name}: {best_params}")
        results.append({
            'Model_Name': name,
            'Best_Parameter': best_params,
            'Mean Squared Error': f'{round(mse, 4)*100}%',
            'R-Squared Value': f'{round(r2, 4)*100}%'
        })

print("Tuning Process is complete\n")

Tuning Linear Regression...
Best Params for Linear Regression: {'fit_intercept': True, 'positive': False}
Tuning Logistic Regression...
Best Params for Logistic Regression: {'C': 0.1, 'class_weight': None, 'fit_intercept': True, 'l1_ratio': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Tuning Decision Tree...
Best Params for Decision Tree: {'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 50}
Tuning Random Forest...
Best Params for Random Forest: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 50, 'n_estimators': 200}
Tuning Gradient Boost...
Best Params for Gradient Boost: {'criterion': 'squared_error', 'learning_rate': 0.05, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 20, 'min_samples_split': 50, 'n_estimators': 100, 'subsample': 0.6}
Tuning Process is complete



In [14]:
for i in results:
    for k,v in i.items():
        print(f'{k}: {v}')
    if(i != results[-1]): print('='*35)

Model_Name: Linear Regression
Best_Parameter: {'fit_intercept': True, 'positive': False}
Mean Squared Error: 11.15%
R-Squared Value: 38.46%
Model_Name: Logistic Regression
Best_Parameter: {'C': 0.1, 'class_weight': None, 'fit_intercept': True, 'l1_ratio': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 86.25%
F1_Score: 85.83%
Model_Name: Decision Tree
Best_Parameter: {'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 50}
Accuracy: 81.25%
F1_Score: 79.93%
Model_Name: Random Forest
Best_Parameter: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 50, 'n_estimators': 200}
Accuracy: 88.12%
F1_Score: 87.53999999999999%
Model_Name: Gradient Boost
Best_Parameter: {'criterion': 'squared_error', 'learning_rate': 0.05, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 20, 'min_samples_split': 50, 'n_estimato