In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Location of the adult autism-screening CSV on the mounted drive.
DATA_PATH = '/content/drive/My Drive/Dataset/Adult-Adolescents-Child Datasets/Autism-Adult-Data.csv'
dt_f = pd.read_csv(DATA_PATH)

In [None]:
# Display the freshly loaded DataFrame as the cell output.
dt_f

In [None]:
# Print the column names, non-null counts and dtypes of the raw data.
dt_f.info()

In [None]:
# Show the dtype of every column.
dt_f.dtypes

In [None]:
# Preview the first ten rows.
dt_f.head(10)

In [None]:
# Summary statistics for the columns pandas considers numeric.
dt_f.describe()

# **Data Cleaning**

**Checking Missing Values**

In [None]:
# Count the null entries in each column.
dt_f.isnull().sum()

In [None]:
# Print the distinct values of every column and sort the column names into
# two lists: at most 10 distinct values -> categorical, otherwise continuous.
categorical_val, continous_val = [], []
for column in dt_f.columns:
    distinct_values = dt_f[column].unique()
    print('==============================')
    print(f"{column} : {distinct_values}")
    target_list = categorical_val if len(distinct_values) <= 10 else continous_val
    target_list.append(column)

In [None]:
# Number of distinct (non-null) values per column.
dt_f.nunique()

In [None]:
# Frequency of each value of the target column 'Class/ASD'.
dt_f['Class/ASD'].value_counts()

In [None]:
# Remove the 'age_desc' and 'id' columns, then preview the result.
dt_f = dt_f.drop(columns=['age_desc', 'id'])
dt_f.head()

In [None]:
# Fix misspelled column names from the raw file and give 'austim' a
# more descriptive label.
column_renames = {
    'austim': 'family member with PDD',
    'jundice': 'jaundice',
    'contry_of_res': 'country_of_res',
}
dt_f = dt_f.rename(columns=column_renames)

In [None]:
# Convert 'age' to a real numeric column. The raw data uses '?' as a
# missing-value placeholder; coercing turns it (and any other non-numeric
# entry) into an actual NaN instead of the string 'NaN', so the column can be
# correlated, plotted and imputed as a number.
dt_f['age'] = pd.to_numeric(dt_f['age'], errors='coerce')

# Encode the two-valued string columns as 0/1.
# NOTE: .map is not idempotent here - re-running this cell on already encoded
# data maps every value to NaN, so run it once on freshly loaded data.
dt_f['gender'] = dt_f['gender'].map({'m':1,'f':0})
dt_f['jaundice'] = dt_f['jaundice'].map({'yes':1,'no':0})
dt_f['family member with PDD'] = dt_f['family member with PDD'].map({'yes':1,'no':0})
dt_f['used_app_before'] = dt_f['used_app_before'].map({'yes':1,'no':0})
dt_f['Class/ASD'] = dt_f['Class/ASD'].map({'YES':1,'NO':0})

In [None]:
# Re-check dtypes and non-null counts after the encoding step.
dt_f.info()

In [None]:
# Merge equivalent ethnicity labels: the '?' placeholder and lower-case
# 'others' both become 'Others', and 'Latino' is folded into 'Hispanic'.
ethnicity_aliases = {'?': 'Others', 'others': 'Others', 'Latino': 'Hispanic'}
dt_f['ethnicity'] = dt_f['ethnicity'].replace(ethnicity_aliases)

In [None]:
# Treat the '?' placeholder in 'relation' as the category 'Caregiver'.
dt_f['relation'] = dt_f['relation'].replace('?', 'Caregiver')

In [None]:
# Re-check dtypes and non-null counts after the category clean-up.
dt_f.info()

In [None]:
# Frequency of each ethnicity label after merging the aliases.
dt_f['ethnicity'].value_counts()

**Exploratory Data Analysis**
1. Getting insights about the dataset
2. Handling missing values
3. Data visualization
4. Handling outliers

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# The frame still contains string columns at this point (e.g. 'ethnicity',
# 'country_of_res', 'relation'); pandas >= 2.0 raises when asked to correlate
# them, so restrict the computation to numeric columns explicitly.
plt.figure(figsize=(15,10))
numeric_columns = dt_f.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(),annot=True)
plt.title('Heatmap of Variable Correlations')
plt.show()

ASD correlates with the A9, A6, A5 and A4 scores. It also correlates highly with `result`, which is expected.

In [None]:
# Bar plot of the ASD label ('Class/ASD') against gender.
# The title and y-label previously read "data Spectrum Disorder"; they now
# match the wording used by the other plots.
plt.figure(figsize=(10,5))
sns.barplot(x='gender',y='Class/ASD',data=dt_f)
plt.title('Autism Spectrum Disorder vs Gender')
plt.xlabel('Gender')
plt.ylabel('Autism Spectrum Disorder')
plt.show()

In [None]:
# Row counts per gender, split by the ASD label.
gender_count_ax = sns.countplot(data=dt_f, x='gender', hue='Class/ASD')

# Render the figure
plt.show()

In [None]:
# Bar plot of the ASD label against ethnicity.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=dt_f, x='ethnicity', y='Class/ASD', ax=ax)
ax.set_title('Autism Spectrum Disorder vs Ethnicity')
ax.set_xlabel('Ethnicity')
ax.set_ylabel('Autism Spectrum Disorder')
plt.show()

In [None]:
# Bar plot of the ASD label against the jaundice flag.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=dt_f, x='jaundice', y='Class/ASD', ax=ax)
ax.set(
    title='Autism Spectrum Disorder vs Jaundice',
    xlabel='Jaundice',
    ylabel='Autism Spectrum Disorder',
)
plt.show()

In [None]:
# Bar plot of the ASD label against the family-history flag.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=dt_f, x='family member with PDD', y='Class/ASD', ax=ax)
ax.set_title('Autism Spectrum Disorder vs Family member with PDD')
ax.set_xlabel('Family member with PDD')
ax.set_ylabel('Autism Spectrum Disorder')
plt.show()

In [None]:
# Bar plot of the ASD label against who completed the screening.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=dt_f, x='relation', y='Class/ASD', ax=ax)
ax.set(
    title='Autism Spectrum Disorder vs Relation to the Examinee',
    xlabel='Relation to the Examinee',
    ylabel='Autism Spectrum Disorder',
)
plt.show()

In [None]:
# Bar plot of age for each ASD class.
# 'age' has not been imputed yet and may still be stored as strings, in which
# case seaborn would treat every distinct age as a category. Plot from a
# temporary copy whose 'age' is coerced to numeric (non-numeric entries become
# NaN and are ignored); dt_f itself is left untouched.
plt.figure(figsize=(10,5))
age_numeric = pd.to_numeric(dt_f['age'], errors='coerce')
sns.barplot(x='Class/ASD',y='age',data=dt_f.assign(age=age_numeric))
plt.title('Autism Spectrum Disorder vs Age')
plt.ylabel('Age')
plt.xlabel('Autism Spectrum Disorder')
plt.show()

In [None]:
# Allow up to 100 rows in rendered output so the full country list is visible.
pd.options.display.max_rows = 100
dt_f['country_of_res'].value_counts()

In [None]:
# Collapse countries with fewer than 10 rows into a single 'Others' bucket.
# The counts are filtered on the Series itself rather than on a DataFrame
# column looked up by name: pandas >= 2.0 names the value_counts() column
# 'count', so the old `.country_of_res` attribute lookup fails there.
min_rows_per_country = 10
country_counts = dt_f['country_of_res'].value_counts()
Autism_country_of_res = country_counts.to_frame()
lis = list(country_counts[country_counts < min_rows_per_country].index)
dt_f['country_of_res'] = dt_f['country_of_res'].mask(dt_f['country_of_res'].isin(lis), 'Others')
dt_f['country_of_res'].value_counts()

In [None]:
# Bar plot of the ASD label for each country of residence
# (countries on the y axis, the label on the x axis).
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=dt_f, x='Class/ASD', y='country_of_res', ax=ax)
ax.set_title('Autism Spectrum Disorder vs Country')
ax.set_ylabel('Country')
ax.set_xlabel('Autism Spectrum Disorder')
plt.show()

# Model building
We will build a model to predict whether or not a case is ASD, based on the features provided.

In [None]:
# Check dtypes and non-null counts before encoding the categorical columns.
dt_f.info()

In [None]:
# One-hot encode the remaining categorical columns in a single call.
# Each indicator column is prefixed with its source column name
# (e.g. 'ethnicity_Others', 'country_of_res_Others'). Without the prefix,
# categories shared by several columns (such as 'Others') produce duplicate
# column names that cannot be told apart afterwards.
# The encoded source columns are dropped by get_dummies itself and the
# indicator columns are appended in the order listed here.
dt_f = pd.get_dummies(
    dt_f,
    columns=['ethnicity', 'country_of_res', 'relation'],
    drop_first=False,
)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Fill missing feature values with IterativeImputer.
# The target column 'Class/ASD' is deliberately kept out of the imputer so the
# label cannot leak into imputed feature values; it is copied through as-is.
# Columns are selected by position (boolean mask) so this also works when the
# frame contains duplicate column names.
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split; fitting it on the training rows only would avoid that leakage too.
imputer = IterativeImputer()
col = dt_f.columns
is_feature = np.asarray(col != 'Class/ASD')

imputed_values = np.empty(dt_f.shape, dtype=float)
imputed_values[:, is_feature] = imputer.fit_transform(dt_f.loc[:, is_feature])
imputed_values[:, ~is_feature] = dt_f.loc[:, ~is_feature].to_numpy(dtype=float)

# Rebuild an all-float frame with the original column order.
dt_f = pd.DataFrame(imputed_values,columns=col)

In [None]:
# Preview the frame after imputation.
dt_f.head()

In [None]:
# Round 'age' to whole years and show the resulting distribution.
dt_f['age'] = dt_f['age'].round()
dt_f['age'].value_counts()

In [None]:
# Class balance of the target before splitting.
dt_f['Class/ASD'].value_counts()

Dropping the `result` column, as it is an amalgamation of the A1 to A10 score columns.

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the target and 'result'.
X = dt_f.drop(['Class/ASD','result'],axis=1)
y = dt_f['Class/ASD']

# 80/20 split. The seed was previously written as `True`, which only worked
# because bool is an integer subtype; the explicit 1 gives the same split.
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.80,random_state=1)
X_train.head()

In [None]:
# Preview the held-out feature rows.
X_test.head()

In [None]:
# Report the shapes of the feature/label pairs for both splits.
for split_name, split_X, split_y in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{split_name}: {split_X.shape}, {split_y.shape}")

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Multi-layer perceptron classifier. A fixed random_state makes the weight
# initialisation, and therefore the reported scores, reproducible between runs.
model1 = MLPClassifier(alpha=1, max_iter=300, random_state=1)
model1.fit(X_train.astype(float), y_train.astype(float))

# 4-fold cross-validated accuracy on the training split
# (cross_val_score fits fresh clones, so model1 itself is not modified).
cross_val_score(model1, X_train, y_train, cv=4, scoring='accuracy')

In [None]:
# Depth-limited decision tree. A fixed random_state makes tie-breaking between
# equally good splits, and therefore the fitted tree, reproducible.
model2 = DecisionTreeClassifier(max_depth=5, random_state=1)
model2.fit(X_train.astype(float), y_train.astype(float))

# 4-fold cross-validated accuracy on the training split.
cross_val_score(model2, X_train, y_train, cv=4, scoring='accuracy')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Evaluate the MLP classifier on the train and test splits.
preds1 = model1.predict(X_test)
print(f"Accuracy on train data by MLP Classifier: {accuracy_score(y_train, model1.predict(X_train))*100}")

print(f"Accuracy on test data by MLP Classifier: {accuracy_score(y_test, preds1)*100}")

cf_matrix = confusion_matrix(y_test, preds1)
plt.figure(figsize=(10,10))
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cf_matrix, annot=True, fmt='d')
plt.title("Confusion Matrix for MLP Classifier on Test Data")
plt.show()
# Show precision, recall and F1 score
print(classification_report(y_test, preds1))

In [None]:
# Display the MLP classifier's predictions for the test split.
preds1

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the test labels and the MLP predictions.
mlp_mse = mean_squared_error(y_test, preds1)
sqrt(mlp_mse)

In [None]:
# Evaluate the decision tree classifier (model2) on the train and test splits.
preds3 = model2.predict(X_test)
# The train accuracy must come from model2; it was previously computed with
# model1 (the MLP), so the MLP's score was reported under this label.
print(f"Accuracy on train data by Decision Tree Classifier: {accuracy_score(y_train, model2.predict(X_train))*100}")

print(f"Accuracy on test data by Decision Tree Classifier: {accuracy_score(y_test, preds3)*100}")

cf_matrix = confusion_matrix(y_test, preds3)
plt.figure(figsize=(10,10))
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cf_matrix, annot=True, fmt='d')
plt.title("Confusion Matrix for Decision Tree Classifier on Test Data")
plt.show()
# Show precision, recall and F1 score
print(classification_report(y_test, preds3))

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the test labels and the decision tree predictions.
tree_mse = mean_squared_error(y_test, preds3)
sqrt(tree_mse)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings. A fixed random_state makes the fitted
# ensemble, and therefore the predictions below, reproducible between runs.
abc = AdaBoostClassifier(random_state=1)
abc.fit(X_train, y_train)
y_pred_abc = abc.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix and accuracy of the AdaBoost predictions on the test split.
cm = confusion_matrix(y_test, y_pred_abc)
acc_abc = accuracy_score(y_test, y_pred_abc)
print (cm)
print (acc_abc)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression. The features are not scaled, so the default limit of
# 100 solver iterations may stop before convergence, and the resulting
# ConvergenceWarning is hidden by the notebook-wide warnings filter.
# A higher iteration limit gives the solver room to converge.
model3 = LogisticRegression(max_iter=1000)
model3.fit(X_train.astype(float), y_train.astype(float))

# 4-fold cross-validated accuracy on the training split.
cross_val_score(model3, X_train, y_train, cv=4, scoring='accuracy')

In [None]:
# Evaluate the logistic regression model on the train and test splits.
preds2 = model3.predict(X_test)
print(f"Accuracy on train data by Logistic Regression: {accuracy_score(y_train, model3.predict(X_train))*100}")

print(f"Accuracy on test data by Logistic Regression: {accuracy_score(y_test, preds2)*100}")

cf_matrix = confusion_matrix(y_test, preds2)
plt.figure(figsize=(10,10))
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cf_matrix, annot=True, fmt='d')
plt.title("Confusion Matrix for Logistic Regression on Test Data")
plt.show()
# Show precision, recall and F1 score
print(classification_report(y_test, preds2))

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the test labels and the logistic regression predictions.
logreg_mse = mean_squared_error(y_test, preds2)
sqrt(logreg_mse)

Training the model means creating it by fitting it to the training data.

Testing the model means measuring its accuracy on data it was not trained on.