In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (keeps cell output compact).
warnings.simplefilter('ignore')
# Default size, in inches, for all matplotlib figures drawn below.
plt.rcParams.update({"figure.figsize": [10, 5]})

In [2]:
# Load the training set, using the 'ID' column as the row index.
df_train = pd.read_csv("train.csv", index_col='ID')

In [3]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 800
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         800 non-null    int64  
 1   A2_Score         800 non-null    int64  
 2   A3_Score         800 non-null    int64  
 3   A4_Score         800 non-null    int64  
 4   A5_Score         800 non-null    int64  
 5   A6_Score         800 non-null    int64  
 6   A7_Score         800 non-null    int64  
 7   A8_Score         800 non-null    int64  
 8   A9_Score         800 non-null    int64  
 9   A10_Score        800 non-null    int64  
 10  age              800 non-null    float64
 11  gender           800 non-null    object 
 12  ethnicity        800 non-null    object 
 13  jaundice         800 non-null    object 
 14  austim           800 non-null    object 
 15  contry_of_res    800 non-null    object 
 16  used_app_before  800 non-null    object 
 17  result          

In [4]:
# Keep the raw frame untouched; all exploration below works on this copy.
df_train_1 = df_train.copy(deep=True)
# Summary statistics for the only continuous demographic column.
df_train_1.loc[:, 'age'].describe()

count    800.000000
mean      28.612306
std       12.872373
min        9.560505
25%       19.282082
50%       25.479960
75%       33.154755
max       72.402488
Name: age, dtype: float64

In [5]:
# Preview the first rows of the working copy.
df_train_1.head()

Unnamed: 0_level_0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,1,1,1,0,1,1,1,...,f,White-European,no,no,United States,no,7.819715,18 and more,Self,0
2,0,0,0,0,0,0,0,0,0,1,...,f,South Asian,no,no,Australia,no,10.544296,18 and more,?,0
3,1,1,1,1,1,1,0,0,1,1,...,f,White-European,no,no,United Kingdom,no,13.167506,18 and more,Self,1
4,0,0,0,1,0,0,0,0,0,0,...,f,South Asian,no,no,New Zealand,no,1.530098,18 and more,?,0
5,0,0,0,0,1,0,0,0,1,1,...,m,Black,no,yes,Italy,no,7.949723,18 and more,Self,0


In [6]:
# Column groups used throughout the notebook.
# The ten questionnaire answer columns: 'A1_Score' .. 'A10_Score'.
aq_features = [f'A{idx}_Score' for idx in range(1, 11)]
numerical_features = ['age', 'result']
# Spellings such as 'austim' and 'contry_of_res' match the column names in the data.
categorical_features = [
    'gender',
    'ethnicity',
    'jaundice',
    'austim',
    'contry_of_res',
    'used_app_before',
    'age_desc',
    'relation',
]
# Name of the label column.
target = 'Class/ASD'

In [7]:
# List the distinct values of every categorical column to spot placeholders
# and inconsistent spellings before encoding.
for name, values in df_train_1[categorical_features].items():
    print(name, values.unique())

gender ['f' 'm']
ethnicity ['White-European' 'South Asian' 'Black' 'Asian' 'Middle Eastern ' '?'
 'others' 'Latino' 'Turkish' 'Others' 'Hispanic' 'Pasifika']
jaundice ['no' 'yes']
austim ['no' 'yes']
contry_of_res ['United States' 'Australia' 'United Kingdom' 'New Zealand' 'Italy'
 'Nicaragua' 'Canada' 'United Arab Emirates' 'Netherlands' 'Sri Lanka'
 'India' 'Armenia' 'Sierra Leone' 'Argentina' 'Azerbaijan' 'Iceland'
 'Egypt' 'Serbia' 'Afghanistan' 'Costa Rica' 'Jordan' 'Angola' 'Pakistan'
 'Brazil' 'Ireland' 'Kazakhstan' 'Viet Nam' 'Ethiopia' 'Austria' 'Finland'
 'France' 'Malaysia' 'Japan' 'Spain' 'Philippines' 'Iran' 'Czech Republic'
 'Russia' 'Romania' 'Mexico' 'Belgium' 'Aruba' 'Uruguay' 'Indonesia'
 'Ukraine' 'AmericanSamoa' 'Germany' 'China' 'Iraq' 'Tonga' 'South Africa'
 'Saudi Arabia' 'Hong Kong' 'Bahamas' 'Ecuador' 'Cyprus' 'Bangladesh'
 'Oman' 'Bolivia' 'Sweden' 'Niger']
used_app_before ['no' 'yes']
age_desc ['18 and more']
relation ['Self' '?' 'Health care professional' 'P

In [8]:
# Count the distinct values in 'age_desc' to check whether the column carries any information.
df_train_1['age_desc'].nunique()

1

Drop `age_desc`: it has a single unique value ('18 and more'), so it carries no information.

In [9]:
# Remove the constant 'age_desc' column, then replace every remaining
# categorical column with its integer category codes.
df_train_2 = df_train_1.drop(columns=['age_desc'])
for col in (name for name in categorical_features if name != 'age_desc'):
    # Codes follow the column's category order; the original labels are discarded here.
    df_train_2[col] = pd.Categorical(df_train_2[col]).codes

In [10]:
# Correlation of every column with the label, strongest positive first.
# NOTE(review): the categorical columns were integer-coded above, so values for
# multi-level columns (e.g. ethnicity, contry_of_res) depend on code order — interpret with care.
corr_map = (
    df_train_2
    .corr()
    .loc[:, 'Class/ASD']
    .sort_values(ascending=False)
)

In [11]:
# Show each column's correlation with 'Class/ASD', sorted in descending order.
corr_map

Class/ASD          1.000000
A3_Score           0.549618
A6_Score           0.542588
A9_Score           0.538688
A4_Score           0.512792
result             0.496165
A5_Score           0.448473
A10_Score          0.439587
A7_Score           0.394660
A2_Score           0.380703
austim             0.377079
ethnicity          0.324323
A1_Score           0.289991
A8_Score           0.219034
jaundice           0.190769
relation           0.152808
age                0.146376
contry_of_res      0.110710
used_app_before   -0.015853
gender            -0.124789
Name: Class/ASD, dtype: float64

In [12]:
# Names of the columns whose correlation with the label exceeds 0.35
# (the label itself is included, since its self-correlation is 1).
corr_map.loc[corr_map.gt(0.35)].index

Index(['Class/ASD', 'A3_Score', 'A6_Score', 'A9_Score', 'A4_Score', 'result',
       'A5_Score', 'A10_Score', 'A7_Score', 'A2_Score', 'austim'],
      dtype='object')

In [16]:
# Start from a copy of the raw frame with every A*_Score column cast to str,
# so the ten answers can be joined into one pattern string per row.
df_group = df_train.astype({aq: 'str' for aq in aq_features})
# Concatenate A1..A10 in aq_features order, e.g. '1011110111' for ID 1.
df_group['aq_features'] = df_group[aq_features[0]].str.cat(df_group[aq_features[1:]])

In [20]:
# The combined 'aq_features' string replaces the ten individual score columns.
df_group_1 = df_group.drop(aq_features, axis='columns')

In [21]:
# List the remaining columns to confirm the individual A*_Score columns are gone.
df_group_1.columns

Index(['age', 'gender', 'ethnicity', 'jaundice', 'austim', 'contry_of_res',
       'used_app_before', 'result', 'age_desc', 'relation', 'Class/ASD',
       'aq_features'],
      dtype='object')

In [22]:
# Store the answer-pattern strings as a pandas categorical column.
df_group_1['aq_features'] = pd.Categorical(df_group_1['aq_features'])