In [294]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [295]:
dataset = pd.read_csv('data/autism.csv')

In [296]:
dataset.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2,'18 and more',?,NO


In [297]:
data_x = dataset.drop('Class/ASD', 1)
data_y = dataset['Class/ASD']

In [298]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder # For pipe
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Feature Engineering

In [299]:
# Step 1: Missing Values
# Step 2: Fill Missing Values
# Step 3: Check Datatypes
# Step 4: Encoding Attributes
# Step 5: Standard Scaling

In [300]:
data_x[data_x == '?'] = np.nan

  res_values = method(rvalues)


In [301]:
data_x.isnull().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 2
gender              0
ethnicity          95
jundice             0
austim              0
contry_of_res       0
used_app_before     0
result              0
age_desc            0
relation           95
dtype: int64

In [302]:
imputer = SimpleImputer(strategy = 'most_frequent')

In [303]:
data_x['relation'] = imputer.fit_transform(data_x[['relation']])
data_x['ethnicity'] = imputer.fit_transform(data_x[['ethnicity']])
data_x['age'] = imputer.fit_transform(data_x[['age']])

In [304]:
data_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A1_Score         704 non-null    int64 
 1   A2_Score         704 non-null    int64 
 2   A3_Score         704 non-null    int64 
 3   A4_Score         704 non-null    int64 
 4   A5_Score         704 non-null    int64 
 5   A6_Score         704 non-null    int64 
 6   A7_Score         704 non-null    int64 
 7   A8_Score         704 non-null    int64 
 8   A9_Score         704 non-null    int64 
 9   A10_Score        704 non-null    int64 
 10  age              704 non-null    object
 11  gender           704 non-null    object
 12  ethnicity        704 non-null    object
 13  jundice          704 non-null    object
 14  austim           704 non-null    object
 15  contry_of_res    704 non-null    object
 16  used_app_before  704 non-null    object
 17  result           704 non-null    in

In [305]:
# Age should be INT not obj

In [306]:
data_x = data_x.astype({'age': 'int64'})

In [307]:
data_x.dtypes

A1_Score            int64
A2_Score            int64
A3_Score            int64
A4_Score            int64
A5_Score            int64
A6_Score            int64
A7_Score            int64
A8_Score            int64
A9_Score            int64
A10_Score           int64
age                 int64
gender             object
ethnicity          object
jundice            object
austim             object
contry_of_res      object
used_app_before    object
result              int64
age_desc           object
relation           object
dtype: object

In [308]:
data_x_cat = data_x.select_dtypes(['object'])
data_x_num = data_x.select_dtypes(['int64'])

In [309]:
data_x_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   A1_Score   704 non-null    int64
 1   A2_Score   704 non-null    int64
 2   A3_Score   704 non-null    int64
 3   A4_Score   704 non-null    int64
 4   A5_Score   704 non-null    int64
 5   A6_Score   704 non-null    int64
 6   A7_Score   704 non-null    int64
 7   A8_Score   704 non-null    int64
 8   A9_Score   704 non-null    int64
 9   A10_Score  704 non-null    int64
 10  age        704 non-null    int64
 11  result     704 non-null    int64
dtypes: int64(12)
memory usage: 66.1 KB


In [310]:
data_x_cat.head()

Unnamed: 0,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,age_desc,relation
0,f,White-European,no,no,'United States',no,'18 and more',Self
1,m,Latino,no,yes,Brazil,no,'18 and more',Self
2,m,Latino,yes,yes,Spain,no,'18 and more',Parent
3,f,White-European,no,yes,'United States',no,'18 and more',Self
4,f,White-European,no,no,Egypt,no,'18 and more',Self


In [311]:
enc = LabelEncoder()

In [312]:
data_x_cat['jundice'] = enc.fit_transform(data_x_cat['jundice'])
data_x_cat['austim'] = enc.fit_transform(data_x_cat['austim'])
data_x_cat['used_app_before'] = enc.fit_transform(data_x_cat['used_app_before'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [313]:
pd.get_dummies(data_x_cat['relation'], drop_first = True)

Unnamed: 0,Others,Parent,Relative,Self
0,0,0,0,1
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
699,0,0,0,1
700,0,1,0,0
701,0,0,0,1
702,0,0,0,1


In [314]:
gender = pd.get_dummies(data_x_cat['gender'], drop_first = True)
ethnicity = pd.get_dummies(data_x_cat['ethnicity'], drop_first = True)
contry_of_res = pd.get_dummies(data_x_cat['contry_of_res'], drop_first = True)
age_desc = pd.get_dummies(data_x_cat['age_desc'], drop_first = True)
relation = pd.get_dummies(data_x_cat['relation'], drop_first = True)

In [315]:
relation = relation.rename(columns={'Others': 'other_rel'})

In [316]:
ethnicity

Unnamed: 0,'South Asian',Asian,Black,Hispanic,Latino,Others,Pasifika,Turkish,White-European,others
0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
699,0,0,0,0,0,0,0,0,1,0
700,0,0,0,1,0,0,0,0,0,0
701,0,0,0,0,0,0,0,0,1,0
702,1,0,0,0,0,0,0,0,0,0


In [317]:
relation

Unnamed: 0,other_rel,Parent,Relative,Self
0,0,0,0,1
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
699,0,0,0,1
700,0,1,0,0
701,0,0,0,1
702,0,0,0,1


In [318]:
data_x_cat = data_x_cat.join([gender,ethnicity,contry_of_res,age_desc,relation])

In [319]:
data_x_cat = data_x_cat.drop(['gender', 'ethnicity', 'contry_of_res', 'age_desc', 'relation'], 1)

In [320]:
data_x_cat.head()

Unnamed: 0,jundice,austim,used_app_before,m,'South Asian',Asian,Black,Hispanic,Latino,Others,...,Spain,Sweden,Tonga,Turkey,Ukraine,Uruguay,other_rel,Parent,Relative,Self
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [321]:
data_x = data_x_num.join([data_x_cat])

In [322]:
scaler = StandardScaler()

In [323]:
data_x = pd.DataFrame(scaler.fit_transform(data_x), columns = data_x.columns)

In [324]:
data_x.head() #Done

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,Spain,Sweden,Tonga,Turkey,Ukraine,Uruguay,other_rel,Parent,Relative,Self
0,0.62115,1.098588,1.08919,1.008559,-0.997163,-0.629941,1.180914,0.735174,-0.692092,-1.16046,...,-0.065419,-0.053376,-0.037716,-0.037716,-0.053376,-0.037716,-0.084576,-0.276501,-0.203519,0.375506
1,0.62115,1.098588,-0.918113,1.008559,-0.997163,-0.629941,-0.846802,0.735174,-0.692092,0.861727,...,-0.065419,-0.053376,-0.037716,-0.037716,-0.053376,-0.037716,-0.084576,-0.276501,-0.203519,0.375506
2,0.62115,1.098588,-0.918113,1.008559,1.002845,-0.629941,1.180914,0.735174,1.444894,0.861727,...,15.286159,-0.053376,-0.037716,-0.037716,-0.053376,-0.037716,-0.084576,3.616628,-0.203519,-2.663072
3,0.62115,1.098588,-0.918113,1.008559,-0.997163,-0.629941,1.180914,0.735174,-0.692092,0.861727,...,-0.065419,-0.053376,-0.037716,-0.037716,-0.053376,-0.037716,-0.084576,-0.276501,-0.203519,0.375506
4,0.62115,-0.910259,-0.918113,-0.991513,-0.997163,-0.629941,-0.846802,0.735174,-0.692092,-1.16046,...,-0.065419,-0.053376,-0.037716,-0.037716,-0.053376,-0.037716,-0.084576,-0.276501,-0.203519,0.375506


In [330]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Test & Train

In [331]:
train_x,test_x,train_y,test_y = train_test_split(data_x, data_y, test_size = 0.25, random_state = 44)

In [332]:
class habab_classification_branch:
    def __init__(self, x, y, x_, y_, dict_=False):
        self.x_test = x
        self.y_test = y
        self.x_train = x_
        self.y_train = y_
        self.dict_ins = {}
        self.dict = dict_
        
    def LOG(self):
        model = LogisticRegression().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'Logistic Score: {model}')
        else:
            self.dict_ins['LOG'] = str(model)
    
    def GRAD(self):
        model = SGDClassifier().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'SGDClassifier: {model}')
        else:
            self.dict_ins['GRAD'] = str(model)
            
    def VECTOR(self):
        model = SVC().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'SVC: {model}')
        else:
            self.dict_ins['SVC'] = str(model)
    
    def KN(self):
        model = KNeighborsClassifier().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'KNeighborsClassifier: {model}')
        else:
            self.dict_ins['KN'] = str(model)
    
    def ense(self):
        model = RandomForestClassifier().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'RandomForestClassifier: {model}')
        else:
            self.dict_ins['FOREST'] = str(model)
    
    def tree(self):        
        model = DecisionTreeClassifier().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'DecisionTreeClassifier: {model}')
        else:
            self.dict_ins['TREE'] = str(model)
    
    def bayes(self):
        model = GaussianNB().fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict == False:
            print(f'GaussianNB: {model}')
        else:
            self.dict_ins['NB'] = str(model)
    
    def check_return(self):
        if self.dict_ins == False:
            pass
        else:
            return self.dict_ins
    
    def start(self):
        print("""Staring Classification Training & Testing""")
        print()
        self.LOG()
        self.GRAD()
        self.VECTOR()
        self.KN()
        self.ense()
        self.tree()
        self.bayes()
        if self.dict_ins == False:
            pass
        else:
            return self.dict_ins

In [333]:
habab_classification_branch(test_x, test_y, train_x, train_y).start()

Staring Classification Training & Testing

Logistic Score: 0.9772727272727273
SGDClassifier: 0.9772727272727273
SVC: 0.9261363636363636
KNeighborsClassifier: 0.8806818181818182
RandomForestClassifier: 1.0
DecisionTreeClassifier: 1.0
GaussianNB: 0.3125


{}

In [334]:
Logestic = LogisticRegression().fit(tra)

In [None]:
Logestic.predict(test_x)