In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
svm = SVC(random_state=42)

from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()

from sklearn.ensemble import RandomForestClassifier
rand_forest = RandomForestClassifier()

from pandas_profiling import ProfileReport

In [2]:
def model_pipeline(df_train_1, model, all_features, target ):
    X_= df_train_1[all_features]
    y_= df_train_1[target]

    X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.25, random_state=42)

    pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'), 
    model
    )

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_val)
    acc = accuracy_score(y_pred, y_val)

    return acc, pipeline

In [3]:
def create_submission_file(model, df_test, target, submission_file_name):
    y_test_pred = model.predict(df_test)
    res = pd.read_csv('sample_submission.csv')
    res[target] = y_test_pred
    res.to_csv(submission_file_name, index=False)

In [4]:
df_train = pd.read_csv("train.csv").set_index("ID")
df_test = pd.read_csv("test.csv").set_index("ID")
df_train.shape, df_test.shape

((800, 21), (200, 20))

In [5]:
aq_features = ['A'+str(i)+'_Score' for i in range(1,11)]
numerical_features = ['age', 'result']
categorical_features = ['gender', 'ethnicity', 'jaundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'relation']
target = 'Class/ASD'

In [19]:
def aq_features_class(df):
    df_group = df.copy()
    for aq in aq_features:
        df_group[aq] = df_group[aq].astype('str')
    df_group['aq_features'] =  df_group['A1_Score'] + df_group['A2_Score'] + df_group['A3_Score'] + df_group['A4_Score'] + df_group['A5_Score'] + df_group['A6_Score'] + df_group['A7_Score'] + df_group['A8_Score'] + df_group['A9_Score'] + df_group['A10_Score']
    df_group['aq_features'] = df_group['aq_features'].astype('category')

    #df_group_1 =  df_group.drop(columns = aq_features)

    return df_group


In [20]:
df2_ = aq_features_class(df_train)

In [21]:
all_features = df2_.drop(columns=['Class/ASD']).columns
target = 'Class/ASD'

In [22]:
model_pipeline(df2_ , LogisticRegression(), all_features, target)

(0.87,
 Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                 ('logisticregression', LogisticRegression())]))

In [26]:
acc, model = model_pipeline(df2_ , svm, all_features, target)
acc

0.89

In [29]:
create_submission_file(model, aq_features_class(df_test), target, "iteration_4_svm.csv")

In [24]:
model_pipeline(df2_ , dec_tree, all_features, target)

(0.84,
 Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                 ('decisiontreeclassifier', DecisionTreeClassifier())]))

In [25]:
model_pipeline(df2_ ,rand_forest, all_features, target)

(0.87,
 Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                 ('randomforestclassifier', RandomForestClassifier())]))

## Ideas:
1. ~~Group by all the autism specturm~~
2. Try different encoders
3. normalization
4. Grouping based on contininent or level of country (health index)
 