In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from pandas_profiling import ProfileReport

In [2]:
def model_pipeline(df_train_1, model, all_features, target):
    """Fit ``model`` behind a one-hot encoder and score it on a hold-out split.

    Parameters
    ----------
    df_train_1 : pandas.DataFrame
        Labelled data containing every column in ``all_features`` and ``target``.
    model : estimator
        scikit-learn style classifier. It becomes the last pipeline step, so
        this very object is fitted by the call.
    all_features : list of str
        Columns used as predictors.
    target : str
        Name of the label column.

    Returns
    -------
    acc : float
        Accuracy on the 25% validation split.
    pipeline : sklearn.pipeline.Pipeline
        The fitted ``OneHotEncoder -> model`` pipeline. It is trained on the
        75% training split only; it is not refitted on the full data.
    """
    X_ = df_train_1[all_features]
    y_ = df_train_1[target]

    # Fixed seed so every model is evaluated on the same train/validation rows.
    X_train, X_val, y_train, y_val = train_test_split(
        X_, y_, test_size=0.25, random_state=42
    )

    # Every feature column is one-hot encoded, i.e. treated as categorical.
    # Categories not seen during fit are encoded as all zeros instead of raising.
    pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore'),
        model,
    )
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_val)
    # scikit-learn metrics expect (y_true, y_pred), in that order.
    acc = accuracy_score(y_val, y_pred)

    return acc, pipeline

In [3]:
def create_submission_file(model, df_test, target, submission_file_name,
                           sample_submission_path='sample_submission.csv'):
    """Predict on ``df_test`` and write the predictions in submission format.

    The sample submission CSV is used as a template: its ``target`` column is
    overwritten (or created) with the predictions and all other columns are
    written back unchanged. Array-like predictions are assigned by position,
    so the template rows must be in the same order as the rows of ``df_test``.

    Parameters
    ----------
    model : object
        Fitted estimator (or pipeline) exposing ``predict``.
    df_test : pandas.DataFrame
        Feature rows to predict on, passed to ``model.predict`` as is.
    target : str
        Name of the prediction column in the submission file.
    submission_file_name : str
        Path of the CSV file to write (written without the index).
    sample_submission_path : str, optional
        Path of the template CSV. Defaults to ``'sample_submission.csv'`` in
        the current working directory.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows.
    """
    y_test_pred = model.predict(df_test)
    res = pd.read_csv(sample_submission_path)

    # Fail early with a clear message rather than write a misaligned submission.
    if len(y_test_pred) != len(res):
        raise ValueError(
            f"Got {len(y_test_pred)} predictions but {sample_submission_path!r} "
            f"has {len(res)} rows"
        )

    res[target] = y_test_pred
    res.to_csv(submission_file_name, index=False)

In [8]:
def complete_pipeline(df_train, df_test, model, all_features, target, submission_file_name):
    """Validate ``model``, print its accuracy, then write a submission file.

    Runs ``model_pipeline`` on ``df_train`` to fit the model and obtain the
    validation accuracy, prints that accuracy, and hands the fitted pipeline
    plus the ``all_features`` columns of ``df_test`` to
    ``create_submission_file``. Returns ``None``.
    """
    validation_accuracy, fitted_pipeline = model_pipeline(
        df_train_1=df_train,
        model=model,
        all_features=all_features,
        target=target,
    )
    print(validation_accuracy)

    # Predict on exactly the columns the pipeline was trained with.
    test_features = df_test[all_features]
    create_submission_file(
        model=fitted_pipeline,
        df_test=test_features,
        target=target,
        submission_file_name=submission_file_name,
    )

In [9]:
# Read the train and test CSVs from the working directory, indexed by their "ID" column.
df_train, df_test = (
    pd.read_csv(csv_name).set_index("ID")
    for csv_name in ("train.csv", "test.csv")
)
df_train.shape, df_test.shape

((800, 21), (200, 20))

In [25]:
# Predictor columns passed to every model, in this order.
all_features = [
    'A3_Score',
    'A6_Score',
    'A9_Score',
    'A4_Score',
    'result',
    'A5_Score',
    'A10_Score',
    'A7_Score',
    'A2_Score',
    'austim',  # (sic) column name as spelled in the data
]
# Label column the models predict.
target = 'Class/ASD'

In [26]:
# Baseline: logistic regression with default hyperparameters.
complete_pipeline(
    df_train=df_train,
    df_test=df_test,
    model=LogisticRegression(),
    all_features=all_features,
    target=target,
    submission_file_name="trail_sub_3.csv",
)

0.86


In [12]:
from sklearn.svm import SVC
# Support-vector classifier with default hyperparameters.
# NOTE(review): per the scikit-learn docs, SVC ignores random_state unless
# probability=True, so the seed has no effect here — confirm that is intended.
svm = SVC(random_state=42)

In [27]:
# Validate the SVM and write its test-set predictions.
complete_pipeline(
    df_train=df_train,
    df_test=df_test,
    model=svm,
    all_features=all_features,
    target=target,
    submission_file_name="submission_svm_3.csv",
)

0.87


In [15]:
from sklearn.tree import DecisionTreeClassifier
# Seeded like the train/validation split and the other models in this notebook:
# without a fixed random_state, ties between equally good splits are broken
# randomly and the validation accuracy can change from run to run.
dec_tree = DecisionTreeClassifier(random_state=42)

In [28]:
# Validate the decision tree and write its test-set predictions.
complete_pipeline(
    df_train=df_train,
    df_test=df_test,
    model=dec_tree,
    all_features=all_features,
    target=target,
    submission_file_name="submission_dec_tree_3.csv",
)

0.85


In [17]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees; the seed is fixed so repeated runs build the same forest.
rand_forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [29]:
# Validate the random forest and write its test-set predictions.
complete_pipeline(
    df_train=df_train,
    df_test=df_test,
    model=rand_forest,
    all_features=all_features,
    target=target,
    submission_file_name="submission_rand_forest_3.csv",
)

0.865


## Ideas:
1. Group by all the autism spectrum
2. Try different encoders
3. Normalization
4. Grouping based on continent or level of country (health index)
 