In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from pandas_profiling import ProfileReport

In [7]:
# Read both competition splits and key each frame by its "ID" column.
df_train, df_test = (
    pd.read_csv(csv_name).set_index("ID")
    for csv_name in ("train.csv", "test.csv")
)
df_train.shape, df_test.shape

((800, 21), (200, 20))

In [8]:
def model_pipeline(df_train_1, model, all_features, target, test_size=0.25, random_state=42):
    """Fit a one-hot-encode + ``model`` pipeline and score it on a hold-out split.

    Parameters
    ----------
    df_train_1 : pandas.DataFrame
        Labelled data containing the ``all_features`` and ``target`` columns.
    model : estimator
        Unfitted scikit-learn style classifier used as the final pipeline step.
    all_features : list-like of column labels
        Columns used as model inputs. Every one of them is passed through
        ``OneHotEncoder``, i.e. each distinct value becomes its own indicator.
    target : column label
        Column holding the class labels.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    acc : float
        Accuracy of the fitted pipeline on the hold-out part of the split.
    pipeline : sklearn.pipeline.Pipeline
        The pipeline, fitted on the training part of the split only.
    """
    features = df_train_1[all_features]
    labels = df_train_1[target]

    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # handle_unknown='ignore' keeps predict() from raising on categories that
    # were not present in the training part of the split.
    pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'), model)
    pipeline.fit(X_train, y_train)

    # accuracy_score is documented as (y_true, y_pred); keep that order.
    acc = accuracy_score(y_val, pipeline.predict(X_val))

    return acc, pipeline

In [9]:
# Independent working copy of the training frame.
df_train_1 = df_train.copy(deep=True)

In [10]:
# Label column first, then every remaining column as a model input.
target = 'Class/ASD'
all_features = df_train_1.drop(columns=[target]).columns

In [15]:
def create_submission_file(model, df_test, target, submission_file_name,
                           sample_submission_path='sample_submission.csv'):
    """Predict on ``df_test`` and write the predictions into a submission CSV.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted model; called as ``model.predict(df_test)``.
    df_test : pandas.DataFrame
        Rows to predict on.
    target : str
        Name of the column in the template that receives the predictions.
    submission_file_name : str or path-like
        Destination CSV; written without the DataFrame index.
    sample_submission_path : str or path-like, default 'sample_submission.csv'
        Template CSV whose other columns are carried over unchanged.

    Notes
    -----
    Predictions are written into the template as returned by ``predict``.
    NOTE(review): this assumes the template rows are in the same order as
    ``df_test`` -- confirm against the data files.
    """
    y_test_pred = model.predict(df_test)
    res = pd.read_csv(sample_submission_path)
    res[target] = y_test_pred
    res.to_csv(submission_file_name, index=False)

In [16]:
def complete_pipeline(df_train, df_test, model, all_features, target, submission_file_name):
    """Train and validate ``model``, print its accuracy, and write a submission.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled data containing the ``all_features`` and ``target`` columns.
    df_test : pandas.DataFrame
        Unlabelled data containing the ``all_features`` columns.
    model : estimator
        Unfitted classifier handed to ``model_pipeline``.
    all_features : list-like of column labels
        Input columns used for both fitting and prediction.
    target : str
        Label column in ``df_train`` and prediction column in the submission.
    submission_file_name : str or path-like
        Destination passed to ``create_submission_file``.
    """
    # Train on the frame the caller passed in rather than on a notebook
    # global, so the function behaves the same for any input frame.
    acc, pipeline_model = model_pipeline(df_train, model, all_features, target)
    print(acc)
    # Predict on exactly the columns (and column order) used for fitting.
    create_submission_file(pipeline_model, df_test[all_features], target, submission_file_name)

In [17]:
# Baseline run: logistic regression with default hyper-parameters.
complete_pipeline(
    df_train=df_train_1,
    df_test=df_test,
    model=LogisticRegression(),
    all_features=all_features,
    target=target,
    submission_file_name="trail_sub.csv",
)

0.87


In [18]:
from sklearn.svm import SVC

# Support-vector classifier with the seed pinned to 42.
svm = SVC(
    random_state=42,
)

In [24]:
# Same train/validate/submit flow, using the SVC defined in the previous cell.
complete_pipeline(
    df_train=df_train_1,
    df_test=df_test,
    model=svm,
    all_features=all_features,
    target=target,
    submission_file_name="submission_svm.csv",
)

0.88


In [25]:
from sklearn.tree import DecisionTreeClassifier

# Pin the seed (as the other models in this notebook do) so the reported
# validation accuracy is reproducible across kernel restarts.
dec_tree = DecisionTreeClassifier(random_state=42)

In [26]:
# Same train/validate/submit flow, using the decision tree defined above.
complete_pipeline(
    df_train=df_train_1,
    df_test=df_test,
    model=dec_tree,
    all_features=all_features,
    target=target,
    submission_file_name="submission_dec_tree.csv",
)

0.855


In [27]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest with the seed pinned to 42.
rand_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [28]:
# Same train/validate/submit flow, using the random forest defined above.
complete_pipeline(
    df_train=df_train_1,
    df_test=df_test,
    model=rand_forest,
    all_features=all_features,
    target=target,
    submission_file_name="submission_rand_forest.csv",
)

0.885


## Ideas:
1. Group by all the autism spectrum
2. Try different encoders
3. Normalization