In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

from pandas_profiling import ProfileReport

In [2]:
def aq_features_class(df, aq_features):
    df_group = df.copy()
    for aq in aq_features:
        df_group[aq] = df_group[aq].astype('str')
    df_group['aq_features'] =  df_group['A1_Score'] + df_group['A2_Score'] + df_group['A3_Score'] + df_group['A4_Score'] + df_group['A5_Score'] + df_group['A6_Score'] + df_group['A7_Score'] + df_group['A8_Score'] + df_group['A9_Score'] + df_group['A10_Score']
    df_group['aq_features'] = df_group['aq_features'].astype('category')

    #df_group_1 =  df_group.drop(columns = aq_features)

    return df_group


In [3]:
def model_pipeline(df_train_1, model, all_features, target ):
    X_= df_train_1[all_features]
    y_= df_train_1[target]

    X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.25, random_state=42)

    pipeline = make_pipeline(
    LabelEncoder(), 
    model
    )

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_val)
    acc = accuracy_score(y_pred, y_val)

    return acc, pipeline

In [4]:
def create_submission_file(model, df_test, target, submission_file_name):
    y_test_pred = model.predict(df_test)
    res = pd.read_csv('sample_submission.csv')
    res[target] = y_test_pred
    res.to_csv(submission_file_name, index=False)

In [5]:
df_train = pd.read_csv("train.csv").set_index("ID")
df_test = pd.read_csv("test.csv").set_index("ID")
df_train.shape, df_test.shape

((800, 21), (200, 20))

In [6]:
aq_features = ['A'+str(i)+'_Score' for i in range(1,11)]
numerical_features = ['age', 'result']
categorical_features = ['gender', 'ethnicity', 'jaundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'relation']
target = 'Class/ASD'

In [7]:
df2_ = aq_features_class(df_train, aq_features)

In [8]:
all_features = df2_.drop(columns=['Class/ASD', 'age_desc', 'contry_of_res']).columns
target = 'Class/ASD'

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=4)

In [10]:
df_test_2 = aq_features_class(df_test, aq_features)

In [11]:
df_train_1 = df2_.copy()
X_= df_train_1[all_features]
y_= df_train_1[target]

X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.25, random_state=42)


In [12]:
categorical_features

['gender',
 'ethnicity',
 'jaundice',
 'austim',
 'contry_of_res',
 'used_app_before',
 'age_desc',
 'relation']

In [13]:
categorical_cols = categorical_features + ['aq_features']
df2_copy = df2_.copy()
for col in categorical_cols:
    le = LabelEncoder()
    df2_copy[col] = le.fit_transform(df2_copy[col])


In [14]:
def model_pipeline(df_train_1, pipeline, all_features, target ):
    X_= df_train_1[all_features]
    y_= df_train_1[target]

    X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.25, random_state=42)


    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_val)
    acc = accuracy_score(y_pred, y_val)

    return acc, pipeline

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
for i in range(1, 16):
    acc, model = model_pipeline(df2_copy, KNeighborsClassifier(n_neighbors=i), all_features, target)
    print(acc)

0.83
0.87
0.855
0.865
0.855
0.86
0.84
0.865
0.85
0.855
0.855
0.845
0.85
0.845
0.845


In [17]:
model_pipeline(df2_copy, LogisticRegression(), all_features, target)

(0.86, LogisticRegression())

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
model_pipeline(df2_copy, DecisionTreeClassifier() , all_features, target)

(0.835, DecisionTreeClassifier())

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
model_pipeline(df2_copy, RandomForestClassifier() , all_features, target)

(0.86, RandomForestClassifier())

In [22]:
rf = RandomForestClassifier()
rf.fit(df2_copy[all_features], df2_copy[target])

RandomForestClassifier()

In [24]:
df_test_3 = df_test_2.copy()
for col in categorical_cols:
    le = LabelEncoder()
    df_test_3[col] = le.fit_transform(df_test_3[col])

In [25]:
y_test_predict = rf.predict(df_test_3[all_features])
y_test_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0], dtype=int64)

In [26]:
res = pd.read_csv('sample_submission.csv')
res[target] = y_test_predict
res.to_csv("submission-iteration-6.csv", index = False)
res.head()

Unnamed: 0,ID,Class/ASD
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


Each catergory - check for repeated values

model_lr = LogisticRegression(solver='saga', 
                              random_state=0,
                              C=0.22685190926977272,tol=1e-5, max_iter=10000,
                              penalty='l2',class_weight='balanced')
model_etc = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=2,
    random_state=0,
)
model_lr.fit(X_train_label,y_train)
model_etc.fit(X_train_label,y_train)
p_lr=model_lr.predict_proba(X_test_label)[:, 1]
p_etc=model_etc.predict_proba(X_test_label)[:, 1]
p=p_lr*0.9+p_etc*0.1