In [542]:
import os
import numpy as np  # linear algebra
import pandas as pd  #
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
%matplotlib inline

# Seed NumPy's global random state with a fixed value.
np.random.seed(7)

In [907]:
# Columns dropped by preprocess() and ignored by get_cat_cols().
cols_remove = ['hours_worked_each_week', 'full_name']

# Columns that label_encoding() iterates over.
label_features = [
    'owns_a_pet',
    'has_health_insurance',
    'has_cancer',
    'smokes',
    'has_alzheimers',
]

In [988]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Module-level scaler: fitted when preprocess() runs with train=True and
# reused unchanged when it runs with train=False.
ss = StandardScaler()
# Module-level LabelEncoder instance.
le = LabelEncoder()

def preprocess(df, train=True):
    """Drop unused columns, add engineered features and standardise numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain the columns named in ``cols_remove`` and
        the columns read by ``create_features``. The frame passed in is not
        modified; work happens on the copy returned by ``drop``.
    train : bool, default True
        When truthy, the module-level ``StandardScaler`` (``ss``) is fitted on
        this frame and then applied. Otherwise the previously fitted scaler is
        applied as is.

    Returns
    -------
    pandas.DataFrame
        Frame with the engineered columns added and every column reported by
        ``get_numeric_cols`` standardised. Other columns are left untouched.
    """
    features = create_features(df.drop(cols_remove, axis=1))
    numeric = get_numeric_cols(features)

    # Fit only on the training frame so other frames are scaled with the
    # statistics learned from training data.
    scale = ss.fit_transform if train else ss.transform
    features[numeric] = scale(features[numeric])
    return features

def create_features(df):
    """Add derived columns to ``df`` in place and return the same frame.

    Columns appended, in this order:

    * ``BMI`` -- ``weight / height ** 2``
    * ``daily_person_water_usage`` -- ``daily_household_water_usage`` divided
      by ``family_members_count``
    * ``monthly_person_water_usage`` -- the per-person daily figure times 30
    * ``MIX`` -- ``fast_food_meals_per_month + avg_sleep_hours +
      daily_household_water_usage``
    """
    df['BMI'] = df['weight'] / df['height'] ** 2

    household_water = df['daily_household_water_usage']
    per_person_daily = household_water / df['family_members_count']
    df['daily_person_water_usage'] = per_person_daily
    df['monthly_person_water_usage'] = per_person_daily * 30

    df['MIX'] = df['fast_food_meals_per_month'] + df['avg_sleep_hours'] + household_water
    return df

# One fitted encoder per feature, populated by label_encoding(train=True).
_label_encoders = {}


def label_encoding(df, train=True):
    """Add an integer-coded ``<feature>_new`` column for each name in ``label_features``.

    A separate ``LabelEncoder`` is kept for every feature. Re-fitting a single
    shared encoder in the loop would leave it holding only the classes of the
    last feature, so a later ``train=False`` call would encode every column
    with that last feature's mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``label_features``. It is
        modified in place.
    train : bool, default True
        When truthy, a fresh encoder is fitted per feature on ``df``. Otherwise
        the encoders fitted by the most recent ``train=True`` call are reused.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``*_new`` columns added.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If called with ``train=False`` before any ``train=True`` call.
    """
    from sklearn.exceptions import NotFittedError

    for feat in label_features:
        if train:
            _label_encoders[feat] = LabelEncoder().fit(df[feat])
        encoder = _label_encoders.get(feat)
        if encoder is None:
            raise NotFittedError(
                "label_encoding(train=False) called before label_encoding(train=True) "
                "for feature %r" % feat
            )
        df[feat + '_new'] = encoder.transform(df[feat])
    return df

In [989]:
from sklearn.model_selection import KFold


def test_mean_target_encoding(train, test, target, categorical, alpha=5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values


def train_mean_target_encoding(train, target, categorical, alpha=5,
                               n_splits=5, random_state=123):
    """Out-of-fold smoothed target encoding for the training frame.

    The frame is split into ``n_splits`` shuffled folds. Each fold is encoded
    with statistics computed on the remaining folds (via
    ``test_mean_target_encoding``), so no row is encoded using its own target.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing ``target`` and ``categorical``.
    target : str
        Name of the target column.
    categorical : str
        Name of the column to encode.
    alpha : float, default 5
        Smoothing strength passed through to ``test_mean_target_encoding``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 123
        Seed for the fold shuffle.

    Returns
    -------
    numpy.ndarray
        One encoded value per row of ``train``, in row order.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Explicit float dtype: building a Series without data or dtype is
    # deprecated usage in pandas 1.x and leaves the dtype implicit.
    train_feature = pd.Series(index=train.index, dtype='float64')

    for fit_index, oof_index in kf.split(train):
        cv_train, cv_test = train.iloc[fit_index], train.iloc[oof_index]

        # Statistics come from the other folds only; store them at the
        # positions of the held-out fold.
        train_feature.iloc[oof_index] = test_mean_target_encoding(
            cv_train, cv_test, target, categorical, alpha
        )
    return train_feature.values


def mean_target_encoding(train, test, target, categorical, alpha=5):
    """Compute smoothed mean-target encodings of ``categorical`` for both frames.

    The test encoding uses statistics from the whole of ``train``; the train
    encoding is produced out-of-fold by ``train_mean_target_encoding``.

    Returns
    -------
    tuple
        ``(train_feature, test_feature)`` -- the encoded values for ``train``
        and for ``test`` respectively.
    """
    encoded_test = test_mean_target_encoding(train, test, target, categorical, alpha)
    encoded_train = train_mean_target_encoding(train, target, categorical, alpha)
    return encoded_train, encoded_test
    

In [990]:
def get_numeric_cols(df):
    """Return the names of the numeric columns of ``df``, excluding ``'infected'``.

    NOTE: relies on pandas' private ``DataFrame._get_numeric_data`` accessor.
    """
    return [name for name in df._get_numeric_data().columns if name != 'infected']
def get_cat_cols(df):
    """Return the names of object-dtype columns of ``df`` not listed in ``cols_remove``."""
    object_cols = df.select_dtypes(include=['object']).columns
    return [name for name in object_cols if name not in cols_remove]
    

In [991]:
# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,gender,full_name,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,...,owns_a_pet,has_health_insurance,has_cancer,smokes,has_alzheimers,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected
0,male,Gerald Valentine,2024,8.69,52.32,brown,black,apartment,not_applicable,A+,...,no,no,no,no,no,none,1471.95,regular,2,0
1,male,Chad Sell,1956,93.21,157.04,brown,asian,villa,associate_degree,AB+,...,no,no,no,no,no,short,1969.39,regular,0,0
2,male,Alex Crawford,1949,83.79,187.7,brown,white,apartment,associate_degree,B+,...,yes,no,no,no,yes,none,366.43,regular,16,0
3,female,Mildred Valentine,1992,90.29,159.2,brown,middle_eastern,apartment,high_school,O+,...,yes,yes,no,no,no,none,1548.2,regular,1,1
4,male,Francisco Hill,2013,40.23,96.61,brown,asian,apartment,not_applicable,O-,...,yes,yes,no,no,no,long,361.75,regular,12,0


In [992]:
# Load the test data; the path is relative to the notebook's working directory.
test = pd.read_csv('test.csv')

In [993]:
# Fit the scaler on the training frame, then apply the same fitted scaler to
# the test frame (train=False).
# NOTE(review): the saved output of this cell lists a `MIX2` column that the
# visible create_features() does not produce -- the output may be stale.
train_n = preprocess(train)
test_n = preprocess(test, False)
train_n.head()

Unnamed: 0,gender,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,family_members_count,...,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected,BMI,daily_person_water_usage,monthly_person_water_usage,MIX,MIX2
0,male,1.606812,-2.234327,-2.266887,brown,black,apartment,not_applicable,A+,-0.408602,...,none,-0.294115,regular,-0.73182,0,0.091945,0.683344,-0.14959,-1.962097,-0.296662
1,male,-1.277283,1.039737,0.115224,brown,asian,villa,associate_degree,AB+,-0.081557,...,short,0.169637,regular,-0.969178,0,0.591634,1.324677,-0.536741,0.69393,0.159574
2,male,-1.574175,0.674833,0.81266,brown,white,apartment,associate_degree,B+,-1.389736,...,none,-1.324764,regular,0.929687,0,-0.565731,0.644763,0.676505,0.908341,-1.317861
3,female,0.249591,0.926625,0.164358,brown,middle_eastern,apartment,high_school,O+,-0.081557,...,none,-0.223029,regular,-0.850499,1,0.412343,-0.761981,-0.922938,0.596038,-0.229313
4,male,1.140267,-1.012557,-1.259403,brown,asian,apartment,not_applicable,O-,-1.389736,...,long,-1.329127,regular,0.454971,0,1.029972,0.528835,-0.975679,-1.337822,-1.32502


In [994]:
# Replace each object-dtype column with its smoothed mean-target encoding:
# out-of-fold values for the training frame, full-train statistics for the
# test frame. Both results are computed before either column is overwritten.
for col in get_cat_cols(train):
    train_n[col], test_n[col] = mean_target_encoding(train=train_n,
                                                   test=test_n,
                                                   target='infected',
                                                   categorical=col,
                                                   alpha=11)



In [995]:
# Preview the training frame after the target-encoding step.
train_n.head()

Unnamed: 0,gender,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,family_members_count,...,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected,BMI,daily_person_water_usage,monthly_person_water_usage,MIX,MIX2
0,0.335884,1.606812,-2.234327,-2.266887,0.345304,0.349177,0.332071,0.314288,0.335703,-0.408602,...,0.374603,-0.294115,0.334884,-0.73182,0,0.091945,0.683344,-0.14959,-1.962097,-0.296662
1,0.335884,-1.277283,1.039737,0.115224,0.345304,0.38312,0.415384,0.310793,0.341245,-0.081557,...,0.30425,0.169637,0.334884,-0.969178,0,0.591634,1.324677,-0.536741,0.69393,0.159574
2,0.332179,-1.574175,0.674833,0.81266,0.339919,0.332158,0.322268,0.319552,0.420923,-1.389736,...,0.370065,-1.324764,0.332431,0.929687,0,-0.565731,0.644763,0.676505,0.908341,-1.317861
3,0.363372,0.249591,0.926625,0.164358,0.339919,0.25884,0.322268,0.391562,0.338615,-0.081557,...,0.370065,-0.223029,0.332431,-0.850499,1,0.412343,-0.761981,-0.922938,0.596038,-0.229313
4,0.326934,1.140267,-1.012557,-1.259403,0.338229,0.380625,0.325889,0.305915,0.340277,-1.389736,...,0.240395,-1.329127,0.33197,0.454971,0,1.029972,0.528835,-0.975679,-1.337822,-1.32502


In [996]:
# Separate the target column ('infected') from the feature matrix.
y = train_n.infected  #.reset_index(drop=True)
X = train_n.drop(['infected'], axis=1)

In [997]:
# 70/30 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.70, test_size=0.30, random_state=0
)

# Results dataframe: a single 'ACC' row with one column per model, initialised to 0.
cols = ['Case','SGD','Ridge','KNN','SVM','Bagging','RndForest','LogReg','LGB', 'xbg']
resul = pd.DataFrame(columns=cols).set_index("Case")
resul.loc['ACC'] = [0] * len(resul.columns)

In [998]:
# Models creation
sgd   = SGDClassifier()
ridge = RidgeClassifier()
knn   = KNeighborsClassifier()
svc   = SVC(gamma='auto')
bag   = BaggingClassifier()
rf    = RandomForestClassifier(n_estimators=500)
lr    = LogisticRegression(solver='liblinear')
lgg   = lgb.LGBMClassifier()
xbg   = XGBClassifier()

# Order matters: position i in this list is written to column i of `resul`.
models = [sgd, ridge, knn, svc, bag, rf, lr, lgg, xbg]

# Fit every model on the training split and store its accuracy on the
# validation split in the 'ACC' row.
for col, model in enumerate(models):
    model.fit(X_train, y_train)
    resul.iloc[0, col] = model.score(X_valid, y_valid)

resul.head()

Unnamed: 0_level_0,SGD,Ridge,KNN,SVM,Bagging,RndForest,LogReg,LGB,xbg
Case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ACC,0.697917,0.740417,0.620417,0.66375,0.685417,0.725833,0.729583,0.733333,0.72


In [999]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
# Hard-voting (majority vote) ensemble over seven of the classifiers defined above.
voting_clf = VotingClassifier(
    estimators=[
        ('ridge', ridge),
        ('LGBM', lgg),
        ('LogReg', lr),
        ('xbg', xbg),
        ('SGD', sgd),
        ('bag', bag),
        ('rf', rf),
    ],
    voting='hard',
)

# Fit on the training split and report accuracy on the validation split.
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_valid)
accuracy_score(y_valid, preds)

0.7425

In [1000]:
# Refit the ensemble on all rows of X / y before predicting on the test set.
voting_clf.fit(X,y)
# NOTE(review): X_valid was split out of X, so the model has now been trained
# on these very rows. The accuracy computed below is an in-sample score and
# must not be read as a hold-out estimate.
preds = voting_clf.predict(X_valid)
accuracy_score(y_valid, preds)

0.9125

In [1001]:
def save_df(y_pred, path='out.csv'):
    """Write predictions to a CSV file with a 1-based ``id`` index column.

    Parameters
    ----------
    y_pred : array-like
        One prediction per row; stored in a column named ``prediction``.
    path : str or path-like, default 'out.csv'
        Destination file. The default keeps the original hard-coded location.
    """
    out_df = pd.DataFrame(y_pred, columns=['prediction'])
    # Shift the default 0-based index to start at 1, then label it; naming it
    # after the shift does not rely on the arithmetic preserving the name.
    out_df.index = out_df.index + 1
    out_df.index.name = 'id'
    out_df.to_csv(path)

In [1002]:
# Predict labels for the preprocessed test frame with the fitted ensemble.
yt_pred = voting_clf.predict(test_n)

In [1003]:
# Write the test-set predictions to out.csv.
save_df(yt_pred)