In [71]:
import os
import numpy as np  # linear algebra
import pandas as pd  #
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
%matplotlib inline

np.random.seed(7)

In [208]:
cols_remove = ['hours_worked_each_week', 'full_name']
label_features = ['education_status', 'sports_engagement', 'diet_type', 'favorite_color', 'favorite_music_genre', \
                  'owned_car_brand', 'blood_type', 'occupation']

In [234]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
ss = StandardScaler()
le = LabelEncoder()

def preprocess(df, train = True):
    df = create_features(df)
#     df = label_encoding(df, train)
    df = df.drop(cols_remove, axis = 1)
    if train:
        df[num_cols] =  ss.fit_transform(df[num_cols])
    else:
        df[num_cols] =  ss.transform(df[num_cols])
#     df = pd.get_dummies(df, drop_first = True)
    return df

def create_features(df):
    df['BMI'] = df['weight'] / df['height'] ** 2
    df['daily_person_water_usage'] = df['daily_household_water_usage'] / df['family_members_count']
    return df

def label_encoding(df, train = True):
    if train:
        for feat in label_features:
            df[feat] = le.fit_transform(df[feat])
    else:
        for feat in label_features:
            df[feat] = le.transform(df[feat])
    return df

In [235]:
from sklearn.model_selection import KFold


def test_mean_target_encoding(train, test, target, categorical, alpha=5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values


def train_mean_target_encoding(train, target, categorical, alpha=5):
    # Create 5-fold cross-validation
    kf = KFold(n_splits=5, random_state=123, shuffle=True)
    train_feature = pd.Series(index=train.index)
    
    # For each folds split
    for train_index, test_index in kf.split(train):
        cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
      
        # Calculate out-of-fold statistics and apply to cv_test
        cv_test_feature = test_mean_target_encoding(cv_train, cv_test, target, categorical, alpha)
        
        # Save new feature for this particular fold
        train_feature.iloc[test_index] = cv_test_feature       
    return train_feature.values


def mean_target_encoding(train, test, target, categorical, alpha=5):
  
    # Get test feature
    test_feature = test_mean_target_encoding(train, test, target, categorical, alpha)
    
    # Get train feature
    train_feature = train_mean_target_encoding(train, target, categorical, alpha)
    
    # Return new features to add to the model
    return train_feature, test_feature
    

In [236]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,gender,full_name,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,...,owns_a_pet,has_health_insurance,has_cancer,smokes,has_alzheimers,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected
0,male,Gerald Valentine,2024,8.69,52.32,brown,black,apartment,not_applicable,A+,...,no,no,no,no,no,none,1471.95,regular,2,0
1,male,Chad Sell,1956,93.21,157.04,brown,asian,villa,associate_degree,AB+,...,no,no,no,no,no,short,1969.39,regular,0,0
2,male,Alex Crawford,1949,83.79,187.7,brown,white,apartment,associate_degree,B+,...,yes,no,no,no,yes,none,366.43,regular,16,0
3,female,Mildred Valentine,1992,90.29,159.2,brown,middle_eastern,apartment,high_school,O+,...,yes,yes,no,no,no,none,1548.2,regular,1,1
4,male,Francisco Hill,2013,40.23,96.61,brown,asian,apartment,not_applicable,O-,...,yes,yes,no,no,no,long,361.75,regular,12,0


In [237]:
test = pd.read_csv('test.csv')

In [243]:
train_n = preprocess(train)
test_n = preprocess(test, False)
train_n.head()

Unnamed: 0,gender,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,family_members_count,...,has_cancer,smokes,has_alzheimers,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected,BMI,daily_person_water_usage
0,male,1.606812,-2.234327,-2.266887,brown,black,apartment,not_applicable,A+,-0.408602,...,no,no,no,none,-0.294115,regular,2,0,0.003175,367.9875
1,male,-1.277283,1.039737,0.115224,brown,asian,villa,associate_degree,AB+,-0.081557,...,no,no,no,short,0.169637,regular,0,0,0.00378,393.878
2,male,-1.574175,0.674833,0.81266,brown,white,apartment,associate_degree,B+,-1.389736,...,no,no,yes,none,-1.324764,regular,16,0,0.002378,366.43
3,female,0.249591,0.926625,0.164358,brown,middle_eastern,apartment,high_school,O+,-0.081557,...,no,no,no,none,-0.223029,regular,1,1,0.003562,309.64
4,male,1.140267,-1.012557,-1.259403,brown,asian,apartment,not_applicable,O-,-1.389736,...,no,no,no,long,-1.329127,regular,12,0,0.00431,361.75


In [244]:
cat_cols = train_n.select_dtypes(include=['object']).columns
print("Number of categorical fields:", len(cat_cols))
cat_cols

Number of categorical fields: 19


Index(['gender', 'eye_color', 'race', 'accommodation_type', 'education_status',
       'blood_type', 'occupation', 'living_area', 'sports_engagement',
       'favorite_music_genre', 'favorite_color', 'owned_car_brand',
       'owns_a_pet', 'has_health_insurance', 'has_cancer', 'smokes',
       'has_alzheimers', 'facial_hair', 'diet_type'],
      dtype='object')

In [245]:
num_cols = train_n._get_numeric_data().columns
num_cols = [x for x in num_cols if x != 'infected' and x != 'fast_food_meals_per_month']
print("Number of numerical fields:", len(num_cols))
num_cols

Number of numerical fields: 8


['year_of_birth',
 'weight',
 'height',
 'family_members_count',
 'avg_sleep_hours',
 'daily_household_water_usage',
 'BMI',
 'daily_person_water_usage']

In [246]:
for col in cat_cols:
    train_n[col], test_n[col] = mean_target_encoding(train=train_n,
                                                   test=test_n,
                                                   target='infected',
                                                   categorical=col,
                                                   alpha=5)



In [248]:
train_n.head()

Unnamed: 0,gender,year_of_birth,weight,height,eye_color,race,accommodation_type,education_status,blood_type,family_members_count,...,has_cancer,smokes,has_alzheimers,facial_hair,daily_household_water_usage,diet_type,fast_food_meals_per_month,infected,BMI,daily_person_water_usage
0,0.335857,1.606812,-2.234327,-2.266887,0.345297,0.349165,0.332043,0.314111,0.335661,-0.408602,...,0.348813,0.352907,0.353284,0.374633,-0.294115,0.334867,2,0,0.003175,367.9875
1,0.335857,-1.277283,1.039737,0.115224,0.345297,0.383186,0.415673,0.31055,0.340934,-0.081557,...,0.348813,0.352907,0.353284,0.303962,0.169637,0.334867,0,0,0.00378,393.878
2,0.332152,-1.574175,0.674833,0.81266,0.33991,0.332116,0.322231,0.31939,0.421688,-1.389736,...,0.343989,0.351975,0.215356,0.370095,-1.324764,0.332415,16,0,0.002378,366.43
3,0.363403,0.249591,0.926625,0.164358,0.33991,0.257596,0.322231,0.391724,0.338594,-0.081557,...,0.343989,0.351975,0.349001,0.370095,-0.223029,0.332415,1,1,0.003562,309.64
4,0.326899,1.140267,-1.012557,-1.259403,0.338219,0.380696,0.325859,0.30572,0.340201,-1.389736,...,0.343309,0.349699,0.348382,0.239511,-1.329127,0.331956,12,0,0.00431,361.75


In [255]:
y = train_n.infected  #.reset_index(drop=True)
X = train_n.drop(['infected'], axis=1)

In [256]:
X.columns

Index(['gender', 'year_of_birth', 'weight', 'height', 'eye_color', 'race',
       'accommodation_type', 'education_status', 'blood_type',
       'family_members_count', 'avg_sleep_hours', 'occupation', 'living_area',
       'sports_engagement', 'favorite_music_genre', 'favorite_color',
       'owned_car_brand', 'owns_a_pet', 'has_health_insurance', 'has_cancer',
       'smokes', 'has_alzheimers', 'facial_hair',
       'daily_household_water_usage', 'diet_type', 'fast_food_meals_per_month',
       'BMI', 'daily_person_water_usage'],
      dtype='object')

In [257]:
# cat2 = ['has_cancer', 'smokes', 'has_alzheimers', 'facial_hair', 'diet_type', 'owned_car_brand']

In [258]:
# def expanding_mean(df_test, col):
#     cumsum   = train.groupby(col)['infected'].cumsum() - train['infected']
#     cumcnt = df_test.groupby(col).cumcount()
#     df_test[col + '_mean'] = cumsum/cumcnt
#     df_test[col + '_mean'] = df_test[col + '_mean'].replace([np.inf, -np.inf], np.nan)
#     df_test[col + '_mean'].fillna(0.3343, inplace=True)
#     return df_test

# for col in cat2:
#     features = expanding_mean(features, col)

# features.head()

In [259]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.70,test_size=0.30,random_state=0)#Results dataframe
cols = ['Case','SGD','Ridge','KNN','SVM','Bagging','RndForest','LogReg','LGB', 'xbg']
resul = pd.DataFrame(columns=cols)
resul.set_index("Case",inplace=True)
resul.loc['Standard'] = [0,0,0,0,0,0,0,0,0]
resul.loc['GridSearch'] = [0,0,0,0,0,0,0,0,0]
resul.loc['RandomSearch'] = [0,0,0,0,0,0,0,0,0]
resul.loc['Hyperopt'] = [0,0,0,0,0,0,0,0,0]

In [260]:
#Models creation
from xgboost import XGBClassifier
sgd   = SGDClassifier()
ridge = RidgeClassifier()
knn   = KNeighborsClassifier()
svc   = SVC(gamma='auto')
bag   = BaggingClassifier()
rf    = RandomForestClassifier(n_estimators=10)
lr    =  LogisticRegression(solver='liblinear')
lgg   = lgb.LGBMClassifier()
xbg   = XGBClassifier()

models = [sgd,ridge,knn,svc,bag,rf,lr,lgg, xbg]

col = 0
for model in models:
    model.fit(X_train,y_train)
    resul.iloc[0,col] = model.score(X_valid,y_valid)
    col += 1
    
resul.head()

Unnamed: 0_level_0,SGD,Ridge,KNN,SVM,Bagging,RndForest,LogReg,LGB,xbg
Case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Standard,0.544583,0.739167,0.60875,0.659583,0.68875,0.6875,0.72375,0.739583,0.722917
GridSearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RandomSearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hyperopt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
voting_clf = VotingClassifier(estimators=[('ridge', ridge), ('LGBM', lgg), \
                                          ('LogReg', lr), ('xbg', xbg)], voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_valid)
accuracy_score(y_valid, preds)

0.7370833333333333

In [195]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

#Ridge
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
ridge_grid = dict(alpha=alpha)#K-Nearest - Neighborg
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
knn_grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

#Logistic Regrresion
solvers = [ 'liblinear']
penalty = ['l2', 'l1']
c_values = [100, 10, 1.0, 0.1, 0.01]
lr_grid = dict(solver=solvers,penalty=penalty,C=c_values)

#LGB
class_weight = [None,'balanced']
boosting_type = ['gbdt', 'goss', 'dart']
num_leaves = [30,50,100,150] #list(range(30, 150)),
learning_rate = list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 10)) #1000
lgg_grid = dict(class_weight=class_weight, boosting_type=boosting_type, num_leaves=num_leaves, learning_rate =learning_rate)

In [84]:
from sklearn.metrics import accuracy_score
lr    =  LogisticRegression(solver='liblinear')
grid_win = lr.fit(X_train, y_train)

#Predict values based on new parameters
yv_pred = grid_win.predict(X_valid)

print(accuracy_score(y_valid, yv_pred))

0.7533333333333333


In [262]:
def save_df(y_pred):
    out_df=pd.DataFrame(y_pred, columns= ['prediction'])
    out_df.index.names = ['id']
    out_df.index += 1 
    out_df.to_csv('out.csv')

In [263]:
yt_pred = voting_clf.predict(test_n)

In [264]:
save_df(yt_pred)