<a href="https://colab.research.google.com/github/RyanTahnikoyev/windows-97-robidatathon/blob/main/Merged_Pipeline(ensemble)_X_2_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import os

# Data Manipulation
import pandas as pd
import numpy as np

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs
import seaborn as sns
import plotly
import plotly.graph_objs as go
import plotly.express as px


# Machine Learning Models and Scoring Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics, datasets
from sklearn.metrics import roc_curve, auc


# Preprocessing Components
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




print("ENVIRONMENT IS READY")

from google.colab import drive 
drive.mount('/content/drive', force_remount=True)


ENVIRONMENT IS READY
Mounted at /content/drive


# Necessary Functions

## Visualization Functions

In [2]:
def roc_curve_visualization(X_valid_full,y_valid, classifier):
    y_score = classifier.predict_proba(X_valid_full)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_valid,y_score)

    fig = px.area(
        x=fpr, y=tpr,
        title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
        labels=dict(x='False Positive Rate', y='True Positive Rate'),
        width=700, height=500
    )
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()



## Preprocessing Functions

In [3]:
def preprocessing(train_data):

    train_data[train_data["s52"]=="o"]=int("0")
    train_data[train_data["s52"]=="l"]=int("1")

    train_data = train_data.drop(['s56','s57','s59'], axis=1)
    train_data.head()

    train_data = train_data.reset_index()
    X_train_full = train_data.copy()

    binary_cols= [cname for cname in X_train_full.columns if
              X_train_full[cname].nunique()==2 and cname!=X_train_full.columns[-1]]

    # "Cardinality" means the number of unique values in a column
    # Select categorical columns with relatively low cardinality (convenient but arbitrary)
    categorical_cols = [cname for cname in X_train_full.columns if
                X_train_full[cname].nunique() < 10 and 
                X_train_full[cname].dtype == "object" and cname 
                not in binary_cols]

    # Select numerical columns
    numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

    binary_df= train_data[binary_cols].copy()

    categorical_df=train_data[categorical_cols].copy()

    from sklearn.preprocessing import LabelEncoder
    for i in binary_df:
        le=LabelEncoder()
        binary_df[i]=le.fit_transform(binary_df[i])

    categorical_df=pd.get_dummies(categorical_df)
    numerical_df = train_data[numerical_cols].copy()

    processed_df = pd.concat([binary_df,categorical_df,numerical_df],axis=1)
    processed_df = processed_df.loc[:, ~processed_df.columns.duplicated()]
    return processed_df

# Data Ingestion and Preprocessing

In [4]:
train_data = pd.read_csv("/content/drive/MyDrive/robi-datathon-2-pre-assessment/train.csv")
# check the raw data
print("Size of the train dataset (train_data, col): ", train_data.shape)

test_data = pd.read_csv("/content/drive/MyDrive/robi-datathon-2-pre-assessment/test.csv")
# check the raw data
print("Size of the test dataset (test_data, col): ", test_data.shape)

Size of the train dataset (train_data, col):  (28322, 36)
Size of the test dataset (test_data, col):  (85065, 35)


In [5]:
# Remove rows with missing target, separate target from predictors
# train_data.dropna(axis=0, subset=['label'], inplace=True)
y = train_data.label
processed_data = preprocessing(train_data)
processed_data = processed_data.drop(['label'], axis=1)
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(processed_data, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

## Training the model

In [16]:
classifier = XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1, tree_method='gpu_hist')

In [19]:
# classifier = RandomForestClassifier(n_estimators = 100, random_state = 0)
# classifier = RandomForestClassifier(n_estimators=50, criterion='gini', min_samples_split=5, min_samples_leaf=2, max_features='auto', bootstrap=True, n_jobs=-1, random_state=42)
# classifier = XGBClassifier(n_estimators=1000,tree_method='gpu_hist')
# classifier = XGBClassifier(n_estimators=50, max_depth=3, objective='binary:logistic',tree_method='gpu_hist')
# classifier = XGBClassifier(gamma= 1, max_depth= 5, n_estimators= 350,objective='binary:logistic',tree_method='gpu_hist')
# classifier = XGBClassifier(base_score=0.5, booster='gbtree',
#                             colsample_bylevel=1, colsample_bynode=1,                            
#                             learning_rate=0.1, max_delta_step=0,                                
#                             missing=None, n_estimators=100, n_jobs=1,
#                             nthread=None, objective='binary:logistic',
#                             random_state=0, colsample_bytree = 0.6655392754230048, gamma = 4.198875359789924,
#                            max_depth = 17, min_child_weight = 1, 
#                            reg_alpha = 57, reg_lambda = 0.896332305739873,
#                            scale_pos_weight=1, seed=None, silent=None,
#                             subsample=1, verbosity=1, tree_method='gpu_hist',)

# dtrain = classifier.DMatrix(X_train_full)
# dtest = classifier.DMatrix(y_train)
# Preprocessing of training data, fit model 
classifier.fit(X_train_full, y_train)

classifier
# Preprocessing of validation data, get predictions
preds = classifier.predict(X_valid_full)

## Scoring

In [113]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, preds)

0.9438658428949691

In [114]:
from sklearn.metrics import f1_score
f1_score(y_valid, preds)

0.8958742632612966

In [115]:
roc_curve_visualization(X_valid_full, y_valid, classifier)

In [116]:
# print metrics for Training set
from sklearn.metrics import precision_score, recall_score, accuracy_score

print("Precision = {}".format(precision_score(y_valid, preds)))
print("Recall = {}".format(recall_score(y_valid, preds)))
print("Accuracy = {}".format(accuracy_score(y_valid, preds)))

Precision = 0.9949090909090909
Recall = 0.8147706968433591
Accuracy = 0.9438658428949691


In [117]:
from sklearn.metrics import  confusion_matrix
confusion_matrix(y_valid, preds)

array([[3979,    7],
       [ 311, 1368]])

## Validation

### Kfold

In [77]:
from sklearn.model_selection import KFold
kfold_validation = KFold(10)
kfold_validation

KFold(n_splits=10, random_state=None, shuffle=False)

In [78]:
from sklearn.model_selection import cross_val_score
results = cross_val_score(classifier,processed_data,y,cv=kfold_validation)
print(results)

[0.93399223 0.93963996 0.94350282 0.94173729 0.93396893 0.9420904
 0.94456215 0.94420904 0.93997175 0.94385593]


In [79]:
classifier

XGBClassifier(tree_method='gpu_hist')

In [11]:
cross_val_scores = cross_val_score(classifier, processed_data, y, scoring='roc_auc', cv=3)
print(cross_val_scores)

[0.94816318 0.95210912 0.9494487 ]


In [75]:
preds = classifier.predict(X_valid_full)
accuracy_score(preds, y_valid)

0.9401588702559577

### Stratified Kfold

In [80]:
from sklearn.model_selection import StratifiedKFold
skfold= StratifiedKFold(n_splits=10)

scores = cross_val_score(classifier,processed_data,y,cv=skfold)
print(results)

[0.93399223 0.93963996 0.94350282 0.94173729 0.93396893 0.9420904
 0.94456215 0.94420904 0.93997175 0.94385593]


# Preparing Test Data for prediction and submission

In [86]:
processed_test_data = preprocessing(test_data)
processed_test_data.head()

Unnamed: 0,s13,s48,n12,n13,gender_0,gender_1,gender_F,gender_M,s11_0,s11_1,...,n4,n5,n6,n7,n8,n9,n10,n11,n14,n15
0,1,1,1,1,0,1,0,0,0,1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
1,1,1,1,1,0,1,0,0,0,1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,1,1,1,1,0,1,0,0,0,1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
3,1,0,0,0,0,0,0,1,0,0,...,2.350379,-32.836003,0.015684,-9.1782,1.675028,6.354938,5.427266,1.792147,0.003492,1
4,1,1,0,0,0,0,0,1,0,0,...,8.136887,-32.577495,0.006406,-8.726633,1.963535,6.810158,11.487845,1.880992,0.060614,0


In [87]:
# classifier
ensemble

VotingClassifier(estimators=[('xgb',
                              Pipeline(steps=[('m',
                                               XGBClassifier(tree_method='gpu_hist'))])),
                             ('randomforest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())])),
                             ('lgbc',
                              Pipeline(steps=[('m', LGBMClassifier())])),
                             ('cat',
                              Pipeline(steps=[('m',
                                               <catboost.core.CatBoostClassifier object at 0x7fa161da1810>)]))])

In [89]:
preds = ensemble.predict(processed_test_data)

In [90]:
len(preds)

85065

In [91]:
test_data 

Unnamed: 0,id,gender,s11,s12,s13,s16,s17,s18,s48,s52,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
0,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1
1,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1
2,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1
3,b'gAAAAABinOimitAnqlgOcqnD_LeNL3WEbXNGvjd3QVPi...,M,Y,Y,1,D,D,B,0,1,...,0.015684,-9.178200,1.675028,6.354938,5.427266,1.792147,0,0,0.003492,1
4,b'gAAAAABinOi3W9p3Oka5MV_dc2TeorZUcIWOnnODSx7E...,M,Y,Y,1,D,C,B,1,1,...,0.006406,-8.726633,1.963535,6.810158,11.487845,1.880992,0,0,0.060614,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85060,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1
85061,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1
85062,b'gAAAAABinOi31zWSlD0OMhbBd3_weh7Kq6aPeO4yYqns...,M,N,Y,1,D,A,C,1,1,...,0.015461,-9.249529,1.505547,6.438985,3.429928,1.500925,0,0,0.571895,3
85063,1,1,1,1,1,1,1,1,1,1,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1,1,1.000000,1


In [93]:
# Save test predictions to file
output = pd.DataFrame({'id': test_data.id,
                       'label': preds})
output.to_csv('/content/drive/MyDrive/robi-datathon-2-pre-assessment/submission1250.csv', index=False)

In [92]:
test_data = pd.read_csv("/content/drive/MyDrive/robi-datathon-2-pre-assessment/test.csv")
test_data.head()

Unnamed: 0,id,gender,s11,s12,s13,s16,s17,s18,s48,s52,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
0,b'gAAAAABinOi328DZcweGB4_nOyHA3Dy6o1YKYKyf3COx...,M,Y,Y,1,B,D,D,0,l,...,0.026301,-9.200175,2.12297,6.646617,4.801224,1.693087,0,0,0.43927,2
1,b'gAAAAABinOikutEIBjkUXl9lYTg4RI6jc4NfiMUCcVsn...,M,Y,Y,1,B,C,B,1,l,...,0.010774,-8.897092,1.770889,11.046294,6.40723,1.516728,0,0,0.084334,4
2,b'gAAAAABinOjBM70jBXOroAlUSq5lNXMd_oP0PU7jLQE5...,M,Y,N,1,B,D,B,1,l,...,0.016837,-9.17366,1.871872,2.805834,5.481219,1.788823,0,0,0.98421,0
3,b'gAAAAABinOimitAnqlgOcqnD_LeNL3WEbXNGvjd3QVPi...,M,Y,Y,1,D,D,B,0,1,...,0.015684,-9.1782,1.675028,6.354938,5.427266,1.792147,0,0,0.003492,1
4,b'gAAAAABinOi3W9p3Oka5MV_dc2TeorZUcIWOnnODSx7E...,M,Y,Y,1,D,C,B,1,1,...,0.006406,-8.726633,1.963535,6.810158,11.487845,1.880992,0,0,0.060614,0


In [96]:
y_valid

6423     1
4228     0
28209    0
6737     0
23963    1
        ..
2534     0
18698    0
4152     0
25349    0
26836    0
Name: label, Length: 5665, dtype: int64

In [95]:
len(y_score)

5665

# Ensembling Pipeline

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

# get models
# get a voting ensemble of models
# define the base models
models = list()

decision_tree = Pipeline([('m', DecisionTreeClassifier())])
models.append(('decision', decision_tree))

randomforest = Pipeline([('m', RandomForestClassifier())])
models.append(('randomforest', randomforest))

svc = Pipeline([('m', SVC())])
models.append(('svc', svc))
# define the voting ensemble
ensemble = VotingClassifier(estimators=models, voting='hard')

In [27]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(ensemble, X_train_full, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [28]:
n_scores

array([0.94042365, 0.93998235, 0.94042365, 0.9435128 , 0.93601059,
       0.93601059, 0.93424537, 0.94304636, 0.94790287, 0.93598234,
       0.92674316, 0.9435128 , 0.93998235, 0.93909974, 0.94086496,
       0.93865843, 0.94616064, 0.93200883, 0.94525386, 0.93730684,
       0.93556929, 0.94571933, 0.94924978, 0.94130627, 0.94086496,
       0.93601059, 0.93203883, 0.93863135, 0.93730684, 0.93642384])

In [46]:
ensemble.fit(X_train_full,y_train)

VotingClassifier(estimators=[('decision',
                              Pipeline(steps=[('m',
                                               DecisionTreeClassifier())])),
                             ('randomforest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())])),
                             ('svc', Pipeline(steps=[('m', SVC())]))])

In [47]:
preds = ensemble.predict(X_valid_full)
accuracy_score(y_valid,preds)

0.9422771403353928

# Ensembling Production

In [65]:
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline


In [None]:
pip install catboost

In [111]:
from lightgbm.sklearn import LGBMClassifier
# get models
# get a voting ensemble of models
# define the base models
models = list()

xgb = Pipeline([('m', XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1, tree_method='gpu_hist'))])
models.append(('xgb', xgb))

randomforest = Pipeline([('m', RandomForestClassifier())])
models.append(('randomforest', randomforest))

svc = Pipeline([('m', SVC())])
models.append(('svc', svc))

lgbc = Pipeline([('m', LGBMClassifier())])
models.append(('lgbc', lgbc))

cat = Pipeline([('m', CatBoostClassifier())])
models.append(('cat', cat))
# define the voting ensemble
ensemble = VotingClassifier(estimators=models, voting='hard')

In [100]:
ensemble

VotingClassifier(estimators=[('xgb',
                              Pipeline(steps=[('m',
                                               XGBClassifier(tree_method='gpu_hist'))])),
                             ('randomforest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())]))])

# SMOTE

In [118]:
pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [119]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_sample(processed_data, y)

y_sm.value_counts()

AttributeError: ignored

In [112]:
ensemble.fit(X_train_full, y_train)

Learning rate set to 0.039049
0:	learn: 0.6229007	total: 10.8ms	remaining: 10.8s
1:	learn: 0.5687840	total: 15.8ms	remaining: 7.87s
2:	learn: 0.5202271	total: 24.9ms	remaining: 8.29s
3:	learn: 0.4786927	total: 34.1ms	remaining: 8.5s
4:	learn: 0.4435811	total: 43.6ms	remaining: 8.68s
5:	learn: 0.4131029	total: 53.2ms	remaining: 8.8s
6:	learn: 0.3889763	total: 58.6ms	remaining: 8.31s
7:	learn: 0.3663489	total: 67.2ms	remaining: 8.33s
8:	learn: 0.3470536	total: 75.7ms	remaining: 8.34s
9:	learn: 0.3310237	total: 84ms	remaining: 8.32s
10:	learn: 0.3171427	total: 92.3ms	remaining: 8.3s
11:	learn: 0.3025266	total: 102ms	remaining: 8.36s
12:	learn: 0.2909083	total: 110ms	remaining: 8.35s
13:	learn: 0.2817863	total: 119ms	remaining: 8.36s
14:	learn: 0.2729899	total: 124ms	remaining: 8.13s
15:	learn: 0.2637291	total: 132ms	remaining: 8.13s
16:	learn: 0.2565343	total: 141ms	remaining: 8.17s
17:	learn: 0.2497074	total: 150ms	remaining: 8.19s
18:	learn: 0.2433629	total: 159ms	remaining: 8.19s
19:	l

VotingClassifier(estimators=[('xgb',
                              Pipeline(steps=[('m',
                                               XGBClassifier(tree_method='gpu_hist'))])),
                             ('randomforest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())])),
                             ('svc', Pipeline(steps=[('m', SVC())])),
                             ('lgbc',
                              Pipeline(steps=[('m', LGBMClassifier())])),
                             ('cat',
                              Pipeline(steps=[('m',
                                               <catboost.core.CatBoostClassifier object at 0x7fa161cd7990>)]))])

In [102]:
preds = ensemble.predict(X_valid_full)
accuracy_score(y_valid,preds)

0.9438658428949691

In [83]:
train_data.label.value_counts()

0    20084
1     8238
Name: label, dtype: int64

In [33]:
exp_data = processed_data.copy()

In [39]:
numerical_cols = [cname for cname in processed_data.columns if 
               processed_data[cname].dtype in ['int64', 'float64']]
exp_data = exp_data[numerical_cols]

In [42]:
exp_data

Unnamed: 0,s13,s48,n12,n13,index,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n14,n15
0,1,0,0,0,0,16.144666,1.989441,2,2.318385,-32.839277,0.017176,-9.126056,1.732291,3.698504,4.804517,1.544484,0.631220,5
1,1,1,0,0,1,7.144558,0.844866,3,6.197768,-32.576597,0.013857,-9.098287,1.505885,6.791357,6.110416,1.712354,0.392746,3
2,1,0,0,0,2,6.923236,1.042018,6,7.824401,-32.510544,0.013943,-9.234894,1.503828,4.109685,3.953226,1.804260,0.222537,2
3,1,0,0,0,3,5.749840,0.781439,2,8.256767,-32.398679,0.010387,-9.378025,1.485863,7.265876,4.559419,1.537645,0.154409,4
4,1,1,1,1,4,1.000000,1.000000,1,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28317,1,0,0,0,28317,16.375618,2.480253,7,4.693939,-33.086259,0.007216,-8.651173,1.877460,3.637551,11.404942,1.862441,0.019793,5
28318,1,0,0,0,28318,14.746677,2.035971,6,7.939976,-32.775474,0.013369,-8.833046,1.559138,4.247487,5.530054,1.671527,0.371074,2
28319,1,0,0,1,28319,13.429797,1.800562,3,7.763386,-32.575975,0.017782,-9.233722,1.455911,2.900249,6.058681,1.627479,0.826587,1
28320,1,1,0,0,28320,16.094622,2.438280,8,6.942766,-32.577563,0.007324,-8.748941,2.074394,10.673259,11.387646,1.560391,0.103523,5


In [55]:
le.fit_transform(exp_data['n2'])

ValueError: ignored

In [None]:
from sklearn.preprocessing import LabelEncoder
exp_data_fcol = exp_data['n1']
for i in exp_data_fcol:
    le=StandardScaler()
    exp_data[i] = le.fit_transform(exp_data[i])

In [37]:
scaler = StandardScaler()
x_std = scaler.fit_transform(exp_data[numerical_cols])

In [50]:
type(x_std)

numpy.ndarray

In [54]:
i

'n15'