# Additional File No.2  - Project 2

<b><font color='purple'>

This file contains some Data Sampling Algorithms. These algorithms are run here and then loaded into the original file using joblib, in order to take advantage of running the algorithms in parallel, which saves time.

</font></b>

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from numpy import mean
import matplotlib.pyplot as plt
import seaborn as sns

# for the Q-Q plots
import scipy.stats as stats
%matplotlib inline


# for preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from feature_engine import missing_data_imputers as mdi
from feature_engine import variable_transformers as vt
from feature_engine import outlier_removers as outr
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# for the model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# ensembling Algorithms
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import StackingClassifier
from  sklearn.ensemble import ExtraTreesClassifier
from  sklearn.ensemble import GradientBoostingClassifier

In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
from xgboost import XGBClassifier

In [1]:
!pip install imblearn



In [3]:
# for unbalanced dataset
from sklearn.model_selection import RepeatedStratifiedKFold

# for oversampling to ensure balanced datasets
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import VotingClassifier

## Loading Dataset

In [45]:
# Read the training and test tables from CSV files in the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer="test.csv")

# Data Sampling Algorithms - Oversampling

In [71]:
# F-beta scorer with beta=2 (recall weighted more than precision); passed as the
# `scoring` argument of every grid search in this notebook.
ftwo_scorer = make_scorer(score_func=fbeta_score, beta=2)

## Random Forest with oversampling

### Random Forest with smote

In [50]:
# SMOTE oversampling followed by a random forest, tuned together in one pipeline.
# SMOTE is seeded (like the forest) so repeated runs of the search give the same scores.
# NOTE(review): X_train / y_train / X_val / y_val are not defined in any cell visible in
# this notebook; they must already exist in the kernel before this cell runs.
pipe_rand_smote = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', RandomForestClassifier(random_state=0)),
])

param_grid = {
    'smote__k_neighbors': [1, 2, 3, 4, 5],
    'model__n_estimators': [50, 100, 150],
    # 'auto' is omitted: for RandomForestClassifier it is an alias of 'sqrt' (duplicate
    # grid points) and it is no longer accepted by recent scikit-learn releases.
    'model__max_features': ['sqrt', 'log2'],
    'model__max_depth': [10, 11, 12],
    'model__criterion': ['gini', 'entropy'],
}

# Grid search scored with the F2 scorer
grid_rand_smote = GridSearchCV(pipe_rand_smote, param_grid, cv=5, n_jobs=2, scoring=ftwo_scorer)
grid_rand_smote.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_rand_smote.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_rand_smote.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_rand_smote.score(X_train,y_train)}')
print(f'Test score is {grid_rand_smote.score(X_val,y_val)}')

Best Mean cross-validation score: 0.88
Best parameters: {'model__criterion': 'entropy', 'model__max_depth': 12, 'model__max_features': 'log2', 'model__n_estimators': 50, 'smote__k_neighbors': 5}
Train score is 1.0
Test score is 0.8333333333333334


In [78]:
# Persist the fitted SMOTE + random-forest grid search to disk with joblib.
from joblib import dump, load
dump(value=grid_rand_smote, filename='rfc data sampling.joblib')

['rfc data sampling.joblib']

### Random Forest with svmsmote

In [103]:
# SVMSMOTE oversampling followed by a random forest, tuned together in one pipeline.
# The sampler is seeded (like the forest) so repeated runs give the same scores.
pipe_rand_svmsmote = Pipeline([
    ('svmsmote', SVMSMOTE(random_state=0)),
    ('model', RandomForestClassifier(random_state=0)),
])

param_svmgrid = {
    'svmsmote__k_neighbors': [1, 2, 3, 4, 5],
    'model__n_estimators': [50, 100, 150],
    # 'auto' is omitted: for RandomForestClassifier it is an alias of 'sqrt' (duplicate
    # grid points) and it is no longer accepted by recent scikit-learn releases.
    'model__max_features': ['sqrt', 'log2'],
    'model__max_depth': [10, 11, 12],
    'model__criterion': ['gini', 'entropy'],
}

# Grid search scored with the F2 scorer
grid_rand_svmsmote = GridSearchCV(pipe_rand_svmsmote, param_svmgrid, cv=6, n_jobs=2, scoring=ftwo_scorer)
grid_rand_svmsmote.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_rand_svmsmote.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_rand_svmsmote.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_rand_svmsmote.score(X_train,y_train)}')
print(f'Test score is {grid_rand_svmsmote.score(X_val,y_val)}')

Best Mean cross-validation score: 0.87
Best parameters: {'model__criterion': 'entropy', 'model__max_depth': 11, 'model__max_features': 'log2', 'model__n_estimators': 100, 'svmsmote__k_neighbors': 5}
Train score is 0.9895227008149009
Test score is 0.830945558739255


In [105]:
# Persist the fitted SVMSMOTE + random-forest grid search to disk with joblib.
dump(value=grid_rand_svmsmote, filename='rfc svmsmote.joblib')

['rfc svmsmote.joblib']

## Easy Ensemble Classifier (balanced bagging with random under-sampling)

### Easy Ensemble Classifier with base estimator - xgboost

In [52]:
# EasyEnsemble with XGBoost as the learner fitted on each balanced bag.
# random_state is fixed so the resampling, and therefore the scores, are repeatable.
model = EasyEnsembleClassifier(base_estimator=XGBClassifier(), random_state=0)

# Number of balanced bags to try
param_eec = {'n_estimators': [30, 50, 100]}

# Evaluation procedure: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid search scored with the F2 scorer
grid_eec1 = GridSearchCV(model, param_eec, cv=cv, return_train_score = True, scoring=ftwo_scorer)
grid_eec1.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(grid_eec1.best_score_))
print()

# Labels name the model that is actually fitted here (previously printed as "Decision Tree")
print('Easy Ensemble (XGBoost) parameters: ', grid_eec1.best_params_)

# F2 of the refit best model on the training split and on the held-out X_val / y_val split
print("Easy Ensemble (XGBoost) Performance Train: ", grid_eec1.score(X_train,y_train))
print("Easy Ensemble (XGBoost) Performance Test: ", grid_eec1.score(X_val,y_val))

Best Mean Cross-validation score: 0.59

Decision Tree parameters:  {'n_estimators': 100}
Decision Tree Performance Train:  0.6515151515151516
Decision Tree Performance Test:  0.6190476190476191


In [79]:
# Persist the fitted EasyEnsemble (XGBoost learner) grid search to disk with joblib.
dump(value=grid_eec1, filename='easy ensembli xgb.joblib')

['easy ensembli xgb.joblib']

### Easy Ensemble Classifier with base estimator - Adaboost

In [53]:
# EasyEnsemble with no base estimator passed, so the library default learner is used
# (AdaBoost, per the section heading).
eec = EasyEnsembleClassifier(random_state=0)

# Number of balanced bags to try
param_eec = {'n_estimators': [10, 20, 30]}

# Evaluation procedure: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid search scored with the F2 scorer
grid_eec = GridSearchCV(eec, param_eec, cv=cv, return_train_score = True, scoring=ftwo_scorer)
grid_eec.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(grid_eec.best_score_))
print()

# Labels name the model that is actually fitted here (previously printed as "Decision Tree")
print('Easy Ensemble (AdaBoost) parameters: ', grid_eec.best_params_)

# F2 of the refit best model on the training split and on the held-out X_val / y_val split
print("Easy Ensemble (AdaBoost) Performance Train: ", grid_eec.score(X_train,y_train))
print("Easy Ensemble (AdaBoost) Performance Test: ", grid_eec.score(X_val,y_val))

Best Mean Cross-validation score: 0.58

Decision Tree parameters:  {'n_estimators': 30}
Decision Tree Performance Train:  0.6403574087862993
Decision Tree Performance Test:  0.611111111111111


In [80]:
# Persist the fitted EasyEnsemble (default learner) grid search to disk with joblib.
dump(value=grid_eec, filename='easy ensembli adab.joblib')

['easy ensembli adab.joblib']

## XgBoost with oversampling

### XgBoost with SMOTE Trial 1

In [54]:
# SMOTE oversampling followed by XGBoost, tuned together in one pipeline.
# SMOTE is seeded (like the classifier) so repeated runs give the same scores.
pipe_xgbc_smote = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', XGBClassifier(random_state=0)),
])

param_grid = {
    'smote__k_neighbors': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.1, 0.2, 0.6, 0.8],
    'model__min_child_weight': [1, 3, 5, 7],
}

# Grid search scored with the F2 scorer
grid_xgbc_smote = GridSearchCV(pipe_xgbc_smote, param_grid, cv=5, n_jobs=2, scoring=ftwo_scorer)
grid_xgbc_smote.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_xgbc_smote.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_xgbc_smote.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_xgbc_smote.score(X_train,y_train)}')
print(f'Test score is {grid_xgbc_smote.score(X_val,y_val)}')

Best Mean cross-validation score: 0.88
Best parameters: {'model__learning_rate': 0.8, 'model__min_child_weight': 7, 'smote__k_neighbors': 3}
Train score is 0.9976798143851509
Test score is 0.8096590909090908


In [81]:
# Persist the fitted SMOTE + XGBoost grid search to disk with joblib.
dump(value=grid_xgbc_smote, filename='xgb smote.joblib')

['xgb smote.joblib']

### XgBoost with SMOTE Trial 2

In [70]:
# Second SMOTE + XGBoost search, over a different grid of learning rates and
# min_child_weight values than Trial 1.
# SMOTE is seeded (like the classifier) so repeated runs give the same scores.
pipe_xgbc_smote3 = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', XGBClassifier(random_state=0)),
])

param_grid3 = {
    'smote__k_neighbors': [1, 2, 3, 4],
    'model__learning_rate': [0.7, 0.8, 0.9, 0.95],
    'model__min_child_weight': [5, 6, 7, 8],
}

# Grid search scored with the F2 scorer
grid_xgbc_smote3 = GridSearchCV(pipe_xgbc_smote3, param_grid3, cv=5, n_jobs=2, scoring=ftwo_scorer)
grid_xgbc_smote3.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.4f}".format(grid_xgbc_smote3.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_xgbc_smote3.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_xgbc_smote3.score(X_train,y_train)}')
print(f'Test score is {grid_xgbc_smote3.score(X_val,y_val)}')

Best Mean cross-validation score: 0.8873
Best parameters: {'model__learning_rate': 0.7, 'model__min_child_weight': 8, 'smote__k_neighbors': 2}
Train score is 0.9965237543453072
Test score is 0.8073654390934843


### XgBoost with SVMSMOTE

In [55]:
# SVMSMOTE oversampling followed by XGBoost, tuned together in one pipeline.
# The sampler is seeded (like the classifier) so repeated runs give the same scores.
pipe_xgbc_smote1 = Pipeline([
    ('svmsmote', SVMSMOTE(random_state=0)),
    ('model', XGBClassifier(random_state=0)),
])

param_grid = {
    'svmsmote__k_neighbors': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.1, 0.2, 0.6, 0.8],
    'model__min_child_weight': [1, 3, 5, 7],
}

# Grid search scored with the F2 scorer
grid_xgbc_smote1 = GridSearchCV(pipe_xgbc_smote1, param_grid, cv=6, n_jobs=2, scoring=ftwo_scorer)
grid_xgbc_smote1.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_xgbc_smote1.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_xgbc_smote1.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_xgbc_smote1.score(X_train,y_train)}')
print(f'Test score is {grid_xgbc_smote1.score(X_val,y_val)}')

Best Mean cross-validation score: 0.88
Best parameters: {'model__learning_rate': 0.2, 'model__min_child_weight': 1, 'svmsmote__k_neighbors': 3}
Train score is 1.0
Test score is 0.8192090395480225


In [82]:
# Persist the fitted SVMSMOTE + XGBoost grid search to disk with joblib.
dump(value=grid_xgbc_smote1, filename='xgb svmsmote.joblib')

['xgb svmsmote.joblib']

### XgBoost with ADASYN

In [56]:
# ADASYN oversampling followed by XGBoost, tuned together in one pipeline.
# The sampler is seeded (like the classifier) so repeated runs give the same scores.
pipe_xgbc_ada = Pipeline([
    ('adasyn', ADASYN(random_state=0)),
    ('model', XGBClassifier(random_state=0)),
])

param_grid = {
    'adasyn__n_neighbors': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.1, 0.2, 0.6, 0.8],
    'model__min_child_weight': [1, 3, 5, 7],
}

# Grid search scored with the F2 scorer
grid_xgbc_ada = GridSearchCV(pipe_xgbc_ada, param_grid, cv=6, n_jobs=2, scoring=ftwo_scorer)
grid_xgbc_ada.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_xgbc_ada.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_xgbc_ada.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_xgbc_ada.score(X_train,y_train)}')
print(f'Test score is {grid_xgbc_ada.score(X_val,y_val)}')

Best Mean cross-validation score: 0.86
Best parameters: {'adasyn__n_neighbors': 4, 'model__learning_rate': 0.6, 'model__min_child_weight': 1}
Train score is 1.0
Test score is 0.8263305322128852


In [83]:
# Persist the fitted ADASYN + XGBoost grid search to disk with joblib.
dump(value=grid_xgbc_ada, filename='xgb ada.joblib')

['xgb ada.joblib']

## Neural Network 

### Neural Network - MLP Classifier

In [61]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron (hidden layers of 5 and 2 units) with no resampling step.
mlpclf = MLPClassifier(random_state=0, hidden_layer_sizes=(5, 2))

# Regularisation strength and optimiser to search over
param_grid = dict(
    alpha=[1e-5, 1e-4],
    solver=['lbfgs', 'sgd', 'adam'],
)

# Grid search scored with the F2 scorer
grid_mlpclf = GridSearchCV(
    estimator=mlpclf,
    param_grid=param_grid,
    scoring=ftwo_scorer,
    cv=6,
    n_jobs=2,
)
grid_mlpclf.fit(X_train,y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_mlpclf.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_mlpclf.best_params_))

# F2 of the refit best model on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_mlpclf.score(X_train,y_train)}')
print(f'Test score is {grid_mlpclf.score(X_val,y_val)}')

Best Mean cross-validation score: 0.83
Best parameters: {'alpha': 1e-05, 'solver': 'adam'}
Train score is 0.8767772511848341
Test score is 0.7879656160458453


In [84]:
# Persist the fitted MLP grid search to disk with joblib.
dump(value=grid_mlpclf, filename='neural net.joblib')

['neural net.joblib']

### MLP with SMOTE

In [92]:
# SMOTE oversampling followed by the same small MLP, tuned together in one pipeline.
# SMOTE is seeded (like the MLP) so repeated runs give the same scores.
pipe_smotemlp = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', MLPClassifier(hidden_layer_sizes=(5, 2), random_state=0)),
])

param_smotemlp = {
    'smote__k_neighbors': [1, 2, 3, 4, 5, 6],
    'model__alpha': [1e-5, 1e-4],
    'model__solver': ['lbfgs', 'sgd', 'adam'],
}

# Grid search scored with the F2 scorer
grid_smote_mlp = GridSearchCV(pipe_smotemlp, param_smotemlp, cv=5, n_jobs=2, scoring=ftwo_scorer)
grid_smote_mlp.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.4f}".format(grid_smote_mlp.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_smote_mlp.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_smote_mlp.score(X_train,y_train)}')
print(f'Test score is {grid_smote_mlp.score(X_val,y_val)}')

Best Mean cross-validation score: 0.7145
Best parameters: {'model__alpha': 0.0001, 'model__solver': 'adam', 'smote__k_neighbors': 6}
Train score is 0.8465346534653465
Test score is 0.7195121951219512


## LDA - Linear Discriminant Analysis

In [71]:
# NOTE(review): the LinearDiscriminantAnalysis class used in this section is imported from
# scikit-learn in the next cell, and no cell visible in this notebook imports the `lda`
# package installed here. The PyPI `lda` distribution appears to be a latent Dirichlet
# allocation (topic-modelling) library — confirm this install is actually needed.
pip install lda

Collecting lda
  Downloading https://files.pythonhosted.org/packages/4b/5e/11c73af7335942b9ece20f965271ea632cc26d26d034598502ee4b982251/lda-1.1.0-cp37-cp37m-win_amd64.whl (341kB)
Collecting pbr<4,>=0.6 (from lda)
  Downloading https://files.pythonhosted.org/packages/0c/5d/b077dbf309993d52c1d71e6bf6fe443a8029ea215135ebbe0b1b10e7aefc/pbr-3.1.1-py2.py3-none-any.whl (99kB)
Installing collected packages: pbr, lda
Successfully installed lda-1.1.0 pbr-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [72]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

### LDA with SMOTE

In [76]:
# SMOTE oversampling followed by Linear Discriminant Analysis, tuned in one pipeline.
# SMOTE is seeded so repeated runs of the search give the same scores.
pipe_lda = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', LDA()),
])

param_lda = {
    'smote__k_neighbors': [1, 2, 3, 4, 5, 6],
    # The original list contained 1.0e-3 twice (also written as 0.001); the duplicate is
    # dropped so the same candidate is not fitted twice.
    'model__tol': [1.0e-4, 1.0e-3, 1.0e-5, 0.01],
}

# Grid search scored with the F2 scorer
grid_smote_lda = GridSearchCV(pipe_lda, param_lda, cv=5, n_jobs=2, scoring=ftwo_scorer)
grid_smote_lda.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.4f}".format(grid_smote_lda.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_smote_lda.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_smote_lda.score(X_train,y_train)}')
print(f'Test score is {grid_smote_lda.score(X_val,y_val)}')

Best Mean cross-validation score: 0.7222
Best parameters: {'model__tol': 0.001, 'smote__k_neighbors': 1}
Train score is 0.727699530516432
Test score is 0.6750572082379862


In [85]:
# Persist the fitted SMOTE + LDA grid search to disk with joblib.
dump(value=grid_smote_lda, filename='lda smote.joblib')

['lda smote.joblib']

### LDA with SVMSMOTE

In [77]:
# SVMSMOTE oversampling followed by Linear Discriminant Analysis, tuned in one pipeline.
# The sampler is seeded so repeated runs of the search give the same scores.
pipe_ldasvm = Pipeline([
    ('svmsmote', SVMSMOTE(random_state=0)),
    ('model', LDA()),
])

param_ldasvm = {
    'svmsmote__k_neighbors': [1, 2, 3, 4, 5],
    # The original list contained 1.0e-3 twice (also written as 0.001); the duplicate is
    # dropped so the same candidate is not fitted twice.
    'model__tol': [1.0e-4, 1.0e-3, 1.0e-5, 0.01],
}

# Grid search scored with the F2 scorer
grid_svmsmote_lda = GridSearchCV(pipe_ldasvm, param_ldasvm, cv=6, n_jobs=2, scoring=ftwo_scorer)
grid_svmsmote_lda.fit(X_train, y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean cross-validation score: {:.2f}".format(grid_svmsmote_lda.best_score_))

# Best parameter combination
print("Best parameters: {}".format(grid_svmsmote_lda.best_params_))

# F2 of the refit best pipeline on the training split and on the held-out X_val / y_val split
print(f'Train score is {grid_svmsmote_lda.score(X_train,y_train)}')
print(f'Test score is {grid_svmsmote_lda.score(X_val,y_val)}')

Best Mean cross-validation score: 0.73
Best parameters: {'model__tol': 1e-05, 'svmsmote__k_neighbors': 1}
Train score is 0.7359924026590694
Test score is 0.7040572792362767


In [86]:
# Persist the fitted SVMSMOTE + LDA grid search to disk with joblib.
# The fitted search object is saved here; the original call saved the parameter-grid
# dict (param_ldasvm) by mistake, so the file held no model.
dump(grid_svmsmote_lda, 'lda svmsmote.joblib')

['lda svmsmote.joblib']

# Stacking 

## Stacking top data sampling algorithms

In [88]:
# Stack the best pipelines found by three of the sampling grid searches; a decision
# tree combines their outputs as the final estimator.
sclf_datasamp = StackingClassifier(
    final_estimator=DecisionTreeClassifier(random_state=0),
    estimators=[
        ('radmf', grid_rand_smote.best_estimator_),
        ('xgbcsvmsmote', grid_xgbc_smote1.best_estimator_),
        ('xgbcsmote', grid_xgbc_smote.best_estimator_),
    ],
)

# Search over the depth of the final tree and over the stacking method
sclfdatasamp_param = dict(
    final_estimator__max_depth=range(2,20),
    stack_method=['auto', 'predict_proba'],
)

sclf_datasamp_grid = GridSearchCV(
    estimator=sclf_datasamp,
    param_grid=sclfdatasamp_param,
    scoring=ftwo_scorer,
    cv=6,
    return_train_score=True,
)
sclf_datasamp_grid.fit(X_train,y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(sclf_datasamp_grid.best_score_))
print()

# Best parameter combination
print('Best parameters: ',sclf_datasamp_grid.best_params_)

# F2 of the refit best stack on the training split and on the held-out X_val / y_val split
print("Train score : ",sclf_datasamp_grid.score(X_train,y_train))
print("Test score : ",sclf_datasamp_grid.score(X_val,y_val))

Best Mean Cross-validation score: 0.86

Best parameters:  {'final_estimator__max_depth': 3, 'stack_method': 'predict_proba'}
Train score :  1.0
Test score :  0.802292263610315


In [91]:
# Persist the fitted stacking grid search to disk with joblib.
dump(value=sclf_datasamp_grid, filename='Stacking data sampl.joblib')

['Stacking data sampl.joblib']

## Stacking top 4 (cost sensitive and data sampling ) models

In [73]:
# Load the object stored in 'etc cost.joblib' (its .best_estimator_ is used by the
# stacking and voting cells). No cell visible in this notebook writes that file, so it
# has to be produced elsewhere first.
# joblib.load unpickles the file: only load files that come from a trusted source.
from joblib import dump, load
etc_grid_bal = load(filename='etc cost.joblib')

In [74]:
# Reload the SVMSMOTE + XGBoost grid search from 'xgb svmsmote.joblib'.
# joblib.load unpickles the file: only load files that come from a trusted source.
from joblib import dump, load
grid_xgbc_smote1 = load(filename='xgb svmsmote.joblib')

In [75]:
# Reload the SMOTE + random-forest grid search from 'rfc data sampling.joblib'.
# joblib.load unpickles the file: only load files that come from a trusted source.
grid_rand_smote = load(filename='rfc data sampling.joblib')

In [76]:
# Load the object stored in 'xgb cost.joblib' (its .best_estimator_ is used by the
# stacking cells). No cell visible in this notebook writes that file, so it has to be
# produced elsewhere first.
# joblib.load unpickles the file: only load files that come from a trusted source.
grid_xgboost_bal = load(filename='xgb cost.joblib')

In [79]:
# Stack four reloaded best estimators (two loaded from the "cost" files, two from the
# sampling searches); a logistic regression combines their outputs.
sclf8 = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=[
        ('etc', etc_grid_bal.best_estimator_),
        ('rfc', grid_rand_smote.best_estimator_),
        ('xgb cost', grid_xgboost_bal.best_estimator_),
        ('xgbcsvmsmote',grid_xgbc_smote1.best_estimator_),
    ],
)

# Search over the regularisation strength of the final estimator and the stacking method
sclf_param8 = dict(
    final_estimator__C=[0.001, 0.01, 0.1, 1],
    stack_method=['auto', 'predict_proba'],
)

sclf_grid8 = GridSearchCV(
    estimator=sclf8,
    param_grid=sclf_param8,
    scoring=ftwo_scorer,
    cv=6,
    return_train_score=True,
)
sclf_grid8.fit(X_train,y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(sclf_grid8.best_score_))
print()

# Best parameter combination
print('Best parameters: ',sclf_grid8.best_params_)

# F2 of the refit best stack on the training split and on the held-out X_val / y_val split
print("Train score : ",sclf_grid8.score(X_train,y_train))
print("Test score : ",sclf_grid8.score(X_val,y_val))

Best Mean Cross-validation score: 0.86

Best parameters:  {'final_estimator__C': 1, 'stack_method': 'auto'}
Train score :  1.0
Test score :  0.8262108262108261


## Stacking overall top models 

In [84]:
# Stack three reloaded best estimators; a logistic regression combines their outputs.
# This cell reuses (and overwrites) the names sclf8 / sclf_param8 / sclf_grid8 that the
# previous stacking cell also assigns.
# class_weight={0: 1, 1: 1} gives both classes a weight of one.
sclf8 = StackingClassifier(
    final_estimator=LogisticRegression(class_weight = {0: 1, 1: 1}),
    estimators=[
        ('rfc data sampling', grid_rand_smote.best_estimator_),
        ('etc', etc_grid_bal.best_estimator_),
        ('xgb svmsmote',grid_xgbc_smote1.best_estimator_),
    ],
)

# Search over the regularisation strength of the final estimator and the stacking method
sclf_param8 = dict(
    final_estimator__C=[0.001, 0.01, 0.1, 1],
    stack_method=['auto', 'predict_proba'],
)

sclf_grid8 = GridSearchCV(
    estimator=sclf8,
    param_grid=sclf_param8,
    scoring=ftwo_scorer,
    cv=6,
    return_train_score=True,
)
sclf_grid8.fit(X_train,y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(sclf_grid8.best_score_))
print()

# Best parameter combination
print('Best parameters: ',sclf_grid8.best_params_)

# F2 of the refit best stack on the training split and on the held-out X_val / y_val split
print("Train score : ",sclf_grid8.score(X_train,y_train))
print("Test score : ",sclf_grid8.score(X_val,y_val))

Best Mean Cross-validation score: 0.86

Best parameters:  {'final_estimator__C': 1, 'stack_method': 'auto'}
Train score :  1.0
Test score :  0.8092485549132947


# Voting of best overall models

In [85]:
# Load the object stored in 'dtree basic.joblib' (its .best_estimator_ is used by the
# voting cells). No cell visible in this notebook writes that file, so it has to be
# produced elsewhere first.
# joblib.load unpickles the file: only load files that come from a trusted source.
grid_dtree = load(filename='dtree basic.joblib')

In [93]:
# Reload the ADASYN + XGBoost grid search from 'xgb ada.joblib'.
# joblib.load unpickles the file: only load files that come from a trusted source.
grid_xgbc_ada = load(filename='xgb ada.joblib')

In [95]:
# Load the object stored in 'bagging cost1.joblib' (its .best_estimator_ is used by the
# voting cells). No cell visible in this notebook writes that file, so it has to be
# produced elsewhere first.
# joblib.load unpickles the file: only load files that come from a trusted source.
grid_param_dt_bag = load(filename='bagging cost1.joblib')

In [98]:
# Voting ensemble over six reloaded / previously fitted best estimators.
vclf_all = VotingClassifier(
    estimators=[
        ('dtree basic', grid_dtree.best_estimator_),
        ('xgb svmsmote',grid_xgbc_smote1.best_estimator_),
        ('rfc data sampling', grid_rand_smote.best_estimator_),
        ('etc cost', etc_grid_bal.best_estimator_),
        ('xgb adn', grid_xgbc_ada.best_estimator_),
        ('bagg', grid_param_dt_bag.best_estimator_),
    ],
)

# Compare hard voting with soft voting
vclf_all_param = dict(voting=['hard','soft'])

vclf_all_grid = GridSearchCV(
    estimator=vclf_all,
    param_grid=vclf_all_param,
    scoring=ftwo_scorer,
    cv=6,
    return_train_score=True,
)
vclf_all_grid.fit(X_train,y_train)

# Mean cross-validation score of the best parameter combination
print("Best Mean Cross-validation score: {:.2f}".format(vclf_all_grid.best_score_))
print()

# Best parameter combination
print('Best parameters: ',vclf_all_grid.best_params_)

# F2 of the refit best ensemble on the training split and on the held-out X_val / y_val split
print("Train score : ",vclf_all_grid.score(X_train,y_train))
print("Test score : ",vclf_all_grid.score(X_val,y_val))

Best Mean Cross-validation score: 0.87

Best parameters:  {'voting': 'soft'}
Train score :  1.0
Test score :  0.8189655172413792


# Prediction on Test Data

In [81]:
# Feature matrix for the test set: every column of `test` except 'Id'.
X_test = test.drop(columns=['Id'])
X_test.shape

(24846, 29)

In [82]:
# Apply the fitted preprocessing to the test features.
# The input is rebuilt from `test` instead of being read back from X_test, so re-running
# this cell does not apply the transformation a second time.
# NOTE(review): data_preprocess is not defined in any cell visible in this notebook; it
# must already exist in the kernel.
X_test = data_preprocess.transform(test.drop(['Id'], axis=1))

## Final model - SMOTE RFC

In [66]:
# Final submission model: SMOTE + random forest, using the hyper-parameters reported as
# best by the SMOTE random-forest grid search.
# SMOTE is seeded (like the forest) so the generated submission file is reproducible.
kaggle_model = Pipeline([
    ('smote', SMOTE(k_neighbors=5, random_state=0)),
    ('model', RandomForestClassifier(random_state=0, n_estimators=50, max_features='log2',
                                     max_depth=12, criterion='entropy')),
])

kaggle_model.fit(X_train, y_train)

# Predict labels for the test features (X_test is expected to be preprocessed already)
test_data_labels = kaggle_model.predict(X_test)

# Write the predictions in submission format
pd.DataFrame({'Id': test.Id, 'Target': test_data_labels}).to_csv('solution_base_rfcsmote.csv', index =False)


## Final model - SVMSMOTE XGB

In [87]:
# Final submission model: SVMSMOTE + XGBoost, using the hyper-parameters reported as
# best by the SVMSMOTE XGBoost grid search.
# The sampler is seeded (like the classifier) so the submission file is reproducible.
kaggle_model3 = Pipeline([
    ('svmsmote', SVMSMOTE(k_neighbors=3, random_state=0)),
    ('model', XGBClassifier(random_state=0, learning_rate=0.2, min_child_weight=1)),
])

kaggle_model3.fit(X_train, y_train)

# Predict labels for the test features (X_test is expected to be preprocessed already)
test_data_labels = kaggle_model3.predict(X_test)

# Write the predictions in submission format
pd.DataFrame({'Id': test.Id, 'Target': test_data_labels}).to_csv('solution_xgbost_svm.csv', index =False)

## Final model - top data sampling models with stacking

In [89]:
# Submission from the stack of sampling models, with stack_method='predict_proba' and a
# depth-3 decision tree as the final estimator.
kaggle_model5 = StackingClassifier(
    estimators=[
        ('radmf', grid_rand_smote.best_estimator_),
        ('xgbcsvmsmote', grid_xgbc_smote1.best_estimator_),
        ('xgbcsmote', grid_xgbc_smote.best_estimator_),
    ],
    final_estimator=DecisionTreeClassifier(random_state=0, max_depth = 3 ),
    stack_method='predict_proba',
)

kaggle_model5.fit(X_train,y_train)

# Predict labels for the test features
test_data_labels = kaggle_model5.predict(X_test)

# Write the predictions in submission format
submission = pd.DataFrame({'Id': test['Id'], 'Target': test_data_labels})
submission.to_csv('solution_stackdatasamp.csv', index=False)

In [90]:
# Same stack of sampling models as kaggle_model5, but with stack_method='auto'.
kaggle_model6 = StackingClassifier(
    estimators=[
        ('radmf', grid_rand_smote.best_estimator_),
        ('xgbcsvmsmote', grid_xgbc_smote1.best_estimator_),
        ('xgbcsmote', grid_xgbc_smote.best_estimator_),
    ],
    final_estimator=DecisionTreeClassifier(random_state=0, max_depth = 3 ),
    stack_method='auto',
)

kaggle_model6.fit(X_train,y_train)

# Predict labels for the test features
test_data_labels = kaggle_model6.predict(X_test)

# Write the predictions in submission format
submission = pd.DataFrame({'Id': test['Id'], 'Target': test_data_labels})
submission.to_csv('solution_stackdatasamp_auto.csv', index=False)

## Final model - overall models with stacking (final estimator - logis)

In [83]:
# Submission from the four-model stack with a logistic-regression final estimator (C=1).
kaggle_model11 = StackingClassifier(
    estimators=[
        ('etc', etc_grid_bal.best_estimator_),
        ('rfc', grid_rand_smote.best_estimator_),
        ('xgb cost', grid_xgboost_bal.best_estimator_),
        ('xgbcsvmsmote',grid_xgbc_smote1.best_estimator_),
    ],
    final_estimator=LogisticRegression(C = 1),
    stack_method='auto',
)

kaggle_model11.fit(X_train,y_train)

# Predict labels for the test features
test_data_labels = kaggle_model11.predict(X_test)

# Write the predictions in submission format
submission = pd.DataFrame({'Id': test['Id'], 'Target': test_data_labels})
submission.to_csv('solution_overall top models - Stacklog.csv', index=False)

## Final model - overall models with stacking (final estimator - logis with class weights)

In [104]:
# Submission from the three-model stack; the logistic-regression final estimator uses
# C=1 and a class weight of one for both classes.
kaggle_model21 = StackingClassifier(
    estimators=[
        ('rfc data sampling', grid_rand_smote.best_estimator_),
        ('etc', etc_grid_bal.best_estimator_),
        ('xgb svmsmote',grid_xgbc_smote1.best_estimator_),
    ],
    final_estimator=LogisticRegression(C = 1, class_weight = {0: 1, 1: 1}),
    stack_method='auto',
)

kaggle_model21.fit(X_train,y_train)

# Predict labels for the test features
test_data_labels = kaggle_model21.predict(X_test)

# Write the predictions in submission format
submission = pd.DataFrame({'Id': test['Id'], 'Target': test_data_labels})
submission.to_csv('solution_overall top models - Stacklog with class weights.csv', index=False)

## Final model - overall models with voting

In [99]:
# Submission from the six-model soft-voting ensemble.
kaggle_model13 = VotingClassifier(
    estimators=[
        ('dtree basic', grid_dtree.best_estimator_),
        ('xgb svmsmote',grid_xgbc_smote1.best_estimator_),
        ('rfc data sampling', grid_rand_smote.best_estimator_),
        ('etc cost', etc_grid_bal.best_estimator_),
        ('xgb adn', grid_xgbc_ada.best_estimator_),
        ('bagg', grid_param_dt_bag.best_estimator_),
    ],
    voting='soft',
)
kaggle_model13.fit(X_train,y_train)

# Predict labels for the test features
test_data_labels = kaggle_model13.predict(X_test)

# Write the predictions in submission format
submission = pd.DataFrame({'Id': test['Id'], 'Target': test_data_labels})
submission.to_csv('solution_Voting overall.csv', index=False)