In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [2]:
def average(df):
    sum = 0
    count = 0
    for x in df['f1']:
        if x!=-1:
            sum+=x
            count+=1
    avg = sum/count
    return avg


In [3]:
def results(model, X_train, X_valid,y_train, y_valid):
    soft_probs = model.predict_proba(X_valid)
    valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
    print(valid_auc)
    train_preds = model.predict(X_train)
    acc = accuracy_score(y_train, train_preds)
    print(acc)
    preds = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    print(acc)
    print(classification_report(y_valid,preds))

In [4]:
#Timer function
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [5]:
df = pd.read_csv("train_final.csv")
df.describe()
f1_avg = average(df)
df['f1'].replace(-1,f1_avg, inplace = True)
df.describe()

Unnamed: 0,Id,Y,f1,f2,f3,f4,f5,f6,f7,f8,...,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24
count,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,...,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0
mean,8192.0,0.942135,43031.41572,1.044375,11.770938,118323.581456,1.044436,0.050052,117089.674113,169730.1786,...,25894.316914,119045.099005,184622.040835,1.047305,125959.667765,1.044558,1.045718,1.041934,32718.9,1.043948
std,4729.509065,0.233495,33596.053696,0.264806,353.187115,4518.059755,0.265601,0.293892,10261.29297,69396.677853,...,36086.993946,18321.987129,100590.811845,0.306239,31091.344158,0.262576,0.266874,0.246597,3184929.0,0.25964
min,1.0,0.0,37.0,1.0,1.77,23779.0,1.0,0.0,4292.0,4673.0,...,25.0,4674.0,3130.0,1.0,117879.0,1.0,1.0,1.0,1.0,1.0
25%,4096.5,1.0,20331.0,1.0,1.77,118096.0,1.0,0.0,117961.0,117906.0,...,4554.0,118395.0,118398.0,1.0,118274.0,1.0,1.0,1.0,1.0,1.0
50%,8192.0,1.0,35530.0,1.0,1.77,118300.0,1.0,0.0,117961.0,128130.0,...,13234.0,118929.0,119095.0,1.0,118568.0,1.0,1.0,1.0,2.0,1.0
75%,12287.5,1.0,74240.5,1.0,3.54,118386.0,1.0,0.0,117961.0,234498.5,...,38902.0,120539.0,290919.0,1.0,120006.0,1.0,1.0,1.0,9.0,1.0
max,16383.0,1.0,312152.0,7.0,43910.16,286791.0,9.0,10.0,311178.0,311867.0,...,311696.0,286792.0,308574.0,18.0,311867.0,8.0,8.0,7.0,404288600.0,8.0


In [6]:
y = df.loc[:,'Y']
X = df.loc[:,'f1':'f24']
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size = 0.75, test_size = 0.25,random_state = 42, shuffle = True)

In [7]:
model_xgb = XGBClassifier(n_estimators=800, max_depth=4, learning_rate=0.05, n_jobs=-1, random_state=42) 
model_xgb.fit(X_train, y_train)
soft_probs = model_xgb.predict_proba(X_valid)
valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
print(valid_auc)
train_preds = model_xgb.predict(X_train)
acc = accuracy_score(y_train, train_preds)
print(acc)
preds = model_xgb.predict(X_valid)
acc = accuracy_score(y_valid, preds)
print(acc)

0.8648734965318772
0.9721657035891593
0.961181640625


Tuning without feature engineering

In [7]:
params = {
        'min_child_weight': [1, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'max_depth': [4, 5, 6],
        'n_estimators': [800, 1000, 1200]
        }

In [15]:
xgb = XGBClassifier(learning_rate=0.05,
                    silent=True, nthread=1)

In [17]:
# folds = 5
# param_comb = 20 #number of random parameter combos to pick
# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
# grid_search = GridSearchCV(xgb, param_grid=params,scoring='roc_auc', 
#                             n_jobs=-1, cv=skf.split(X_train,y_train),verbose=3)
# start_time = timer(None) # timing starts from this point for "start_time" variable
# model4 = grid_search.fit(X_train, y_train)
# timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 59.7min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 87.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 92.8min finished


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 1 hours 33 minutes and 3.72 seconds.


In [18]:
soft_probs = model4.predict_proba(X_valid)
valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
print(valid_auc)
train_preds = model4.predict(X_train)
acc = accuracy_score(y_train, train_preds)
print(acc)
preds = model4.predict(X_valid)
acc = accuracy_score(y_valid, preds)
print(acc)

0.875433515348288
0.9798160657605599
0.9619140625


In [19]:
print(model4.best_params_)

{'colsample_bytree': 0.6, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 1000, 'subsample': 0.8}


{'colsample_bytree': 0.6, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 1000, 'subsample': 0.8} These are the results from gridsearch

Slight improvement but the grid search was not worth it. Lets just submit this for the sake of a submission. 

In [22]:
testdf = pd.read_csv("test_final.csv")
testdf['f1'].replace(-1,f1_avg, inplace = True)
X_test = testdf.loc[:,'f1':'f24']
#X_test = scaler.transform(X_test)
test_preds = model_grid.predict_proba(X_test)
sub3 = {"Id":testdf['Id'],"Y":test_preds[:,1]}
sub3 = pd.DataFrame(data=sub3)
sub3.to_csv("submissions/sub3.csv", index=False)

Submission improved score from previous XGBoost. from 0.87347 to 0.88364

In [8]:
model_grid = XGBClassifier(n_estimators = 1000,colsample_bytree=0.6,learning_rate=0.05, njobs=-1,
                    max_depth = 4, random_state=42)
model_grid.fit(X_train, y_train)

Parameters: { njobs } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, njobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [9]:
soft_probs = model_grid.predict_proba(X_valid)
valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
print(valid_auc)
train_preds = model_grid.predict(X_train)
acc = accuracy_score(y_train, train_preds)
print(acc)
preds = model_grid.predict(X_valid)
acc = accuracy_score(y_valid, preds)
print(acc)

0.8714903704250295
0.9750142426955318
0.961669921875


In [10]:
#What if I normalize all data and use it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
model_grid.fit(X_train, y_train)

Parameters: { njobs } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, njobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [11]:
soft_probs = model_grid.predict_proba(X_valid)
valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
print(valid_auc)
train_preds = model_grid.predict(X_train)
acc = accuracy_score(y_train, train_preds)
print(acc)
preds = model_grid.predict(X_valid)
acc = accuracy_score(y_valid, preds)
print(acc)

0.8714938293240849
0.9750142426955318
0.961669921875


Normalizing did not make ANY significant changes to the scores. Continue tuning without normalization. Continue Random search CV instead of grid. Ill now build off of the grid search results. 

In [136]:
y = df.loc[:,'Y']
X = df.loc[:,'f1':'f24']
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size = 0.75, test_size = 0.25,random_state = 42, shuffle = True)

In [114]:
params = {
        'n_estimators' : [1530,1550,1570],
        'learning_rate' : [0.01,0.011,0.012],
        'colsample_bytree':[0.3,0.4],
        'colsample_bylevel':[0.5,0,6]
        }

In [115]:
model_tune = XGBClassifier(subsample = 0.7,max_depth=8,
                            n_jobs=-1,base_score = 0.55,
                            random_state=42,)

In [117]:
folds = 6
param_comb = 25 #number of random parameter combos to pick
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
random_search = RandomizedSearchCV(model_tune, param_distributions=params,
                                   n_iter=param_comb, scoring='roc_auc', 
                                   n_jobs=-1, cv=skf.split(X_train,y_train), 
                                   verbose=3, random_state=42 )
start_time = timer(None) # timing starts from this point for "start_time" variable
tuned = random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.4min finished



 Time taken: 0 hours 6 minutes and 38.11 seconds.


In [118]:
results(tuned,X_train, X_valid, y_train, y_valid)

0.8808005737160567
0.9777813949702938
0.960205078125
              precision    recall  f1-score   support

           0       0.92      0.30      0.45       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.94      0.65      0.72      4096
weighted avg       0.96      0.96      0.95      4096



In [119]:
tuned.best_params_

{'n_estimators': 1530,
 'learning_rate': 0.012,
 'colsample_bytree': 0.3,
 'colsample_bylevel': 0.5}

{'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.03}

In [7]:
model_trial = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
model_trial.fit(X_train, y_train)
results(model_trial,X_train, X_valid, y_train, y_valid)

0.8809746716351832
0.975665337348417
0.960205078125
              precision    recall  f1-score   support

           0       0.92      0.30      0.45       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.94      0.65      0.72      4096
weighted avg       0.96      0.96      0.95      4096



Until now we had NOT been training on the entire dataset LOOL. We can do much better. Lets try the above on the entire dataset.

In [191]:
model_trial.fit(X,y)

XGBClassifier(base_score=0.55, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.011, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1550, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.7, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [194]:
soft_probs = model_trial.predict_proba(X)
valid_auc = roc_auc_score(y, soft_probs[:,1])
print(valid_auc)
train_preds = model_trial.predict(X)
acc = accuracy_score(y, train_preds)
print(acc)

0.9972919648068188
0.9722273087957028


In [None]:
#Counter testing

## good parameter log
After some tuning, lets submit and check what our score is for these params
(colsample_bytree=0.6,subsample = 0.95,max_depth=5,
                            n_estimators=1310, learning_rate =0.02,
                            random_state=42, gamma=1, min_child_weight = 2)
th validation accuracy is pmuch the same however there is some regularization I want to test the performance of. 
(colsample_bytree=0.6,subsample = 0.8,max_depth=4,
                            n_estimators=1300, learning_rate =0.03,
                            random_state=42)
(colsample_bytree=0.4,subsample = 0.7,max_depth=8,
                            n_estimators=1850, learning_rate =0.01,
                            colsample_bylevel=0.7,base_score = 0.55,
                            random_state=42,)
Hit 0.88 auc on validation with this one (colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
Big realization: Had not been fitting on entire dataset after validation. Now I fit it. Time to make a submission with the about best model.

In [195]:
testdf = pd.read_csv("test_final.csv")
testdf['f1'].replace(-1,f1_avg, inplace = True)
X_test = testdf.loc[:,'f1':'f24']
test_preds = model_trial.predict_proba(X_test)
sub5 = {"Id":testdf['Id'],"Y":test_preds[:,1]}
sub5 = pd.DataFrame(data=sub5)
sub5.to_csv("submissions/sub5.csv", index=False)

(colsample_bytree=0.6,subsample = 0.95,max_depth=5,
                            n_estimators=1310, learning_rate =0.02,
                            random_state=42, gamma=1, min_child_weight = 2)Did not work. Still need to tune the parameters

GREAT
(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
XGBoosting on this one improved score on leader board to AUC 0.90550.
Time to continue tuning. 
Notice how the recall is small. Need to improve recall for minority class. 


Now I will explore techniques to address the imbalanced dataset like class weights with the scale_pos_weight parameter. https://stats.stackexchange.com/questions/243207/what-is-the-proper-usage-of-scale-pos-weight-in-xgboost-for-imbalanced-datasets
I will also test out smote - Synthetic Minority Oversampling technique

Lets also start analyzing the confusion matrix for better insights on what parameters to change

In [8]:
model_trial = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
#948/16383 -> neg class to pos class ratio #scale_pos_weight=0.05786
model_trial.fit(X_train, y_train)
results(model_trial,X_train, X_valid, y_train, y_valid)

0.8809746716351832
0.975665337348417
0.960205078125
              precision    recall  f1-score   support

           0       0.92      0.30      0.45       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.94      0.65      0.72      4096
weighted avg       0.96      0.96      0.95      4096



Tried multiple runs with class weights which did not help much. Improved the recall of minority class but ruined precision by a huge margin. The huge margin is also because maybe the class itself is small in the first place.

In [204]:
#SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42,k_neighbors=100)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

(23126, 24)

In [194]:
model_trial = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.3,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
model_trial.fit(X_train_res,y_train_res)
results(model_trial,X_train_res, X_valid, y_train_res, y_valid)

0.8265143060064936
0.9759145550462682
0.940673828125
              precision    recall  f1-score   support

           0       0.44      0.33      0.38       224
           1       0.96      0.98      0.97      3872

    accuracy                           0.94      4096
   macro avg       0.70      0.66      0.68      4096
weighted avg       0.93      0.94      0.94      4096



Oversampling is not making a dent in improving recall. In fact its ruing accuracy as well. Try undersampling and bagging.

In [221]:
from imblearn.ensemble import BalancedBaggingClassifier
model_trial = XGBClassifier(colsample_bytree=0.8,subsample = 0.8,max_depth=9,
                            n_estimators=1500, learning_rate =0.01,
                            colsample_bylevel=0.4,n_jobs=-1,base_score = 0.55,
                            random_state=42)
model_trial_best = XGBClassifier(colsample_bytree=1,subsample = 1,max_depth=4,
                            n_estimators=300, learning_rate =0.05,
                            colsample_bylevel=1,n_jobs=-1,base_score = 0.55,
                            random_state=42,) #Low precision but higher recal (Highest at this time is 0.67)

In [223]:
balanced_bagging = BalancedBaggingClassifier(base_estimator=model_trial_best,
...                                 sampling_strategy='auto',n_estimators=90,
...                                 replacement=True,
...                                 random_state=42)
model_bagged = balanced_bagging.fit(X_train, y_train)
results(model_bagged,X_train, X_valid, y_train, y_valid)

0.843363756272137
0.847155530235208
0.821044921875
              precision    recall  f1-score   support

           0       0.18      0.64      0.28       224
           1       0.98      0.83      0.90      3872

    accuracy                           0.82      4096
   macro avg       0.58      0.74      0.59      4096
weighted avg       0.93      0.82      0.86      4096



Smote and class weights give balanced but low precision and accuracy. Original XGBoost has high accuracy and low recall. Now my balanced bagging gives me the highest recall. Maybe combine the preds?
Lets try that.

In [224]:
model_trial = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.012,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
for_bagging = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1050, learning_rate =0.012,
                            colsample_bylevel=1,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
balanced_bagging = BalancedBaggingClassifier(base_estimator=for_bagging,
                                 sampling_strategy='auto',n_estimators=25,
                                 replacement=True,
                                   random_state=42)
#SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42,sampling_strategy = 0.4)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

model_smote = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.012,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)


model_smote.fit(X_train_res,y_train_res)
model_trial.fit(X_train, y_train)
model_bagged = balanced_bagging.fit(X_train, y_train)

soft_preds_xgb = model_trial.predict_proba(X_valid)
soft_preds_bagged = model_bagged.predict_proba(X_valid)
soft_preds_smote = model_smote.predict_proba(X_valid)

In [219]:
a = 0.5
b=0.5
soft_probs=a*soft_preds_smote + b* soft_preds_bagged
valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
print(valid_auc)

0.8666640532762693


In [220]:
#stacking instead
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
estimators = [
     ('balanced_xgb', balanced_bagging),
     ('simple_xgb',model_trial ),
     ('smote',model_smote ),
 ]
stacked = StackingClassifier(estimators = estimators,
                              cv=5, n_jobs=-1)
stacked.fit(X_train, y_train)
results(stacked,X_train, X_valid, y_train, y_valid)

0.8695533869539551
0.9859200781313584
0.960205078125
              precision    recall  f1-score   support

           0       0.85      0.33      0.48       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.91      0.66      0.73      4096
weighted avg       0.96      0.96      0.95      4096



In [226]:
stacked.fit(X,y)
testdf = pd.read_csv("test_final.csv")
testdf['f1'].replace(-1,f1_avg, inplace = True)
X_test = testdf.loc[:,'f1':'f24']
test_preds = model_trial.predict_proba(X_test)
sub7stacked = {"Id":testdf['Id'],"Y":test_preds[:,1]}
sub7stacked = pd.DataFrame(data=sub7stacked)
sub7stacked.to_csv("submissions/sub7stacked.csv", index=False)

Balanced bagging log
Using Balanced bagging Improved the recall of minority class although precision took a hit. Used same xgboost params as best trial yet. 
Just making a stacked submissions because why not...no evidence of it being better than simple xgboost. 

DID NOT WORK

## Understanding nature of data with resampling
Goal is to improve precision and recall of minority class (0 class)
Not sure if smote is right for this dataset. Oversampling ruins precision and recall and we are not able to improve on this for now. Need to research further. Try stacking features now.

In [215]:
model_trial = XGBClassifier(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
model_trial.fit(X_train, y_train)
results(model_trial,X_train, X_valid, y_train, y_valid)

0.8809746716351832
0.975665337348417
0.960205078125
              precision    recall  f1-score   support

           0       0.92      0.30      0.45       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.94      0.65      0.72      4096
weighted avg       0.96      0.96      0.95      4096



## Stacking prediction outputs as new feature for xgboost.


In [216]:
preds_train = model_trial.predict(X_train)
preds_train=np.reshape(preds_train,(-1,1))
X_train_add = np.concatenate((X_train,preds_train),1)
model_trial.fit(X_train_add, y_train)
preds_valid = model_trial

XGBClassifier(base_score=0.55, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.011, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1550, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.7, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [217]:
results(model_trial,X_train, X_valid, y_train, y_valid)

0.8431700579250295


ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24'] ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24']
expected f0 in input data

In [228]:
params = {
        'n_estimators' : [1550,1600,1650,1800,2000],
        'learning_rate' : [0.011,0.013,0.02,0.015,0.01],
        'colsample_bylevel':[0.4,0.5,0.6,0.7],
        'colsample_bytree':[0.4,0.3,0.5,0.6,0.7,0.8],
        'min_child_weight':[1,2,3],
        'gamma':[0,1,2],
        'base_score':[0.5,0.6,0.7]
        }

In [229]:
model_tune = XGBClassifier(subsample = 0.7,
                           max_depth=8,scale_pos_weight=0.0578,
                           random_state=42)

In [230]:
folds = 6
param_comb = 9 #number of random parameter combos to pick
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
random_search = RandomizedSearchCV(model_tune, param_distributions=params,
                                   n_iter=param_comb, scoring='roc_auc', 
                                   n_jobs=-1, cv=skf.split(X_train,y_train), 
                                   verbose=3, random_state=42 )
start_time = timer(None) # timing starts from this point for "start_time" variable
tuned = random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 6 folds for each of 9 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  5.8min finished



 Time taken: 0 hours 6 minutes and 19.71 seconds.


In [231]:
results(tuned,X_train, X_valid, y_train, y_valid)


0.8695441632231404
0.9850248229836412
0.944091796875


In [232]:
tuned.best_params_

{'n_estimators': 2000,
 'min_child_weight': 1,
 'learning_rate': 0.02,
 'gamma': 0,
 'colsample_bytree': 0.5,
 'colsample_bylevel': 0.4,
 'base_score': 0.7}

Moving to tuning two file. Conclusion of this session: Tried balanced bagging, SMOTE, stacking, class weighing. Time to take our best XGBoost model and ensemble with LGB