In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [62]:
def average(df):
    sum = 0
    count = 0
    for x in df['f1']:
        if x!=-1:
            sum+=x
            count+=1
    avg = sum/count
    return avg


In [63]:
def results(model, X_train, X_valid,y_train, y_valid):
    soft_probs = model.predict_proba(X_valid)
    valid_auc = roc_auc_score(y_valid, soft_probs[:,1])
    print(valid_auc)
    train_preds = model.predict(X_train)
    acc = accuracy_score(y_train, train_preds)
    print(acc)
    preds = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    print(acc)
    print(classification_report(y_valid,preds))

In [64]:
#Timer function
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [65]:
df = pd.read_csv("train_final.csv")
df.describe()
f1_avg = average(df)
df['f1'].replace(-1,f1_avg, inplace = True)
df.describe()

Unnamed: 0,Id,Y,f1,f2,f3,f4,f5,f6,f7,f8,...,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24
count,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,...,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0
mean,8192.0,0.942135,43031.41572,1.044375,11.770938,118323.581456,1.044436,0.050052,117089.674113,169730.1786,...,25894.316914,119045.099005,184622.040835,1.047305,125959.667765,1.044558,1.045718,1.041934,32718.9,1.043948
std,4729.509065,0.233495,33596.053696,0.264806,353.187115,4518.059755,0.265601,0.293892,10261.29297,69396.677853,...,36086.993946,18321.987129,100590.811845,0.306239,31091.344158,0.262576,0.266874,0.246597,3184929.0,0.25964
min,1.0,0.0,37.0,1.0,1.77,23779.0,1.0,0.0,4292.0,4673.0,...,25.0,4674.0,3130.0,1.0,117879.0,1.0,1.0,1.0,1.0,1.0
25%,4096.5,1.0,20331.0,1.0,1.77,118096.0,1.0,0.0,117961.0,117906.0,...,4554.0,118395.0,118398.0,1.0,118274.0,1.0,1.0,1.0,1.0,1.0
50%,8192.0,1.0,35530.0,1.0,1.77,118300.0,1.0,0.0,117961.0,128130.0,...,13234.0,118929.0,119095.0,1.0,118568.0,1.0,1.0,1.0,2.0,1.0
75%,12287.5,1.0,74240.5,1.0,3.54,118386.0,1.0,0.0,117961.0,234498.5,...,38902.0,120539.0,290919.0,1.0,120006.0,1.0,1.0,1.0,9.0,1.0
max,16383.0,1.0,312152.0,7.0,43910.16,286791.0,9.0,10.0,311178.0,311867.0,...,311696.0,286792.0,308574.0,18.0,311867.0,8.0,8.0,7.0,404288600.0,8.0


In [66]:
y = df.loc[:,'Y']
X = df.loc[:,'f1':'f24']
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size = 0.75, test_size = 0.25,random_state = 42, shuffle = True)

Best submitted params:
(colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)
  (colsample_bytree=0.4,subsample = 1,max_depth=7,
                            n_estimators=1350, learning_rate =0.012,
                            colsample_bylevel=0.6,n_jobs=-1,base_score = 0.55,
                            random_state=42)      

In [21]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42,k_neighbors=40)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

In [22]:
model_trial = XGBClassifier (colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=155, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,)  
model_trial.fit(X_train_smote,y_train_smote)
results(model_trial,X_train_smote, X_valid, y_train_smote, y_valid)

0.8468272671930342
0.9844763469687797
0.948486328125
              precision    recall  f1-score   support

           0       0.54      0.37      0.44       224
           1       0.96      0.98      0.97      3872

    accuracy                           0.95      4096
   macro avg       0.75      0.67      0.71      4096
weighted avg       0.94      0.95      0.94      4096



In [23]:
#try Near Miss undersampling
from imblearn.under_sampling import NearMiss


In [42]:
model_nearmiss = XGBClassifier (colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,) 
nm = NearMiss()
X_train_res,y_train_res = nm.fit_resample(X_train,y_train)
model_nearmiss.fit(X_train_res,y_train_res)
results(model_nearmiss,X_train_res, X_valid, y_train_res, y_valid)

0.6545124797077922
0.9903314917127072
0.2666015625
              precision    recall  f1-score   support

           0       0.07      0.93      0.12       224
           1       0.98      0.23      0.37      3872

    accuracy                           0.27      4096
   macro avg       0.52      0.58      0.25      4096
weighted avg       0.93      0.27      0.36      4096



Recall for minority class is highest here. Maybe try combining this with XGBoost preds??

In [53]:
model_trial = XGBClassifier (colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.65,
                            random_state=42,)
model_trial.fit(X_train,y_train)
preds1 = model_nearmiss.predict_proba(X_valid)
preds2 = model_trial.predict_proba(X_valid)
a=0.03
b=0.96
preds = a*preds1+b*preds2
valid_auc = roc_auc_score(y_valid, preds[:,1])
print(valid_auc)

0.8835976700855963


In [29]:
#Just testing stacking two similar best models...not an improvement.

model_trial = XGBClassifier (colsample_bytree=0.3,subsample = 0.7,max_depth=8,
                            n_estimators=1550, learning_rate =0.011,
                            colsample_bylevel=0.5,n_jobs=-1,base_score = 0.55,
                            random_state=42,) 
model_trail_other = XGBClassifier (colsample_bytree=0.4,subsample = 1,max_depth=7,
                            n_estimators=1350, learning_rate =0.012,
                            colsample_bylevel=0.6,n_jobs=-1,base_score = 0.55,
                            random_state=42)  
estimators = [
     ('simple_xgb',model_trial ),
     ('nearmiss',model_nearmiss ),
 ]
stacked = StackingClassifier(estimators = estimators,
                              cv=5, n_jobs=-1)
stacked.fit(X_train, y_train)
results(stacked,X_train, X_valid, y_train, y_valid)

0.8771387525826446
0.9868967201106861
0.96044921875
              precision    recall  f1-score   support

           0       0.87      0.33      0.47       224
           1       0.96      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.92      0.66      0.73      4096
weighted avg       0.96      0.96      0.95      4096



In [30]:
#Try RUSBoostCLassifier

In [67]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

In [68]:

base_estimator = AdaBoostClassifier(n_estimators = 10)
model_ada = RUSBoostClassifier(base_estimator=base_estimator,n_estimators = 50,learning_rate=1,
                               replacement=False,random_state=42)
model_ada.fit(X_train, y_train)
results(model_ada,X_train, X_valid, y_train, y_valid)

0.8191099560950413
0.7622690648653048
0.752197265625
              precision    recall  f1-score   support

           0       0.14      0.69      0.23       224
           1       0.98      0.76      0.85      3872

    accuracy                           0.75      4096
   macro avg       0.56      0.72      0.54      4096
weighted avg       0.93      0.75      0.82      4096

