In [75]:
import pandas as pd
import seaborn as sns #For visulisation
import matplotlib.pyplot as plt #For visulisation
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import mutual_info_classif

pd.set_option('display.max_columns', 100) #replace n with the number of columns you want to see completely
pd.set_option('display.max_rows', 100) #replace n with the number of rows you want to see completely

# Read the data
train_data_original = pd.read_csv('./Data/train.csv')
metadata= pd.read_csv('./Data/greeks.csv')
test_data=pd.read_csv('./Data/test.csv')
sample_submission=pd.read_csv('./Data/sample_submission.csv')

In [76]:
import warnings
warnings.filterwarnings('ignore') 

In [77]:
# Isolating Ids and creating train_data
train_data = train_data_original.copy()
ids= train_data.pop('Id')
y=train_data.pop('Class')

#Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
train_data=enc.fit_transform(train_data)

#Split
X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.33, random_state=42)

#Impute
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train=imp_mean.fit_transform(X_train)
X_test=imp_mean.fit_transform(X_test);

#Oversample
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)

#Scaling
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

features=train_data_original.columns.drop(['Id','Class'])
X_train_resampled_scaled_df=pd.DataFrame(columns=features, data=X_train_resampled_scaled)
X_test_scaled_df = pd.DataFrame(columns=features, data=X_test_scaled)

In [78]:
from sklearn.utils import class_weight

def balanced_log_loss(y_true, y_pred):
    class_weights = class_weight.compute_class_weight('balanced', classes = np.unique(y_true), y = y_true)
    weights = class_weights[y_true.astype(int)]
    loss = log_loss(y_true, y_pred, sample_weight=weights)
    return loss

### Baseline model

In [79]:
baseline_model_XGB=XGBClassifier(max_depth=7, eta=0.4,tree_method='exact',scale_pos_weight=scaleposweight)
baseline_model_XGB.fit(X_train_resampled,y_train_resampled)
baseline_XGB_predictions=baseline_model_XGB.predict(X_test)

In [80]:
print(classification_report(y_test, baseline_XGB_predictions, labels=[0,1]))
print(confusion_matrix(y_test,baseline_XGB_predictions))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       161
           1       0.78      0.88      0.83        43

    accuracy                           0.92       204
   macro avg       0.87      0.91      0.89       204
weighted avg       0.93      0.92      0.92       204

[[150  11]
 [  5  38]]


### W/ class weighting&Cross validation

In [81]:
from sklearn.utils import class_weight
weights = class_weight.compute_sample_weight('balanced', y)

In [82]:
from sklearn.model_selection import cross_validate

#Impute
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X=imp_mean.fit_transform(train_data)

scaleposweight= 509/108

XGB_cross_val=XGBClassifier(max_depth=7, eta=0.4)
cross_val = cross_validate(XGB_cross_val, 
                           X, y, 
                           cv=10, 
                           verbose=1,
                           n_jobs=-1, 
                           return_estimator=True, 
                           return_train_score=True,
                           scoring=['balanced_accuracy','f1_weighted']
                          )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.1s finished


In [83]:
cross_val['test_balanced_accuracy']

array([0.81818182, 0.84402852, 0.81818182, 0.86987522, 0.94474153,
       0.85383244, 0.8083779 , 0.79818182, 0.89019608, 0.85      ])

In [84]:
for i in range(10):
    print('Fold %s: Balanced accuracy=%s , weighted f1=%s' % (i+1, round(cross_val['test_balanced_accuracy'][i],3) ,
                                                             round(cross_val['test_f1_weighted'][i],3)))
    print('---------------------------------------------------')
print('Average balanced acc: ',round(cross_val['test_balanced_accuracy'].mean(),3))
print('Average weighted f1: ',round(cross_val['test_f1_weighted'].mean(),3))

Fold 1: Balanced accuracy=0.818 , weighted f1=0.93
---------------------------------------------------
Fold 2: Balanced accuracy=0.844 , weighted f1=0.918
---------------------------------------------------
Fold 3: Balanced accuracy=0.818 , weighted f1=0.93
---------------------------------------------------
Fold 4: Balanced accuracy=0.87 , weighted f1=0.906
---------------------------------------------------
Fold 5: Balanced accuracy=0.945 , weighted f1=0.968
---------------------------------------------------
Fold 6: Balanced accuracy=0.854 , weighted f1=0.933
---------------------------------------------------
Fold 7: Balanced accuracy=0.808 , weighted f1=0.914
---------------------------------------------------
Fold 8: Balanced accuracy=0.798 , weighted f1=0.898
---------------------------------------------------
Fold 9: Balanced accuracy=0.89 , weighted f1=0.95
---------------------------------------------------
Fold 10: Balanced accuracy=0.85 , weighted f1=0.947
-----------------

### With gridsearch param optimisation

In [24]:
parameters = {
    'eta':[0.05,0.1,0.2,0.3,0.4],
    'max_depth':[4,5,6,7],
    'tree_method':['exact','approx','hist'],
    'booster':['gblinear','gbtree'],
    'min_child_weight':[1,10,20,30]
}

XGB_model= XGBClassifier()
clf = GridSearchCV(XGB_model,
                   parameters,
                   scoring='balanced_accuracy',
                   n_jobs=-1)
clf.fit(X_train_resampled,y_train_resampled)

400 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Nicolas\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Nicolas\anaconda3\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\Nicolas\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1516, in fit
    self._Booster = train(
  File "C:\Users\Nicolas\anaconda3\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\Nicolas\anaconda3\lib\site-packages\xgbo

In [25]:
print(clf.best_params_)
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.scorer_)

{'booster': 'gbtree', 'eta': 0.3, 'max_depth': 4, 'min_child_weight': 1, 'tree_method': 'approx'}
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.3,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)
0.951159420289855
make_scorer(balanced_accuracy_score)


In [31]:
best_XGB_model= XGBClassifier(eta=0.3,
                             max_depth=4,
                             min_child_weight=1,
                             tree_method='approx',
                            booster='gbtree'
                             )
best_XGB_model.fit(X_train_resampled,y_train_resampled)
best_predictions=best_XGB_model.predict(X_test)

print(classification_report(y_test, best_predictions, labels=[0,1]))
print(confusion_matrix(y_test,best_predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       161
           1       0.91      0.93      0.92        43

    accuracy                           0.97       204
   macro avg       0.95      0.95      0.95       204
weighted avg       0.97      0.97      0.97       204



### Selecting features based on importance with RFE

In [85]:
from sklearn.feature_selection import RFE

XGB_classifier_RFE=XGBClassifier(eta=0.3,
                             max_depth=4,
                             min_child_weight=1,
                             tree_method='approx',
                            booster='gbtree'
                             )
rfe = RFE(estimator=XGB_classifier_RFE, n_features_to_select=20, step=1, verbose=1)
rfe.fit(X_train_resampled_scaled_df, y_train_resampled)

Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 fe

In [86]:
print(rfe.support_)
best_features=np.where(rfe.support_)[0]

[ True  True False False False False  True False  True False False False
  True False  True False False  True  True False False  True False False
  True  True False False False False False False  True  True  True False
  True False False False  True  True False False False False False False
  True False False  True False False False  True]


In [87]:
X_test_df = pd.DataFrame(columns=features, data=X_test)
X_train_best_features=X_train_resampled_scaled_df.iloc[:, best_features]
X_test_best_features=X_test_df.iloc[:, best_features]

In [88]:
XGB_Classifier=XGBClassifier(eta=0.3,
                             max_depth=4,
                             min_child_weight=1,
                             tree_method='approx',
                            booster='gbtree'
                             )
XGB_Classifier.fit(X_train_best_features,y_train_resampled)
predictions=XGB_Classifier.predict(X_test_best_features)

print(classification_report(y_test, predictions, labels=[0,1]))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.32      0.46       161
           1       0.24      0.81      0.37        43

    accuracy                           0.42       204
   macro avg       0.55      0.57      0.42       204
weighted avg       0.73      0.42      0.44       204

[[ 51 110]
 [  8  35]]


In [89]:
XGB_Classifier.feature_importances_

array([0.10622627, 0.02264898, 0.06031855, 0.13106522, 0.0434755 ,
       0.03708456, 0.03080915, 0.03758005, 0.05917044, 0.02934432,
       0.03059536, 0.14414527, 0.02991094, 0.03429962, 0.02447634,
       0.06047499, 0.0300967 , 0.03671928, 0.02707693, 0.02448163],
      dtype=float32)

### Selecting features based on importance with SelectFromModel

In [90]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score

XGB_Classifier_sfm=XGBClassifier(eta=0.3,
                             max_depth=4,
                             min_child_weight=1,
                             tree_method='approx',
                            booster='gbtree'
                             )
XGB_Classifier_sfm.fit(X_train_resampled_scaled_df, y_train_resampled)

thresholds = np.sort(XGB_Classifier_sfm.feature_importances_)

best_acc=0
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(XGB_Classifier_sfm, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train_resampled)
    # train model
    selection_model = XGBClassifier(eta=0.3,max_depth=4,min_child_weight=1,tree_method='approx',booster='gbtree')
    selection_model.fit(select_X_train, y_train_resampled)
    # eval model
    select_X_test = selection.transform(X_test_df)
    predictions = selection_model.predict(select_X_test)
    accuracy = balanced_accuracy_score(y_test, predictions)
    if accuracy>best_acc:
        best_acc=accuracy
        best_thresh=thresh
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

Thresh=0.000, n=56, Accuracy: 97.28%
Thresh=0.000, n=56, Accuracy: 97.28%
Thresh=0.000, n=54, Accuracy: 97.28%
Thresh=0.001, n=53, Accuracy: 97.28%
Thresh=0.002, n=52, Accuracy: 95.27%
Thresh=0.002, n=51, Accuracy: 96.12%
Thresh=0.002, n=50, Accuracy: 96.43%
Thresh=0.003, n=49, Accuracy: 94.34%
Thresh=0.003, n=48, Accuracy: 90.62%
Thresh=0.004, n=47, Accuracy: 90.93%
Thresh=0.005, n=46, Accuracy: 90.93%
Thresh=0.005, n=45, Accuracy: 90.31%
Thresh=0.006, n=44, Accuracy: 90.00%
Thresh=0.006, n=43, Accuracy: 90.00%
Thresh=0.006, n=42, Accuracy: 90.85%
Thresh=0.007, n=41, Accuracy: 90.00%
Thresh=0.007, n=40, Accuracy: 91.78%
Thresh=0.007, n=39, Accuracy: 92.32%
Thresh=0.007, n=38, Accuracy: 92.32%
Thresh=0.008, n=37, Accuracy: 90.62%
Thresh=0.008, n=36, Accuracy: 88.52%
Thresh=0.009, n=35, Accuracy: 87.90%
Thresh=0.009, n=34, Accuracy: 91.70%
Thresh=0.009, n=33, Accuracy: 93.72%
Thresh=0.010, n=32, Accuracy: 93.17%
Thresh=0.011, n=31, Accuracy: 90.23%
Thresh=0.012, n=30, Accuracy: 89.30%
T

In [91]:
threshold=best_thresh
selection = SelectFromModel(XGB_Classifier_sfm, threshold=threshold, prefit=True)
select_X_train = selection.transform(X_train_resampled)
# train model
selection_model = XGBClassifier(eta=0.3,max_depth=4,min_child_weight=1,tree_method='approx',booster='gbtree')
selection_model.fit(select_X_train, y_train_resampled)
# eval model
select_X_test = selection.transform(X_test_df)
predictions = selection_model.predict(select_X_test)

In [92]:
print('Dimensionality before feature selection:',X_train_resampled.shape)
print('Dimenionality after feature selection:',select_X_train.shape)

Dimensionality before feature selection: (696, 56)
Dimenionality after feature selection: (696, 56)


In [93]:
print(classification_report(y_test, predictions, labels=[0,1]))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       161
           1       0.89      0.98      0.93        43

    accuracy                           0.97       204
   macro avg       0.94      0.97      0.96       204
weighted avg       0.97      0.97      0.97       204

[[156   5]
 [  1  42]]


### Ensemble

In [100]:
def get_ensemble_predictions(base_1, base_2, voting, X, y, test):
    ensemble=VotingClassifier(estimators=[
        ('base_1',base_1), ('base_2',base_2)], voting=voting, n_jobs=-1, verbose=True)
    ensemble.fit(X,y)
    predictions=ensemble.predict(test)
    
    return predictions

#### XGBoost and KNN
Soft voting

In [95]:
from sklearn.neighbors import KNeighborsClassifier
GSparameters_KNN = {
    'n_neighbors':[5,7,8,10],
    'weights':['uniform','distance'],
    'algorithm':['ball_tree','kd_tree','brute'],
    'metric': ['minkowski','manhattan']
}

KNN_GSmodel= KNeighborsClassifier()
clf_KNN = GridSearchCV(KNN_GSmodel,
                   GSparameters_KNN,
                   scoring='balanced_accuracy',
                   n_jobs=-1)
clf_KNN.fit(X_train_resampled,y_train_resampled)
clf_KNN.best_params_

{'algorithm': 'ball_tree',
 'metric': 'manhattan',
 'n_neighbors': 8,
 'weights': 'uniform'}

In [96]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

params_knn = {
    'n_neighbors': 8,
    'weights': 'uniform',
    'algorithm':'ball_tree',
    'n_jobs':-1,
    'metric': 'manhattan'
}

params_XGB={
    'eta':0.3,
    'max_depth':4,
    'min_child_weight':1,
    'tree_method':'approx',
    'booster':'gbtree'
}

KNN_ensemble = KNeighborsClassifier(**params_knn)
XGB_ensemble = XGBClassifier(**params_XGB)
ensemble = VotingClassifier(estimators=[
    ('KNN', KNN_ensemble), ('XGB', XGB_ensemble)], voting='soft', n_jobs=-1, verbose=True)

ensemble = ensemble.fit(select_X_train, y_train_resampled)
predictions_ensemble=ensemble.predict(select_X_test)

In [97]:
print(classification_report(y_test, predictions_ensemble, labels=[0,1]))
print(confusion_matrix(y_test,predictions_ensemble))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97       161
           1       0.82      0.98      0.89        43

    accuracy                           0.95       204
   macro avg       0.91      0.96      0.93       204
weighted avg       0.96      0.95      0.95       204

[[152   9]
 [  1  42]]


#### XGBoost and Adaboost and Catboost

In [110]:
from sklearn.ensemble import AdaBoostClassifier
GSparameters_Ada = {
    'n_estimators':[50,80,100,120],
    'learning_rate':[1.0,2.0,3.0],
}

Adaboost_GS= AdaBoostClassifier()
clf_Ada = GridSearchCV(Adaboost_GS,GSparameters_Ada,
                       refit=True,
                       scoring='balanced_accuracy',
                       verbose=2,
                       n_jobs=-1)
clf_Ada.fit(X_train_resampled,y_train_resampled)
clf_Ada.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


{'learning_rate': 1.0, 'n_estimators': 80}

In [112]:
from sklearn.svm import SVC
GSparameters_SVM = {
    'C':[0.1,1,10,100,1000],
    'gamma':['scale','auto'],
    'kernel':['linear','poly','rbf','sigmoid'],
    'class_weight':['balanced']
}

SVC_GS= SVC()
clf_SVC = GridSearchCV(SVC_GS,GSparameters_SVM,
                       refit=True,
                       scoring='balanced_accuracy',
                       verbose=2,
                       n_jobs=-1)
clf_SVC.fit(X_train_resampled,y_train_resampled)

print(clf_SVC.best_params_)
print(clf_SVC.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


{'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}

In [138]:
from sklearn.utils import class_weight
from catboost import Pool, CatBoostClassifier

params_Ada = {
    'learning_rate': 1.0, 
    'n_estimators': 80
}


base_XGB=XGBClassifier(**params_XGB)
base_Ada=AdaBoostClassifier(**params_Ada)
base_CatBoost=CatBoostClassifier()

ensemble_2 = VotingClassifier(estimators=[
     ('XGB', base_XGB), ('Adaboost', base_Ada), ('CatBoost', base_CatBoost)], voting='soft', n_jobs=-1, verbose=True)

ensemble_2 = ensemble_2.fit(select_X_train, y_train_resampled)
predictions_ensemble=ensemble_2.predict(select_X_test)

In [139]:
print(classification_report(y_test, predictions_ensemble, labels=[0,1]))
print(confusion_matrix(y_test,predictions_ensemble))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       161
           1       0.89      0.98      0.93        43

    accuracy                           0.97       204
   macro avg       0.94      0.97      0.96       204
weighted avg       0.97      0.97      0.97       204

[[156   5]
 [  1  42]]


#### XGBoost and TabFPN


In [None]:
from tabpfn import TabPFNClassifier
base_tabFPN = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)

ensemble_3 = VotingClassifier(estimators=[
     ('XGB', base_XGB), ('TabFPN', base_tabFPN)], voting='soft', n_jobs=-1, verbose=True)

ensemble_3 = ensemble_3.fit(select_X_train, y_train_resampled)
predictions_ensemble=ensemble_3.predict(select_X_test)

TODO: Use the times in metadata like https://www.kaggle.com/code/scipygaurav/icr-improved-tabpfn-xgb-lb-0-11#Loading-data

In [136]:
from datetime import datetime
times = metadata.Epsilon.copy()
times[metadata.Epsilon != 'Unknown'] = metadata.Epsilon[metadata.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[metadata.Epsilon == 'Unknown'] = np.nan

In [137]:
times

0      737137.0
1           NaN
2           NaN
3           NaN
4      737509.0
         ...   
612    737681.0
613    737676.0
614    737264.0
615    737090.0
616         NaN
Name: Epsilon, Length: 617, dtype: object

In [None]:
train_pred_and_time = pd.concat((select_X_train, times), axis=1)
test_predictors = np.array(test_df[predictor_columns])
test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + 
                                     train_pred_and_time.Epsilon.max() + 1), axis=1)