In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix

Best models by h2o:
- GBM 

Best models by auto-sklearn:
- sklearn_classifier: RandomForestClassifier(bootstrap=False, max_features=126, min_samples_leaf=13,
                         min_samples_split=19, n_estimators=512, n_jobs=1,
                         random_state=1, warm_start=True)

- sklearn_classifier: HistGradientBoostingClassifier(early_stopping=True,
                                 l2_regularization=3.609412172481434e-10,
                                 learning_rate=0.05972079854295879, max_iter=512,
                                 max_leaf_nodes=4, min_samples_leaf=2,
                                 n_iter_no_change=14, random_state=1,
                                 validation_fraction=None, warm_start=True)

- sklearn_classifier: RandomForestClassifier(criterion='entropy', max_features=212,
                         min_samples_leaf=9, min_samples_split=18,
                         n_estimators=512, n_jobs=1, random_state=1,
                         warm_start=True)
- ... plus many other RandomForest configurations

- sklearn_classifier: LinearSVC(C=1.1311491877789248, dual=False, intercept_scaling=1.0,
            random_state=1, tol=0.0024931733683028904)

- sklearn_classifier: MLPClassifier(activation='tanh', alpha=1.0207902638359607e-07, beta_1=0.999,
                beta_2=0.9, hidden_layer_sizes=(229,),
                learning_rate_init=0.0005867144594731152, max_iter=256,
                n_iter_no_change=32, random_state=1, validation_fraction=0.0,
                verbose=0, warm_start=True)

Best models by MLJAR:
## Random Forest
- **n_jobs**: -1
- **criterion**: gini
- **max_features**: 1.0
- **min_samples_split**: 40
- **max_depth**: 7
- **eval_metric_name**: accuracy
- **explain_level**: 0


## Extreme Gradient Boosting (Xgboost)
- **n_jobs**: -1
- **objective**: binary:logistic
- **eta**: 0.05
- **max_depth**: 8
- **min_child_weight**: 5
- **subsample**: 0.5
- **colsample_bytree**: 0.8
- **eval_metric**: accuracy
- **explain_level**: 0




In [2]:
# The feature-engineering notebook saved the prepared data as CSV files.
# Two variants exist, differing slightly in their feature engineering
# ("log" and "bin"); both are loaded here so each can be tried below.

df_log, df_val_log = (
    pd.read_csv(csv_path)
    for csv_path in ('.//data//prepared_train_log.csv', './/data//prepared_test_log.csv')
)

df_bin, df_val_bin = (
    pd.read_csv(csv_path)
    for csv_path in ('.//data//prepared_train_bin.csv', './/data//prepared_test_bin.csv')
)

Start with the logarithmic transformation of the billing features.

In [3]:
# Hold out 25% of the log-transformed data for evaluation (fixed seed for
# reproducibility). The target is removed from the feature matrix by name
# rather than by position, so a change in column order can neither leak
# "Transported" into the features nor silently drop a real feature.
X_train, X_test, y_train, y_test = train_test_split(
    df_log.drop(columns="Transported"),
    df_log["Transported"],
    test_size=0.25,
    random_state=11,
)

In [4]:
# First, try XGBoost with the parameters suggested by MLJAR
import xgboost as xgb

def display_scores(scores):
    """Print a collection of scores together with their mean and std.

    Args:
        scores: Sequence or array of numeric scores. It is printed as-is and
            passed to ``np.mean`` / ``np.std``.

    The mean and standard deviation are shown with three decimal places.
    """
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"Scores: {scores}\nMean: {mean_score:.3f}\nStd: {std_score:.3f}")

def report_best_scores(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Args:
        results: Mapping with the keys ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``,
            each indexable by candidate position. Presumably the
            ``cv_results_`` of a scikit-learn search object; confirm
            against the caller.
        n_top: Ranks ``1..n_top`` are reported. Every candidate holding one
            of those ranks is printed, so tied candidates all appear.

    For each reported candidate the rank, mean validation score, its
    standard deviation (both with three decimals) and the parameters are
    printed, followed by an empty line.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that hold exactly this rank.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(
                f"Model with rank: {rank}",
                f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})",
                f"Parameters: {results['params'][idx]}",
                "",
                sep="\n",
            )

In [5]:
# Hyper-parameters taken from the MLJAR XGBoost report listed at the top of
# the notebook. MLJAR lists accuracy as its metric; XGBoost's built-in
# "error" metric is the binary classification error rate (1 - accuracy).
xgb_params = {
    "objective": "binary:logistic",
    "eta": 0.05,
    "max_depth": 8,
    "min_child_weight": 5,
    "subsample": 0.5,
    "colsample_bytree": 0.8,
    "eval_metric": "error",
    "random_state": 11,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the current training split; as the last expression of the cell the
# fitted estimator is also displayed.
xgb_model.fit(X_train, y_train)

In [6]:
# Predict on both splits so the train and hold-out results can be compared.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Report accuracy and the confusion matrix, train split first, then test.
for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print(f'{split_name} accuracy score: {accuracy_score(y_true, y_pred)}')
    print(confusion_matrix(y_true, y_pred))

Train accuracy score: 0.8671575394999232
[[2876  403]
 [ 463 2777]]
Test accuracy score: 0.8072677092916284
[[837 199]
 [220 918]]


Try the same with binned data

In [7]:
# Hold out 25% of the binned data for evaluation (fixed seed for
# reproducibility). The target is removed from the feature matrix by name
# rather than by position, so a change in column order can neither leak
# "Transported" into the features nor silently drop a real feature.
X_train, X_test, y_train, y_test = train_test_split(
    df_bin.drop(columns="Transported"),
    df_bin["Transported"],
    test_size=0.25,
    random_state=11,
)

In [8]:
# Same MLJAR-suggested XGBoost hyper-parameters as listed at the top of the
# notebook, now fitted on the current (binned) training split.
xgb_params = {
    "objective": "binary:logistic",
    "eta": 0.05,
    "max_depth": 8,
    "min_child_weight": 5,
    "subsample": 0.5,
    "colsample_bytree": 0.8,
    "eval_metric": "error",
    "random_state": 11,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# As the last expression of the cell the fitted estimator is also displayed.
xgb_model.fit(X_train, y_train)

In [9]:
# Predict on both splits so the train and hold-out results can be compared.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Report accuracy and the confusion matrix, train split first, then test.
for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print(f'{split_name} accuracy score: {accuracy_score(y_true, y_pred)}')
    print(confusion_matrix(y_true, y_pred))

Train accuracy score: 0.8559595029912563
[[2902  377]
 [ 562 2678]]
Test accuracy score: 0.796228150873965
[[856 180]
 [263 875]]


**TODO:** re-check both datasets with 10-fold cross-validation.

Then try two more models and tune them.