In [29]:
import pandas as pd
import numpy as np
import os
import re

In [30]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import *

In [31]:
def model_metrics(y_pred, y_truth):
    print('Accuracy: {:.4f}'.format(metrics.accuracy_score(y_truth, y_pred)))
    print('Precision: {:.4f}'.format(metrics.precision_score(y_truth, y_pred)))
    print('Recall: {:.4f}'.format(metrics.recall_score(y_truth, y_pred)))
    print('F1: {:.4f}'.format(metrics.f1_score(y_truth, y_pred)))

In [32]:
train_df = pd.read_csv(r"D:/Hackathons/Glass_Quality_Participants_Data/Train.csv")
test_df = pd.read_csv(r"D:/Hackathons/Glass_Quality_Participants_Data/Test.csv")

In [33]:
print("Shape of Train:", train_df.shape)
print("Shape of Test:", test_df.shape)

Shape of Train: (1358, 16)
Shape of Test: (583, 15)


In [6]:
train_df['xmean'] = (train_df['xmin'] + train_df['xmax'])/2
train_df['ymean'] = (train_df['ymin'] + train_df['ymax'])/2
test_df['xmean'] = (test_df['xmin'] + test_df['xmax'])/2
test_df['ymean'] = (test_df['ymin'] + test_df['ymax'])/2

In [34]:
train_df.columns

Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
       'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
       'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
       'x_component_5', 'class'],
      dtype='object')

In [35]:
test_df.columns

Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
       'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
       'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
       'x_component_5'],
      dtype='object')

In [36]:
train_df['class'].value_counts()

1    887
2    471
Name: class, dtype: int64

In [37]:
target = train_df['class']

In [38]:
X = train_df.drop(['class'], axis = 1)
y = train_df['class']

In [39]:
from sklearn import preprocessing
standardized_X = preprocessing.scale(X)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(standardized_X, y, test_size = 0.33)

In [41]:
y_train.value_counts(), 307/602

(1    591
 2    318
 Name: class, dtype: int64, 0.5099667774086378)

In [16]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_sample(X_train, y_train)
print('Original dataset shape %s' % Counter(y_ros))

Using TensorFlow backend.


Original dataset shape Counter({1: 594, 2: 594})


In [44]:
import xgboost as xgb
xgb_IF = xgb.XGBClassifier(scale_pos_weight = 1)
xgb_IF.fit(X_train, y_train)
XGB_Train_Preds = xgb_IF.predict(X_train)
XGB_Test_Preds = xgb_IF.predict(X_test)
print("====Train Metrics====")
print(model_metrics(XGB_Train_Preds, y_train))
print("====Test Metrics====")
print(model_metrics(XGB_Test_Preds, y_test))

====Train Metrics====
Accuracy: 0.8823
Precision: 0.9276
Recall: 0.8883
F1: 0.9075
None
====Test Metrics====
Accuracy: 0.8597
Precision: 0.9267
Recall: 0.8547
F1: 0.8893
None


In [15]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
xgb = xgb.XGBClassifier(objective='binary:logistic')

folds = 5
param_comb = 50

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

params = {
        'base_score':[0.5, 0.6],
        'min_child_weight': [1, 5, 10],
        'gamma': [1, 1.5, 2, 2.5, 3],
        'learning_rate': [0.01,0.1, 0.125],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'max_depth': [6, 8, 12, 22],
        'n_estimators': [100, 150, 250],
        #'scale_pos_weight' :[0.5, 0.6, 1],
        'eval_metric' :['logloss'],
        'reg_alpha': [0, 1], 
        'reg_lambda': [0, 1]
        }

random_search = RandomizedSearchCV(xgb, param_distributions = params, n_iter = param_comb, scoring = 'roc_auc',
                                   cv=skf.split(X_ros, y_ros), verbose=1, random_state=43)

In [16]:
from time import time
start = time()
random_search.fit(X_ros, y_ros)
end = time()
print ("This took %.2f seconds" % (end - start))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  3.0min finished


This took 183.79 seconds


In [17]:
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)


 Best estimator:
XGBClassifier(base_score=0.6, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='logloss',
              gamma=1, learning_rate=0.01, max_delta_step=0, max_depth=22,
              min_child_weight=1, missing=None, n_estimators=150, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=1)

 Best normalized gini score for 5-fold search with 50 parameter combinations:
0.9485538328438405

 Best hyperparameters:
{'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 22, 'learning_rate': 0.01, 'gamma': 1, 'eval_metric': 'logloss', 'colsample_bytree': 0.7, 'base_score': 0.6}


In [18]:
import xgboost as xgb
xgb_IF = xgb.XGBClassifier(subsample = 0.7, n_estimators = 150, min_child_weight = 1, max_depth = 22, 
                        learning_rate = 0.01, gamma = 1, colsample_bytree = 0.7, eval_metric = 'logloss', base_score = 0.6,
                         reg_lambda = 0, reg_alpha = 0)
xgb_IF.fit(X_ros, y_ros)
XGB_Train_Preds = xgb_IF.predict(X_ros)
XGB_Test_Preds = xgb_IF.predict(X_test)
print("====Train Metrics====")
print(model_metrics(XGB_Train_Preds, y_ros))
print("====Test Metrics====")
print(model_metrics(XGB_Test_Preds, y_test))

====Train Metrics====
Accuracy: 0.9925
Precision: 1.0000
Recall: 0.9851
F1: 0.9925
None
====Test Metrics====
Accuracy: 0.8374
Precision: 0.8860
Recall: 0.8516
F1: 0.8685
None


In [45]:
log_loss(XGB_Test_Preds, y_test)

21.000677607815277

In [46]:
print(classification_report(XGB_Test_Preds, y_test, [1,2]))

              precision    recall  f1-score   support

           1       0.85      0.93      0.89       273
           2       0.87      0.76      0.81       176

    accuracy                           0.86       449
   macro avg       0.86      0.84      0.85       449
weighted avg       0.86      0.86      0.86       449



In [20]:
y_test.value_counts()

1    293
2    156
Name: class, dtype: int64

In [21]:
y_train.value_counts()

1    594
2    315
Name: class, dtype: int64

#### Predictions on test

In [47]:
standardized_test = preprocessing.scale(test_df)

In [48]:
Test_Preds = xgb_IF.predict_proba(standardized_test)

In [49]:
Results = pd.DataFrame(Test_Preds, columns=['1', '2'])

In [50]:
Results.to_excel("XGB_Default_OS1.xlsx", index = False)

In [85]:
Results.head()

Unnamed: 0,1,2
0,0.994018,0.005982
1,0.05594,0.94406
2,0.989115,0.010885
3,0.992945,0.007055
4,0.688947,0.311053


In [86]:
# v1 PL: 0.26284 Log Loss: 22.462263228872022 
#xgb_IF = xgb.XGBClassifier(subsample = 0.9, scale_pos_weight = 1, n_estimators = 250, min_child_weight = 1, max_depth = 8, 
#                        learning_rate = 0.1, gamma = 2.5, colsample_bytree = 0.8)
# v2 PL: 0.28173 Log Loss: 23.XXXX


In [None]:
#subsample = 0.9, scale_pos_weight = 1, n_estimators = 250, min_child_weight = 1, max_depth = 8, 
#                        learning_rate = 0.1, gamma = 2, colsample_bytree = 0.9
# 22.077635433857086
# 0.27168

In [None]:
# v4 
# xmean, ymean
#subsample = 0.9, scale_pos_weight = 1, n_estimators = 250, min_child_weight = 1, max_depth = 12, 
#learning_rate = 0.01, gamma = 2, colsample_bytree = 0.8, eval_metric = 'logloss'
# logloss: 21.769933197845145
# LB: 0.31670

In [None]:
# XGB_Scale_PT_v5.xlsx
# Oversampling
# Xmean and ymean
# PL 0.30803

In [None]:
# XGB_Scale_PT_v6.xlsx
# Oversampling
# PL 0.30073
{'subsample': 0.7, 'n_estimators': 250, 'min_child_weight': 1, 'max_depth': 22, 'learning_rate': 0.1, 
 'gamma': 1, 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'base_score': 0.6}

In [None]:
# XGB_Scale_PT_v7.xlsx PL 0.29093
# Xmean and ymean
{'subsample': 0.9, 'scale_pos_weight': 1, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 12, 'learning_rate': 0.125, 'gamma': 3, 
 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'base_score': 0.6}

In [None]:
#XGB_Scale_PT_v8 PPL 0.28471
# Original - no added features
{'subsample': 0.9, 'scale_pos_weight': 1, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 12, 'learning_rate': 0.125, 'gamma': 3,
 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'base_score': 0.6}

# XGB_Scale_PT_v9  PL 0.31666
# Added features, ROS
{'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 22, 'learning_rate': 0.01, 'gamma': 1, 
 'eval_metric': 'logloss', 'colsample_bytree': 0.7, 'base_score': 0.6}