In [3]:
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn import decomposition
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV  
import imblearn
from imblearn.over_sampling import RandomOverSampler
from matplotlib import pyplot as plt

import os

Training Data

In [6]:
data = pd.read_csv('train_sample.csv')
from sklearn.model_selection import train_test_split
train_X = data.drop(["target"], axis=1)

le = LabelEncoder()
le.fit(data["target"])
train_y = le.transform(data["target"])

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(train_X.values, train_y):
    X_train = train_X.values[train_index]
    X_val = train_X.values[test_index]

    y_train = train_y[train_index]
    y_val = train_y[test_index]

Check Nulls

In [8]:
data.describe()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
count,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,...,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0,41244.0
mean,0.638275,0.682693,0.765663,0.738532,1.269276,0.139099,0.64019,0.592401,0.273373,0.320313,...,0.261881,0.639148,0.135026,0.538818,0.403792,1.062555,0.33103,0.582339,0.272476,3.836219
std,0.801255,3.29115,2.947488,3.254194,3.05117,0.946615,0.800226,1.600263,1.23747,1.095553,...,1.437176,0.802525,1.225186,1.912552,1.550054,2.47678,1.191886,1.353767,0.767428,2.507611
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
75%,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0
max,6.0,105.0,102.0,80.0,38.0,30.0,7.0,39.0,51.0,30.0,...,40.0,6.0,87.0,52.0,61.0,149.0,31.0,36.0,22.0,8.0


In [9]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_sample(X_train, y_train)

unique, counts = np.unique(y_ros, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 8560]
 [   1 8560]
 [   2 8560]
 [   3 8560]
 [   4 8560]
 [   5 8560]
 [   6 8560]
 [   7 8560]
 [   8 8560]]


Now Scale

In [10]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

Test Data

In [11]:
test_data = pd.read_csv('test_sample.csv')
test_X = test_data.drop(["id"], axis=1)
scaler_all = StandardScaler()
train_X_scaled = scaler_all.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

XGBoost

In [12]:
xgb = XGBClassifier()
xgb.fit(X_train_scaled, y_train)
preds = xgb.predict_proba(X_val_scaled)
score = log_loss(y_val, preds)
print("test data log loss eval : {}".format(log_loss(y_val,preds)))

test data log loss eval : 0.513554785143517


In [13]:
xgb.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)>

Fit and Tune

In [15]:
from sklearn.model_selection import GridSearchCV

"""
param_test = {
    'n_estimators': [300],
    'n_jobs': [4], #Number of jobs to run in parallel. -1 means using all processors
}
gsearch = GridSearchCV(estimator = XGBClassifier(), param_grid = param_test, scoring='neg_log_loss', n_jobs=-1,iid=False, cv=3,verbose=1, return_train_score=True)
gsearch.fit(X_train_scaled,y_train)
pd.DataFrame(gsearch.cv_results_)
"""

"\nparam_test = {\n    'n_estimators': [300],\n    'n_jobs': [4], #Number of jobs to run in parallel. -1 means using all processors\n}\ngsearch = GridSearchCV(estimator = XGBClassifier(), param_grid = param_test, scoring='neg_log_loss', n_jobs=-1,iid=False, cv=3,verbose=1, return_train_score=True)\ngsearch.fit(X_train_scaled,y_train)\npd.DataFrame(gsearch.cv_results_)\n"

In [19]:
scores = []
n_estimators = [290,300,310,320,330,340]

for nes in n_estimators:
    xgb = XGBClassifier(learning_rate =0.1, n_estimators=nes, max_depth=7, min_child_weight=3, subsample=0.8, 
                             colsample_bytree=0.8, nthread=4, seed=42, objective='multi:softprob')
    xgb.fit(X_train_scaled, y_train)
    preds = xgb.predict_proba(X_val_scaled)
    score = log_loss(y_val, preds)
    scores.append(score)
    print("test data log loss eval : {}".format(log_loss(y_val,preds)))

test data log loss eval : 0.4975808885703289
test data log loss eval : 0.49753544251501375
test data log loss eval : 0.49723677873658334


KeyboardInterrupt: 

stopped the code and picked the minimum: 300

In [21]:
scores_md = []
max_depths = [6,7,8,10]

for md in max_depths:
    xgb = XGBClassifier(learning_rate =0.1, n_estimators=300, 
                        max_depth=md, min_child_weight=3, subsample=0.8, 
                        colsample_bytree=0.8, nthread=4, seed=42, objective='multi:softprob')
    xgb.fit(X_train_scaled, y_train)
    preds = xgb.predict_proba(X_val_scaled)
    score = log_loss(y_val, preds)
    scores_md.append(score)
    print("test data log loss eval : {}".format(log_loss(y_val,preds)))

test data log loss eval : 0.501659258575704
test data log loss eval : 0.49753544251501375
test data log loss eval : 0.49578044516260555
test data log loss eval : 0.4992140602891157


In [22]:
xgb = XGBClassifier(learning_rate =0.1, n_estimators=300, 
                        max_depth=8, min_child_weight=3, subsample=0.8, 
                        colsample_bytree=0.8, nthread=4, seed=42, objective='multi:softprob')
my_model = CalibratedClassifierCV(xgb, cv=5, method='isotonic')
my_model.fit(train_X_scaled,train_y)
test_preds = my_model.predict_proba(test_X_scaled)
output = pd.DataFrame(test_preds,columns=["Class_"+str(i) for i in range(1,10)])
output.insert(loc=0, column='id', value=test_data.id)
output.to_csv('submission_rg_ml.csv', index=False)