
### The data paths in this file are set up for running on a local machine
### The same paths were changed when the notebook was run on Kaggle
### To run this file, you will need to change the paths accordingly







```
For choosing best set of hyperparameters:
1. train_median_with_cols.csv
2. test_median_with_cols.csv
the above data sets were used (have some missing values)

For final training and predictions on Kaggle test set:
1. final_training_set.csv
2. final_testing_set.csv
the above data sets were used (have imputed values)
```



In [1]:
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
import numpy as np
import pandas as pd
import sklearn
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_validation, metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import f1_score, make_scorer


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the Drive-based CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the two CSVs used for the hyperparameter search (train first, then test).
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../content/drive/My Drive/Machine Learning Course Project/data/train_median_with_cols.csv',
        '../content/drive/My Drive/Machine Learning Course Project/data/test_median_with_cols.csv',
    )
)

In [4]:
# Shift every training label down by one.
# NOTE(review): presumably the raw labels are 1..4 and this makes them 0..3 to
# suit 'num_class': 4 used below -- confirm.
# Must remember to change this back later. Re-running this cell subtracts one again.
train_set['Target'] = train_set['Target'].sub(1)

In [5]:
# Dimensions of both frames
print('Shape of train: ', train_set.shape)
print('Shape of test: ', test_set.shape)

# How many columns contain at least one NaN (this counts columns, not individual cells)
print('Number of NaN values in train:', int(train_set.isna().any().sum()))
print('Number of NaN values in test: ', int(test_set.isna().any().sum()))
# The one column with missing values in test is the Target column, so that is okay

Shape of train:  (2973, 231)
Shape of test:  (7334, 231)
Number of NaN values in train: 0
Number of NaN values in test:  1


## Deciding Hyperparameters

Steps we will follow:

1. Fix certain parameters, like the learning rate and the number of trees, and vary the tree-specific parameter max_depth
2. Fix certain parameters, like the learning rate and the number of trees, and vary the tree-specific parameter min_child_weight

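NOTE: The xgb API does not provide an F1 macro score as an evaluation metric (its training log reports `merror`), so the cross-validation below is scored through scikit-learn's `cross_val_score` with a macro F1 scorer built by `make_scorer`, not with the xgb API's own metric.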

In [6]:
# Macro-averaged F1 wrapped as a scikit-learn scorer (larger values are better).
scorer = make_scorer(
    f1_score,
    greater_is_better=True,
    average='macro',
)

In [7]:
# define number of trees to consider
def get_models_vary_no_trees(max_depths=(5, 10), n_estimators_grid=(10, 20, 30, 40, 50)):
    """Build one XGBClassifier per (max_depth, n_estimators) combination.

    Parameters
    ----------
    max_depths : iterable of int
        Tree depths to try. Defaults to the two depths originally hard-coded.
    n_estimators_grid : iterable of int
        Numbers of trees to try for every depth. Defaults to the five values
        originally hard-coded.

    Returns
    -------
    dict
        Maps a label of the form 'max_depth: D  n_estimators: N' to an
        unfitted classifier. Entries are inserted depth by depth, and within
        each depth in the order of ``n_estimators_grid``.
    """
    # Settings shared by every candidate; only max_depth and n_estimators vary.
    base_parameters = {
        'learning_rate': 0.1,
        'silent': 1,
        'objective': 'multi:softmax',
        'min_child_weight': 2,
        'num_class': 4,
    }

    models = dict()
    for max_depth in max_depths:
        for n_estimators in n_estimators_grid:
            parameters = dict(base_parameters, max_depth=max_depth, n_estimators=n_estimators)
            label = 'max_depth: ' + str(max_depth) + '  n_estimators: ' + str(n_estimators)
            # Same fixed seed for every candidate so they are compared on equal terms.
            models[label] = xgb.XGBClassifier(random_state=217, **parameters)
    return models


In [8]:
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    The folds come from a 7-way StratifiedKFold without shuffling, scoring
    uses the notebook-level ``scorer``, and the work is dispatched with
    ``n_jobs=-1``.
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring=scorer,
        cv=StratifiedKFold(n_splits=7, shuffle=False),
        n_jobs=-1,
    )

In [9]:
# Keep only the training rows that carry a label, and take the features and the
# labels from that same subset so they always line up row for row.
labelled_rows = train_set['Target'].notnull()
labels = train_set.loc[labelled_rows, 'Target']
train = train_set[labelled_rows].drop(columns = ['Id', 'idhogar', 'Unnamed: 0', 'Target']).copy()
# The test rows used here are the ones without a label; identifier columns are dropped as well.
test = test_set[test_set['Target'].isnull()].drop(columns = ['Id', 'idhogar', 'Target', 'Unnamed: 0']).copy()

In [None]:
models = get_models_vary_no_trees()
results = []
names = []
for name, model in models.items():
    # Cross-validate this configuration
    scores = evaluate_model(model, train, labels)
    # Keep the per-fold scores alongside the configuration label
    results.append(scores)
    names.append(name)
    # Report mean and standard deviation of the fold scores as we go
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>max_depth: 5  n_estimators: 10 0.276 (0.035)
>max_depth: 5  n_estimators: 20 0.269 (0.035)
>max_depth: 5  n_estimators: 30 0.253 (0.034)
>max_depth: 5  n_estimators: 40 0.233 (0.025)
>max_depth: 5  n_estimators: 50 0.214 (0.029)
>max_depth: 10  n_estimators: 10 0.269 (0.034)
>max_depth: 10  n_estimators: 20 0.245 (0.038)
>max_depth: 10  n_estimators: 30 0.252 (0.039)
>max_depth: 10  n_estimators: 40 0.242 (0.038)
>max_depth: 10  n_estimators: 50 0.231 (0.026)


Optimum was found to be max_depth = 5, n_estimators = 10

In [None]:
# Configuration selected from the search above.
# See model_training_xgboost, which repeats the same values.
opt_parameters = {
    'max_depth': 5,
    'learning_rate': 0.1,
    'silent': 1,
    "n_estimators": 10,
    'objective': 'multi:softmax',
    'min_child_weight': 2,
    'num_class': 4,
}
model = xgb.XGBClassifier(random_state=217, **opt_parameters)

In [13]:
def readCSV(data_dir='../content/drive/My Drive/Machine Learning Course Project/data'):
    """Load the final training and testing sets and strip the non-feature columns.

    Parameters
    ----------
    data_dir : str
        Directory that holds 'final_training_set.csv' and
        'final_testing_set.csv'. Defaults to the Drive location this notebook
        was run with; pass another directory when running elsewhere.

    Returns
    -------
    (train, test)
        ``train`` has its first column, 'idhogar' and 'Id' removed ('Target'
        is kept). ``test`` has its first column, 'idhogar', 'Id' and 'Target'
        removed.
    """
    train = pd.read_csv('{}/{}'.format(data_dir, 'final_training_set.csv'))
    test = pd.read_csv('{}/{}'.format(data_dir, 'final_testing_set.csv'))

    # The first column of each file is dropped by position.
    # NOTE(review): presumably this is the index column written by an earlier to_csv -- confirm.
    train = train.drop(columns=train.columns[0]).drop(columns=['idhogar', 'Id'])
    test = test.drop(columns=test.columns[0]).drop(columns=['idhogar', 'Id', 'Target'])
    return train, test

In [14]:
def model_training_xgboost(train,test):
    """Fit an XGBoost classifier on each of 7 stratified folds and predict ``test`` after every fit.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus a 'Target' column. The frame is left unmodified.
    test : pandas.DataFrame
        Rows to predict. Presumably has the same feature columns as ``train``
        without 'Target' -- verify against the caller.

    Returns
    -------
    (predicts_result, clf)
        ``predicts_result`` is a list with one prediction array per fold.
        ``clf`` is the single classifier object, in the state left by the
        last fold's fit.
    """
    y = train['Target']
    # Build the feature matrix as a new frame rather than dropping in place, so the
    # caller's DataFrame keeps its 'Target' column and the function can be called again.
    features = train.drop(columns=['Target'])

    # Same parameter values as the configuration chosen in the hyperparameter search.
    opt_parameters = {'max_depth':5, 'learning_rate':0.1, 'silent':1, "n_estimators": 10, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4}
    # Earlier parameter sets, kept for reference:
#     opt_parameters = opt_parameters_4 = {'max_depth':35, 'learning_rate':0.13, 'silent':1, "n_estimators": 350, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.88, 'reg_lambda': 0.325 }
#     opt_parameters = {'colsample_bytree': 0.88, 'min_child_samples': 90, 'num_leaves': 25, 'subsample': 0.94, 'reg_lambda': 0.5, }
    clf = xgb.XGBClassifier(random_state=217, **opt_parameters)

    kfold = 7
    # The folds are shuffled, so seed the splitter (same seed as the classifier)
    # to get the same folds, and therefore the same predictions, on every run.
    kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=217)

    predicts_result = []
    for train_index, val_index in kf.split(features, y):
        print("###")
        X_train, X_val = features.iloc[train_index], features.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        # The same classifier object is fitted again on every fold; the held-out
        # part of the fold is passed as the evaluation set.
        clf.fit(X_train, y_train, eval_set=[(X_val, y_val)],early_stopping_rounds=100)
        predicts_result.append(clf.predict(test))

    return (predicts_result,clf)

In [15]:
train, test = readCSV()

predicts_result,clf = model_training_xgboost(train,test)
# Re-read the testing set to recover 'idhogar', which readCSV drops.
# The predictions are attached by row position.
results = pd.read_csv('../content/drive/My Drive/Machine Learning Course Project/data/final_testing_set.csv')
# Combine the per-fold predictions by averaging the predicted labels and rounding.
# NOTE(review): this averages class labels rather than taking a majority vote, so the
# result can be a class that no single fold predicted -- confirm this is intended.
results['Target'] = np.array(predicts_result).mean(axis=0).round().astype(int)
results = results[['idhogar','Target']].copy()
results.to_csv('intermediate.csv', index = False)
# Attach the predictions to every Id in the original test file through 'idhogar'.
test = pd.read_csv('../content/drive/My Drive/Machine Learning Course Project/data/test.csv')
test = test[['Id','idhogar']].copy()
submission = pd.merge(test,results, on='idhogar', how='outer')
# Fill only the Target column: rows of test.csv with no matching prediction get class 4.
# Filling the whole frame would also write 4 into a missing 'Id' for any row
# that exists only in the predictions.
# NOTE(review): 4 is presumably the chosen fallback class -- confirm.
submission['Target'] = submission['Target'].fillna(4)
submission = submission.drop(columns='idhogar')
submission = submission.astype({'Target': int})
submission.to_csv('submission.csv', index = False)

###
[0]	validation_0-merror:0.338824
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.331765
[2]	validation_0-merror:0.343529
[3]	validation_0-merror:0.315294
[4]	validation_0-merror:0.334118
[5]	validation_0-merror:0.32
[6]	validation_0-merror:0.317647
[7]	validation_0-merror:0.312941
[8]	validation_0-merror:0.312941
[9]	validation_0-merror:0.317647
###
[0]	validation_0-merror:0.324706
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.317647
[2]	validation_0-merror:0.334118
[3]	validation_0-merror:0.32
[4]	validation_0-merror:0.317647
[5]	validation_0-merror:0.324706
[6]	validation_0-merror:0.327059
[7]	validation_0-merror:0.329412
[8]	validation_0-merror:0.322353
[9]	validation_0-merror:0.331765
###
[0]	validation_0-merror:0.352941
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.355294
[2]	validation_0-merror:0.362353
[3]	validation_0-merror:0.348235


7