# DSCI-598 Capstone
## Maryville University
### November - December 2023
### Alison Hawke

## XGBoost model parameter tuning

https://www.datacamp.com/tutorial/xgboost-in-python

In [None]:
import numpy as np 
import pandas as pd 
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [None]:
# Kaggle input files for the forest-cover-type competition.
# train.csv supplies the features and the Cover_Type label used below;
# test.csv is scored at the end of the notebook to build the submission.
train_csv_path = '/kaggle/input/forest-cover-type-prediction/train.csv'
test_csv_path = '/kaggle/input/forest-cover-type-prediction/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Features: every column except the row identifier and the target.
X = train.drop(['Id', 'Cover_Type'], axis = 'columns')

# Target: shift the cover-type codes from 1-7 down to 0-6,
# then store them as a pandas categorical.
zero_based_cover_type = train['Cover_Type'] - 1
y = zero_based_cover_type.astype('category')

print("Pre-split training features shape:", X.shape)
print("Pre-split training label shape:", y.shape)

In [None]:
# Hold out part of the labelled data for evaluation.
# No test_size is passed, so scikit-learn's default proportion applies.
# stratify = y keeps the class mix equal in both parts; random_state fixes the shuffle.
split_parts = train_test_split(
    X, y,
    stratify = y,
    random_state = 1,
)
X_train, X_test, y_train, y_test = split_parts

print("Post-split feature shape:", X_train.shape)
print("Post-split label shape:", y_train.shape)

# XGBoost Classifier

In [None]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical = True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical = True)

In [None]:
# Seven cover types
params = {'objective': 'multi:softprob', 'num_class': 7}
n = 100

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round = n,
   nfold = 5,
   metrics = ['mlogloss', 'auc', 'merror'],
)

In [None]:
# Per-round cross-validation metrics. Ending the cell with the frame itself
# (rather than print) lets Jupyter render it as a formatted table.
results

In [None]:
# Names of the metric columns in the cross-validation results
results.columns

In [None]:
# Largest value in the mean test-fold AUC column of the CV results
test_auc_by_round = results['test-auc-mean']
test_auc_by_round.max()

In [None]:
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators': 100, 
          'max_depth': 5, 'learning_rate': 0.1}

xgb_clf = xgb.XGBClassifier(**params)
xgb_clf = xgb_clf.fit(X_train, y_train)

In [None]:
# Optional inspection of the raw outputs (left disabled):
#print(xgb_clf.predict(X_test))
#print(xgb_clf.predict_proba(X_test))

# Score the hold-out split with plain accuracy.
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

## Parameter exploration

params = {'objective': 'multi:softprob', 'num_class': 7}

* Accuracy: 85.45%
* Submission: 0.72084

params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators': 100,
'max_depth': 3, 'learning_rate': 0.1}

* Accuracy: 76.06%

params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.1}

* Accuracy: 86.03%

params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators' :1000,
'max_depth' :5, 'learning_rate' :0.1, 'min_child_weight' :1, 'gamma' :0, 'subsample' :0.8, 'colsample_bytree' :0.8, 'nthread' :4}

* Accuracy: 86.14%

**params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators': 1000, 'max_depth': 30, 'learning_rate': 0.1}**

* Accuracy: 87.20%
* Submission: 0.74676

From the documentation, the [default parameters for XGBClassifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) are:

* max_depth = 3
* learning_rate = 0.1
* n_estimators = 100
* silent = True
* objective = 'binary:logistic'
* booster = 'gbtree'
* n_jobs = 1
* nthread = None
* gamma = 0
* min_child_weight = 1
* max_delta_step = 0
* subsample = 1
* colsample_bytree = 1
* colsample_bylevel = 1
* reg_alpha = 0
* reg_lambda = 1
* scale_pos_weight = 1
* base_score = 0.5
* random_state = 0
* seed = None
* missing = None

## Parameter Tuning

In [None]:
#%%time

# Grid search over XGBClassifier hyperparameters, left disabled: every line
# in this cell is commented out, so running the cell does nothing.
# The entries in param_test look like alternatives that were enabled a few at
# a time. Several have no trailing comma, so add commas between whichever
# entries are re-enabled together.
# NOTE(review): scikit-learn documents 'f1_samples' as a scorer for multilabel
# targets. For this single-label, 7-class target, 'f1_macro', 'f1_weighted'
# or 'accuracy' is probably what is wanted -- confirm before re-running.

#param_test = {
    #'n_estimators': [100, 500, 1000, 5000],
    #'max_depth': range(3, 10, 2),
    #'min_child_weight': range(1, 6, 2)
    #'max_depth': [2, 3, 4],
    #'min_child_weight': [0, 1, 2]
    #'gamma': [i/10.0 for i in range(0, 5)]
    #'subsample': [i/10.0 for i in range(6, 10)],
    #'colsample_bytree': [i/10.0 for i in range(6, 10)]
    #'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
    #'learning_rate': [0.1, 0.01, 0.001]     
#}

#gsearch = GridSearchCV(estimator = XGBClassifier(objective = 'multi:softprob', num_class = 7, 
#                                                 nthread = 4, seed = 1, n_estimators = 1000, 
#                                                 max_depth = 2, min_child_weight = 0,
#                                                gamma = 0), 
#                       param_grid = param_test, scoring = 'f1_samples', n_jobs = 4, cv = 5)

#gsearch.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the (disabled) grid search.
# The search object is created under the name `gsearch`, so that name is used here.
#gsearch.best_params_

In [None]:
# Mean cross-validated score of the best combination from the (disabled) grid search.
# The search object is created under the name `gsearch`, so that name is used here.
#gsearch.best_score_

In [None]:
# Full per-candidate cross-validation results of the (disabled) grid search.
#gsearch.cv_results_

## Final XGBoost model with the best parameters:


In [None]:
# Candidate 1: the parameter set obtained using scorer 'f1-samples'.
best_params = dict(
    objective = 'multi:softprob',
    num_class = 7,
    max_depth = 2,
    min_child_weight = 0,
    gamma = 0,
    colsample_bytree = 0.6,
    subsample = 0.6,
    learning_rate = 0.1,
    n_estimators = 100,
    seed = 1,
)

# fit() returns the fitted estimator, so construction and training chain.
best_xgb_clf = xgb.XGBClassifier(**best_params).fit(X_train, y_train)

In [None]:
# Hold-out accuracy of the model fitted in the previous cell.
y_pred = best_xgb_clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
accuracy_pct = accuracy * 100.0

print('Using scorer f1 samples')
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Candidate 2: the parameter set obtained using scorer 'accuracy'.
# This rebinds best_params / best_xgb_clf, so it is the model that the
# submission cells below use.
best_params = dict(
    objective = 'multi:softprob',
    num_class = 7,
    max_depth = 7,
    min_child_weight = 1,
    gamma = 0,
    n_estimators = 1500,
    seed = 1,
)

# fit() returns the fitted estimator, so construction and training chain.
best_xgb_clf = xgb.XGBClassifier(**best_params).fit(X_train, y_train)

In [None]:
# Hold-out accuracy of the model fitted in the previous cell.
y_pred = best_xgb_clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
accuracy_pct = accuracy * 100.0

print('Using scorer accuracy')
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 80.98% before tuning

Accuracy: 87.20% after tuning

# Submission

In [None]:
# Features for the competition test set: drop the Id column, which was
# also excluded from the training features.
submission_data = test.drop('Id', axis = 'columns')

print("Submission data features shape:", submission_data.shape)

In [None]:
# Predict a class for every test row with the most recently fitted
# best_xgb_clf. Labels are on the 0-6 coding the model was trained with.
submission_predictions = (
    best_xgb_clf
    .predict(submission_data)
)

In [None]:
# Assemble the submission: one row per test Id with its predicted cover type.
df = pd.DataFrame({
    'Id': test['Id'],
    'Cover_Type': submission_predictions,
})

# The model was trained on labels shifted to 0-6, so convert back to the
# original 1-7 range (vectorised rather than a per-element apply).
df['Cover_Type'] = df['Cover_Type'] + 1

# index = False is the documented way to keep the row index out of the CSV.
df.to_csv('submission.csv', index = False)

### Submission score: 0.73775