# DSCI-598 Capstone
## Maryville University
### November - December 2023
### Alison Hawke

## Feature Engineering

In [None]:
import numpy as np 
import pandas as pd 
import xgboost as xgb
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy import stats
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Optional: hide warning messages (e.g. FutureWarning).
# The filter below is currently commented out, so warnings are still displayed.

import warnings
#warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training and test CSV files into raw DataFrames.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/forest-cover-type-prediction/train.csv',
        '/kaggle/input/forest-cover-type-prediction/test.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data
train_raw.head()

# Feature engineering

To decide which features to add or combine, I used a correlation matrix, which shows how strongly each pair of features is correlated.

In [None]:
# Work on copies of the raw frames. Plain assignment would only create a second
# name for the same object, so columns added to `train`/`test` in place would
# also appear in `train_raw`/`test_raw`.
train = train_raw.copy()
test = test_raw.copy()

print('Training features shape:', train.shape)
print('Test features shape:', test.shape)

In [None]:
# Pairwise correlation between all columns of the training frame
corrMatrix = train.corr()

# Keep only the strict upper triangle so that every feature pair is listed once
# and self-correlations (always 1.0) are excluded. De-duplicating on the
# correlation value instead would also drop distinct pairs that happen to tie.
upperTriangle = np.triu(np.ones(corrMatrix.shape, dtype = bool), k = 1)
corrAbs = corrMatrix.abs().where(upperTriangle).unstack().dropna()
corrSorted = corrAbs.sort_values(ascending = False)

# Show the 50 most strongly correlated pairs
corrSorted.head(50)

The following output shows the data type of each feature in the training set.

In [None]:
# List the dtype of every column in the training frame
train.dtypes

## Direct distance to hydrology

Combine the horizontal and vertical distances to hydrology into a single straight-line (Euclidean) distance: $\sqrt{\text{horizontal}^2 + \text{vertical}^2}$.

In [None]:
def _distance_to_hydrology(frame):
    """Return the straight-line distance to hydrology for each row of `frame`.

    Combines the horizontal and vertical distance columns as
    sqrt(horizontal**2 + vertical**2).
    """
    return np.sqrt(
        frame['Horizontal_Distance_To_Hydrology']**2
        + frame['Vertical_Distance_To_Hydrology']**2
    )


# assign() returns a new frame instead of adding the column in place, so the
# frame that `train`/`test` pointed to before this cell is not modified.
# The column name is spelled 'Distance_to_Hydrology' (previously misspelled).
train = train.assign(Distance_to_Hydrology = _distance_to_hydrology)
test = test.assign(Distance_to_Hydrology = _distance_to_hydrology)

## Remove soil types 7 and 15

These features have no entries in the training set, and therefore will not be useful in predicting the test set.

In [None]:
# Soil-type indicator columns to discard from both data sets
empty_soil_columns = ['Soil_Type7', 'Soil_Type15']

# 'Id' is dropped from the training frame only; the test frame keeps it
# because later cells read test['Id'].
train = train.drop(columns = ['Id'] + empty_soil_columns)
test = test.drop(columns = empty_soil_columns)

## Remove outliers

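The following code filters out records containing values that are more than 3 standard deviations away from the mean for that column, which is intended to remove data that could distort the training set. Note that the filtered result is not assigned back to `train` or `test`, so the later cells still use the full data sets.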

This step did not improve the score of the final model.

In [None]:
# Flag the training rows in which every column lies within 3 standard
# deviations of that column's mean.
within_3_sigma = (np.abs(stats.zscore(train)) < 3).all(axis = 1)

# The filtered frame is kept under its own name rather than discarded, and
# `train` itself is left unchanged, so the models below still see every row.
# The test frame is not filtered: the submission cell predicts every row of `test`.
train_no_outliers = train[within_3_sigma]

print('Training rows within 3 standard deviations:', len(train_no_outliers), 'of', len(train))

## Combining Hillshade features

The three Hillshade features (9am, noon and 3pm) are correlated with each other, so they are summed into a single feature that could improve model accuracy.

In [None]:
# Add a 'Hillshade' column holding the sum of the three hillshade readings,
# computed the same way for the training and the test frame.
for hillshade_frame in (train, test):
    hillshade_frame['Hillshade'] = (
        hillshade_frame['Hillshade_9am']
        + hillshade_frame['Hillshade_3pm']
        + hillshade_frame['Hillshade_Noon']
    )

## Binning the Elevation feature

Elevation is the most important feature in predicting the cover type. Grouping it into bins (each 50 units wide in the code below) could improve accuracy.

In [None]:
# Width of one elevation bin, in the same units as the 'Elevation' column
elevation_bin_width = 50.0

# Bin index = floor(elevation / bin width). Computed as a vectorised column
# operation instead of a Python-level loop over every row; the resulting
# integer values are the same.
train['binned_elev'] = np.floor(train['Elevation'] / elevation_bin_width).astype('int64')
test['binned_elev'] = np.floor(test['Elevation'] / elevation_bin_width).astype('int64')

## Combining the elevation and distance to fire points features

Creating a new feature as the sum of the elevation and the horizontal distance to fire points.

In [None]:
# New column: elevation plus horizontal distance to fire points, added to
# both the training and the test frame.
fire_points_column = 'Horizontal_Distance_To_Fire_Points'

train['Elevation_Fire_Points'] = train['Elevation'].add(train[fire_points_column])
test['Elevation_Fire_Points'] = test['Elevation'].add(test[fire_points_column])

## Combining distance to road and distance to fire points features

Creating a new feature with the distance to roadways and distance to fire points.

In [None]:
# New column: horizontal distance to roadways plus horizontal distance to
# fire points, computed identically for the training and the test frame.
for distance_frame in (train, test):
    distance_frame['Road_plus_Fire'] = (
        distance_frame['Horizontal_Distance_To_Roadways']
        + distance_frame['Horizontal_Distance_To_Fire_Points']
    )

## Final training features

In [None]:
# Report the shape of each frame after feature engineering
for frame_label, engineered_frame in (('training', train), ('test', test)):
    print(f'Final {frame_label} features shape:', engineered_frame.shape)

In [None]:
# Preview the training frame, now including the engineered columns
train.head()

## Train test split

In [None]:
# Labels: shift the cover types from 1-7 down to 0-6 and store them as a
# categorical series.
y = train['Cover_Type'].sub(1).astype('category')

# Features: every column except the label
X = train.drop(columns = ['Cover_Type'])

print("Pre-split training features shape:", X.shape)
print("Pre-split training label shape:", y.shape)

In [None]:
# Hold out 10% of the rows for validation, stratified on the label, with a
# fixed random_state for a repeatable split.
split_options = dict(train_size = 0.9, test_size = 0.1, random_state = 5, stratify = y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

print("Post-split training feature shape:", X_train.shape)
print("Post-split training label shape:", y_train.shape)

# CatBoost Classifier

Accuracy: 0.8757

In [None]:
# CatBoost hyper-parameters, collected in one place
cbc_settings = dict(
    random_state = 20,
    iterations = 3000,
    learning_rate = 0.03,
    od_wait = 1000,
    depth = 7,
    l2_leaf_reg = 3,
    eval_metric = 'Accuracy',
    verbose = 1000,
)

cbc = CatBoostClassifier(**cbc_settings)
cbc.fit(X_train, y_train)

# Accuracy on the held-out validation rows
pred_valid_cbc = cbc.predict(X_valid)
cbc_valid_accuracy = metrics.accuracy_score(y_valid, pred_valid_cbc)
print(cbc_valid_accuracy)

# Extra Trees Classifier

Accuracy: 0.8981

In [None]:
# max_features = 'sqrt' replaces the former 'auto' value. For classifiers
# 'auto' meant sqrt(n_features); it was deprecated in scikit-learn 1.1 and
# removed in 1.3, where passing it raises an error.
etc = ExtraTreesClassifier(random_state = 20, n_jobs = -1, max_features = 'sqrt')

etc.fit(X_train, y_train)
pred_valid_etc = etc.predict(X_valid)

# Predict the test set as well; 'Id' is not a model feature, so it is dropped first.
temp = test.drop(columns = ['Id']) 
pred_test_etc = etc.predict(temp)

# Accuracy on the held-out validation rows
print(metrics.accuracy_score(y_valid, pred_valid_etc))

# XGBoost Classifier

Accuracy: 0.8108

In [None]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical = True)
dtest_clf = xgb.DMatrix(X_valid, y_valid, enable_categorical = True)

In [None]:
# Seven cover types
params = {'objective': 'multi:softprob', 'num_class': 7}
n = 100

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round = n,
   nfold = 5,
   metrics = ['mlogloss', 'auc', 'merror'],
)

In [None]:
# Show the cross-validation results. Leaving the frame as the cell's last
# expression renders it as a formatted table instead of plain printed text.
results

In [None]:
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
params = {'objective': 'multi:softprob', 'num_class': 7, 'n_estimators': 100, 
          'max_depth': 5, 'learning_rate': 0.1}

xgb_clf = xgb.XGBClassifier(**params)
xgb_clf = xgb_clf.fit(X_train, y_train)

In [None]:
# Validation accuracy of the first XGBoost model, rounded to four decimals
y_pred = xgb_clf.predict(X_valid)
accuracy = accuracy_score(y_true = y_valid, y_pred = y_pred)

print('Accuracy: ', round(accuracy, 4))

## XGBoost model with the best parameters:

Accuracy: 0.8849

In [None]:
# Using scorer 'accuracy'

best_params = {
    'objective': 'multi:softprob', 
    'num_class': 7,
    'max_depth': 7, 
    'min_child_weight': 1,
    'gamma': 0,
    'n_estimators': 1500,
    'seed': 1
}

best_xgb_clf = xgb.XGBClassifier(**best_params)
best_xgb_clf = best_xgb_clf.fit(X_train, y_train)

In [None]:
# Validation accuracy of the tuned XGBoost model, rounded to four decimals
y_pred = best_xgb_clf.predict(X_valid)
accuracy = accuracy_score(y_true = y_valid, y_pred = y_pred)

print('Accuracy: ', round(accuracy, 4))

Accuracy: 81.08% with default parameters

Accuracy: 88.49% with previous best params

(In the previous notebook, accuracy without the additional features was 87.20%.)

# Submission

In [None]:
# Feature matrix for the submission: the test frame without its 'Id' column
submission_data = test.drop('Id', axis = 'columns')

print("Submission data features shape:", submission_data.shape)

In [None]:
#Best XGBoost
#submission_predictions = best_xgb_clf.predict(submission_data)

#CatBoost
#submission_predictions = cbc.predict(submission_data)

#Extra Trees
submission_predictions = etc.predict(submission_data)

In [None]:
# Pair every test Id with its predicted label
df = pd.DataFrame({'Id': test['Id'], 'Cover_Type': submission_predictions})

# The models were trained on labels 0-6; shift them back to the range 1-7
df['Cover_Type'] = df['Cover_Type'] + 1

# Write the submission file without the DataFrame index
df.to_csv('submission.csv', index = False)

Extra Trees submission score: 0.74848 (added hydrology distance feature and removed outliers)

Extra Trees submission score: 0.74974 (combined hillshade feature and binned elevation)

Extra Trees submission score: 0.76476 (combined elevation and fire points feature)

**Extra Trees submission score: 0.77866 (combined distance to road and distance to fire points features)**