# DSCI-598 Capstone
## Maryville University
### November - December 2023
### Alison Hawke

## K Nearest Neighbours model

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Exploring the data set

In [None]:
# Read the labelled training data. dtype=str keeps every column as text.
train_csv_path = '/kaggle/input/forest-cover-type-prediction/train.csv'
train = pd.read_csv(train_csv_path, dtype=str)
train.shape

In [None]:
# Read the unlabelled Kaggle test data. dtype=str keeps every column as text.
test_csv_path = '/kaggle/input/forest-cover-type-prediction/test.csv'
test = pd.read_csv(test_csv_path, dtype=str)
test.shape

## Using a limited feature set based on the Gini importance threshold

In [None]:
# Columns that make up the reduced feature set, split into the continuous
# measurements and the one-column-per-category indicator columns.
numeric_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
categorical_cols = [
    'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
    'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
    'Soil_Type6', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13',
    'Soil_Type16', 'Soil_Type17', 'Soil_Type20', 'Soil_Type22', 'Soil_Type23',
    'Soil_Type24', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
    'Soil_Type33', 'Soil_Type35', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40',
]

# Pull each group out of the training frame as a plain array; the label
# array comes from the 'Cover_Type' column.
X_num_train = train[numeric_cols].values
X_cat_train = train[categorical_cols].values
y = train['Cover_Type'].values

# Report the shape of each array.
for label, arr in (
    ('Numerical Feature Array Shape:   ', X_num_train),
    ('Categorical Feature Array Shape: ', X_cat_train),
    ('Label Array Shape:               ', y),
):
    print(label, arr.shape)

In [None]:
# Place the numeric and categorical blocks side by side (column-wise) to
# form a single feature matrix.
X = np.concatenate([X_num_train, X_cat_train], axis=1)

print('Feature Array Shape:', X.shape)

In [None]:
# Two-stage stratified split with a fixed seed.
#   stage 1: 80% of the rows -> train, 20% -> hold-out
#   stage 2: 80% of the hold-out -> validation, 20% -> test
# The test split therefore holds 0.2 * 0.2 = 4% of the rows.
split_kwargs = {'test_size': 0.2, 'random_state': 1}

X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, stratify=y, **split_kwargs
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_hold, y_hold, stratify=y_hold, **split_kwargs
)

# K Nearest Neighbors Classification model

[Model information](https://scikit-learn.org/stable/modules/neighbors.html) from scikit-learn

In [None]:
# Fit a k-nearest-neighbours classifier, constructed with no arguments, on
# the training split of the reduced feature set.
# NOTE(review): the features are passed to the model as-is; StandardScaler is
# imported at the top of the notebook but is not applied in the visible cells.
# Confirm this is intended for a distance-based model.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out test split.
y_pred = knn.predict(X=X_test)

In [None]:
# Accuracy of the reduced-feature model on the held-out test split: y_test is
# compared with the predictions made for X_test. None of these rows were used
# for fitting, so this is a hold-out score, not a training score, and it is
# named and labelled accordingly.
holdout_acc = accuracy_score(y_test, y_pred)
train_acc = holdout_acc  # legacy name, kept so any later reference still resolves
print('Hold-out accuracy: ', round(holdout_acc, 4))

# Predicting the test set

Using the reduced feature set.

In [None]:
# Same reduced column set as used for training, listed here so this cell
# selects exactly those columns from the Kaggle test frame.
numeric_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
categorical_cols = [
    'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
    'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
    'Soil_Type6', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13',
    'Soil_Type16', 'Soil_Type17', 'Soil_Type20', 'Soil_Type22', 'Soil_Type23',
    'Soil_Type24', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
    'Soil_Type33', 'Soil_Type35', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40',
]

X_num_test = test[numeric_cols].values
X_cat_test = test[categorical_cols].values

# Column-wise join, numeric block first, matching the training matrix layout.
X_submission = np.concatenate([X_num_test, X_cat_test], axis=1)

# Report the shape of each array.
for label, arr in (
    ('Numerical Feature Array Shape:   ', X_num_test),
    ('Categorical Feature Array Shape: ', X_cat_test),
    ('Feature Array Shape:', X_submission),
):
    print(label, arr.shape)

In [None]:
%%time

y_pred = knn.predict(X_submission)

# Submission with limited feature set

In [None]:
# Build and write the submission file for the reduced-feature model.
#
# The Ids are taken from the `test` frame that is already in memory rather
# than parsing test.csv from disk a second time. `test` was read with
# dtype=str, so the column is converted back to a numeric array.
# NOTE(review): presumably the Id values are integers; pd.to_numeric raises
# if any value cannot be parsed as a number.
Id = pd.to_numeric(test['Id']).to_numpy()

print('Id: ', Id.shape)
print('y_pred: ', y_pred.shape)

# y_pred is rebound in several cells; make sure it currently holds exactly one
# prediction per test row before pairing it with the Ids.
if Id.shape[0] != y_pred.shape[0]:
    raise ValueError(
        f'Expected one prediction per test row, got {y_pred.shape[0]} '
        f'predictions for {Id.shape[0]} rows'
    )

submission = pd.DataFrame({'Id': Id, 'Cover_Type': y_pred})
submission.to_csv('submission.csv', header=True, index=False)

**Submission score: 0.61554** (using v5)

Hold-out accuracy:  0.7967

# Comparison with the full data set

In [None]:
# Full-feature comparison: keep every column except the row Id and the label.
# This rebinds X and the split variables that previously held the
# reduced-feature data; the labels `y` are reused unchanged.
X = train.drop(columns=['Id', 'Cover_Type'])

# Same two-stage stratified split and seed as for the reduced feature set.
split_kwargs = {'test_size': 0.2, 'random_state': 1}

X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, stratify=y, **split_kwargs
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_hold, y_hold, stratify=y_hold, **split_kwargs
)

In [None]:
# A second classifier, again constructed with no arguments, fitted on the
# full-feature training split.
knn_full = KNeighborsClassifier()
knn_full.fit(X=X_train, y=y_train)

# Predicted labels for the full-feature test split (rebinds y_pred).
y_pred = knn_full.predict(X=X_test)

In [None]:
# Accuracy of the full-feature model on the held-out test split: y_test is
# compared with the predictions made for X_test. None of these rows were used
# for fitting, so this is a hold-out score, not a training score, and it is
# named and labelled accordingly.
holdout_acc = accuracy_score(y_test, y_pred)
train_acc = holdout_acc  # legacy name, kept so any later reference still resolves
print('Hold-out accuracy: ', round(holdout_acc, 4))

# Predicting the test set using all features

In [None]:
# Predict the Kaggle test set with the full-feature model and write the
# submission. `Id` is reused from the earlier submission cell, and the output
# goes to the same 'submission.csv' path, replacing the file written there.
X_submission = test.drop(columns=['Id'])
y_pred = knn_full.predict(X=X_submission)

submission = pd.DataFrame(data={'Id': Id, 'Cover_Type': y_pred})
submission.to_csv('submission.csv', index=False, header=True)

**Submission score: 0.61554** (using v10)

Hold-out accuracy:  0.7967