# DSCI-598 Capstone
## Maryville University
### November - December 2023
### Alison Hawke

## Support Vector Machine

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

## Exploring the data set

In [None]:
# Load the labelled training data and show its (rows, columns) shape.
# Column types are left to pandas' inference instead of forcing dtype=str,
# so numeric columns are parsed as numbers and the model code does not have
# to rely on implicit string-to-float coercion.
train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv')
train.shape

In [None]:
# Preview the first rows. `head` has to be called: the bare attribute only
# displays the bound-method repr rather than a short preview.
train.head()

In [None]:
# Load the unlabelled competition data and show its (rows, columns) shape.
# Column types are inferred by pandas rather than forced to str; holding
# every cell as a Python string is wasteful and pushes the string-to-number
# conversion onto the estimator at prediction time.
test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv')
test.shape

In [None]:
# Count the missing entries in each column of the training frame.
missing_per_column = train.isna().sum()
missing_per_column

In [None]:
# Number of training rows per Cover_Type value, ordered by the label itself
# rather than by frequency.
class_counts = train.value_counts(subset=['Cover_Type'])
class_counts.sort_index()

In [None]:
# Column names for the two feature groups taken from the training frame.
numeric_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
# Indicator columns follow a numbered naming scheme, so the names are
# generated: Wilderness_Area1..4 followed by Soil_Type1..40.
indicator_columns = (
    [f'Wilderness_Area{i}' for i in range(1, 5)]
    + [f'Soil_Type{i}' for i in range(1, 41)]
)

# Pull each group out as a plain array; the label column becomes `y`.
X_num_train = train[numeric_columns].values
X_cat_train = train[indicator_columns].values
y = train['Cover_Type'].values

# Report the shape of each extracted array.
for caption, extracted in (
    ('Numerical Feature Array Shape:   ', X_num_train),
    ('Categorical Feature Array Shape: ', X_cat_train),
    ('Label Array Shape:               ', y),
):
    print(caption, extracted.shape)

In [None]:
# Put the numerical and categorical blocks side by side (column-wise) to
# form the full feature matrix.
X = np.concatenate([X_num_train, X_cat_train], axis=1)

print('Feature Array Shape:', X.shape)

In [None]:
# Two stratified splits sharing the same settings: first 80% of the rows go
# to training, then the 20% hold-out is itself divided 80/20 into validation
# and test portions.
split_options = {'test_size': 0.2, 'random_state': 1}

X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, stratify=y, **split_options
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_hold, y_hold, stratify=y_hold, **split_options
)

## Support Vector Machine model

In [None]:
# Baseline: an SVC with default hyperparameters, scored on the hold-out test
# split. SVMs are not scale invariant, and the feature matrix mixes distance
# columns with 0/1 indicator columns, so every feature is rescaled to [0, 1]
# first. Min-max scaling (rather than standardisation) leaves the indicator
# columns as 0/1 and avoids blowing up rarely-set indicators.
# The scaler is fitted on the training split only, as part of the pipeline.
svc_model = make_pipeline(MinMaxScaler(), SVC())
svc_model.fit(X_train, y_train)
predictions = svc_model.predict(X_test)

In [None]:
# Print the confusion matrix and then the per-class precision/recall report
# for the baseline predictions on the hold-out test split.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, predictions))

In [None]:
%%time

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]} 
grid = GridSearchCV(SVC(),param_grid,refit = True,verbose = 2)
grid.fit(X_train, y_train)

In [None]:
# Score the refitted best estimator from the grid search on the hold-out
# test split: confusion matrix first, then the per-class report.
grid_predictions = grid.predict(X_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, grid_predictions))

## Predicting the test set

In [None]:
# Extract the same two feature groups from the competition frame, in the
# same column order that was used for the training arrays.
test_numeric_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
# Indicator names are generated: Wilderness_Area1..4, then Soil_Type1..40.
test_indicator_columns = (
    [f'Wilderness_Area{i}' for i in range(1, 5)]
    + [f'Soil_Type{i}' for i in range(1, 41)]
)

X_num_test = test[test_numeric_columns].values
X_cat_test = test[test_indicator_columns].values

In [None]:
%%time

X_test = np.hstack((X_num_test, X_cat_test))
grid_predictions = grid.predict(X_test)

## Submission

In [None]:
# Assemble and write the submission file: one row per competition sample,
# holding its Id and the predicted Cover_Type.
# The Id column is taken from the already-loaded `test` frame instead of
# parsing the whole CSV a second time for a single column.
Id = test['Id'].to_numpy()

# Both shapes should report the same number of rows.
print('Id: ', Id.shape)
print('grid_predictions: ', grid_predictions.shape)

submission = pd.DataFrame({'Id': Id, 'Cover_Type': grid_predictions})
submission.to_csv('submission.csv', header = True, index = False)