Author: Raymond Fleming
Course: Machine Learning Fall 2023
Chosen Classifier: Random Forest
Leaderboard Accuracy: .94373 (94.373%)

In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/miniproject3fall2023data/train.csv
/kaggle/input/miniproject3fall2023data/test.csv


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as mpl
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Read train and test csv's into pandas dataframes
train=pd.read_csv("/kaggle/input/miniproject3fall2023data/train.csv")
test=pd.read_csv("/kaggle/input/miniproject3fall2023data/test.csv")

### Check for simple null values first

In [8]:
#look for null values in training data
train.isnull().sum()

ID                                    0
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0


In [9]:
train.shape

(464809, 56)

### There appears to be a large (56) number of features for this dataset, reducing this may improve predictive performance.

### Splitting the target column from the training set, and dropping a column identified below as possibly unneeded

In [10]:
X=train
y=train.Cover_Type
X.drop(['Cover_Type'],inplace=True,axis=1)
X.drop(['Aspect'],inplace=True,axis=1)

#'Horizontal_Distance_To_Hydrology' and 'Aspect' were identified below as candidate columns to drop, but in testing only dropping Aspect provided a better result
#X.drop(['Horizontal_Distance_To_Hydrology'],inplace=True,axis=1)
cleanTest=test
cleanTest.drop(['Aspect'],inplace=True,axis=1)
#cleanTest.drop(['Horizontal_Distance_To_Hydrology'],inplace=True,axis=1)


### SoilType and WildernessType are One-Hot encoded.  This generates a large number of features, it may be better to enumerate these features into a single column.  This will be done by first creating a new dataframe with those columns, combining it into a single enumerated column, then removing the columns from the dataset, then adding the single enumerated columns back in. This is performed on both the train and test datasets.

In [11]:
#Generate a new DF with the soiltype columns for training,then test dataset
SoilType=X.iloc[:,14:54]
SoilType

Unnamed: 0,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
464805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
464806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
464807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
SoilTypeTest=test.iloc[:,14:54]
SoilTypeTest

Unnamed: 0,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116199,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116201,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
#Generate a new DF with the wildernessarea columns for training, then test dataset
WildernessType=X.iloc[:,10:14]
WildernessType

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
0,1,0,0,0
1,0,0,1,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0
...,...,...,...,...
464804,1,0,0,0
464805,0,1,0,0
464806,1,0,0,0
464807,1,0,0,0


In [14]:
WildernessTypeTest=test.iloc[:,10:14]
WildernessTypeTest

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
116198,0,0,1,0
116199,0,0,0,1
116200,1,0,0,0
116201,0,0,1,0


In [15]:
#Create a single column with the name of the column containing a 1 for soiltype in train and test sets
SoilTypeSingleCol=SoilType.idxmax(axis=1)
SoilTypeTestSingleCol=SoilTypeTest.idxmax(axis=1)

In [16]:
#Create a single column with the name of the column containing a 1 for wildernesstype in train and test sets
WildernessTypeSingleCol=WildernessType.idxmax(axis=1)
WildernessTypeTestSingleCol=WildernessTypeTest.idxmax(axis=1)

In [17]:
#Add a column name
SoilTypes=pd.DataFrame(SoilTypeSingleCol,columns=["Type"])
WildernessTypes=pd.DataFrame(WildernessTypeSingleCol,columns=["Type"])
SoilTypesTest=pd.DataFrame(SoilTypeTestSingleCol,columns=["Type"])
WildernessTypesTest=pd.DataFrame(WildernessTypeTestSingleCol,columns=["Type"])

In [18]:
#Enumerate soiltypes column in train set
class_mapping={label:idx for idx, label in (enumerate(np.unique(SoilTypes['Type'])))}
SoilTypes['Type']=SoilTypes['Type'].map(class_mapping)
class_mapping

{'Soil_Type1': 0,
 'Soil_Type10': 1,
 'Soil_Type11': 2,
 'Soil_Type12': 3,
 'Soil_Type13': 4,
 'Soil_Type14': 5,
 'Soil_Type15': 6,
 'Soil_Type16': 7,
 'Soil_Type17': 8,
 'Soil_Type18': 9,
 'Soil_Type19': 10,
 'Soil_Type2': 11,
 'Soil_Type20': 12,
 'Soil_Type21': 13,
 'Soil_Type22': 14,
 'Soil_Type23': 15,
 'Soil_Type24': 16,
 'Soil_Type25': 17,
 'Soil_Type26': 18,
 'Soil_Type27': 19,
 'Soil_Type28': 20,
 'Soil_Type29': 21,
 'Soil_Type3': 22,
 'Soil_Type30': 23,
 'Soil_Type31': 24,
 'Soil_Type32': 25,
 'Soil_Type33': 26,
 'Soil_Type34': 27,
 'Soil_Type35': 28,
 'Soil_Type36': 29,
 'Soil_Type37': 30,
 'Soil_Type38': 31,
 'Soil_Type39': 32,
 'Soil_Type4': 33,
 'Soil_Type40': 34,
 'Soil_Type5': 35,
 'Soil_Type6': 36,
 'Soil_Type7': 37,
 'Soil_Type8': 38,
 'Soil_Type9': 39}

In [19]:
#Enumerate soiltypes column in test set
class_mapping={label:idx for idx, label in (enumerate(np.unique(SoilTypesTest['Type'])))}
SoilTypesTest['Type']=SoilTypesTest['Type'].map(class_mapping)
class_mapping

{'Soil_Type1': 0,
 'Soil_Type10': 1,
 'Soil_Type11': 2,
 'Soil_Type12': 3,
 'Soil_Type13': 4,
 'Soil_Type14': 5,
 'Soil_Type16': 6,
 'Soil_Type17': 7,
 'Soil_Type18': 8,
 'Soil_Type19': 9,
 'Soil_Type2': 10,
 'Soil_Type20': 11,
 'Soil_Type21': 12,
 'Soil_Type22': 13,
 'Soil_Type23': 14,
 'Soil_Type24': 15,
 'Soil_Type25': 16,
 'Soil_Type26': 17,
 'Soil_Type27': 18,
 'Soil_Type28': 19,
 'Soil_Type29': 20,
 'Soil_Type3': 21,
 'Soil_Type30': 22,
 'Soil_Type31': 23,
 'Soil_Type32': 24,
 'Soil_Type33': 25,
 'Soil_Type34': 26,
 'Soil_Type35': 27,
 'Soil_Type36': 28,
 'Soil_Type37': 29,
 'Soil_Type38': 30,
 'Soil_Type39': 31,
 'Soil_Type4': 32,
 'Soil_Type40': 33,
 'Soil_Type5': 34,
 'Soil_Type6': 35,
 'Soil_Type7': 36,
 'Soil_Type8': 37,
 'Soil_Type9': 38}

In [20]:
#Enumerate wildernesstype column in train set
class_mapping={label:idx for idx, label in (enumerate(np.unique(WildernessTypes['Type'])))}
WildernessTypes['Type']=WildernessTypes['Type'].map(class_mapping)
class_mapping

{'Wilderness_Area1': 0,
 'Wilderness_Area2': 1,
 'Wilderness_Area3': 2,
 'Wilderness_Area4': 3}

In [21]:
#Enumerate wildernesstype column in test set
class_mapping={label:idx for idx, label in (enumerate(np.unique(WildernessTypesTest['Type'])))}
WildernessTypesTest['Type']=WildernessTypesTest['Type'].map(class_mapping)
class_mapping

{'Wilderness_Area1': 0,
 'Wilderness_Area2': 1,
 'Wilderness_Area3': 2,
 'Wilderness_Area4': 3}

In [22]:
X

Unnamed: 0,ID,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,261835,3171,20,270,48,2758,223,248,147,3194,...,0,0,0,0,0,0,0,0,0,0
1,549115,3067,26,0,0,3396,198,252,176,792,...,0,0,1,0,0,0,0,0,0,0
2,7086,3382,4,162,23,3445,211,234,162,2259,...,0,0,0,0,1,0,0,0,0,0
3,436845,3067,9,150,16,2813,203,247,182,641,...,0,0,0,0,0,0,0,0,0,0
4,177146,2720,19,162,41,175,230,196,90,1852,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464804,28756,2837,18,256,36,3502,211,253,170,6359,...,0,0,0,0,0,0,0,0,0,0
464805,552822,3374,13,270,45,933,240,233,121,4121,...,0,0,0,0,0,0,0,1,0,0
464806,126667,3142,7,60,3,2976,229,228,132,3467,...,0,0,0,0,0,0,0,0,0,0
464807,213960,3212,9,67,9,4674,196,236,179,420,...,0,0,0,0,0,0,0,1,0,0


In [23]:
#Creating a list of soiltype columns to drop
SoilTypeList=["Soil_Type1","Soil_Type2","Soil_Type3","Soil_Type4","Soil_Type5","Soil_Type6","Soil_Type7","Soil_Type8","Soil_Type9","Soil_Type10","Soil_Type11","Soil_Type12","Soil_Type13","Soil_Type14","Soil_Type15","Soil_Type16","Soil_Type17","Soil_Type18","Soil_Type19","Soil_Type20","Soil_Type21","Soil_Type22","Soil_Type23","Soil_Type24","Soil_Type25","Soil_Type26","Soil_Type27","Soil_Type28","Soil_Type29","Soil_Type30","Soil_Type31","Soil_Type32","Soil_Type33","Soil_Type34","Soil_Type35","Soil_Type36","Soil_Type37","Soil_Type38","Soil_Type39","Soil_Type40"]

In [24]:
#Creating a list of wilderness area columns to drop
WildernessTypeList=["Wilderness_Area1","Wilderness_Area2","Wilderness_Area3","Wilderness_Area4"]

In [25]:
#Drop the listed soiltype columns from train and test sets
X.drop(SoilTypeList,inplace=True,axis=1)
cleanTest.drop(SoilTypeList,inplace=True,axis=1)

In [26]:
#Drop the wildernesstype columns from train and test sets
X.drop(WildernessTypeList,inplace=True,axis=1)
cleanTest.drop(WildernessTypeList,inplace=True,axis=1)

In [27]:
#Insert the enumerated columns into the training and test datasets
X.insert(column='WildernessArea',loc=10,value=WildernessTypes)
cleanTest.insert(column='WildernessArea',loc=10,value=WildernessTypesTest)

In [28]:
X

Unnamed: 0,ID,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,WildernessArea
0,261835,3171,20,270,48,2758,223,248,147,3194,0
1,549115,3067,26,0,0,3396,198,252,176,792,2
2,7086,3382,4,162,23,3445,211,234,162,2259,0
3,436845,3067,9,150,16,2813,203,247,182,641,2
4,177146,2720,19,162,41,175,230,196,90,1852,0
...,...,...,...,...,...,...,...,...,...,...,...
464804,28756,2837,18,256,36,3502,211,253,170,6359,0
464805,552822,3374,13,270,45,933,240,233,121,4121,1
464806,126667,3142,7,60,3,2976,229,228,132,3467,0
464807,213960,3212,9,67,9,4674,196,236,179,420,0


In [29]:
train.dtypes

ID                                    int64
Elevation                             int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
WildernessArea                        int64
dtype: object

train.hist(bins=50,figsize=(30,20),layout=(28,2))

In [30]:
X

Unnamed: 0,ID,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,WildernessArea
0,261835,3171,20,270,48,2758,223,248,147,3194,0
1,549115,3067,26,0,0,3396,198,252,176,792,2
2,7086,3382,4,162,23,3445,211,234,162,2259,0
3,436845,3067,9,150,16,2813,203,247,182,641,2
4,177146,2720,19,162,41,175,230,196,90,1852,0
...,...,...,...,...,...,...,...,...,...,...,...
464804,28756,2837,18,256,36,3502,211,253,170,6359,0
464805,552822,3374,13,270,45,933,240,233,121,4121,1
464806,126667,3142,7,60,3,2976,229,228,132,3467,0
464807,213960,3212,9,67,9,4674,196,236,179,420,0


In [31]:
Xcorr=pd.concat([y,X])
Xcorr=X
Xcorr.insert(0,"y",y)
CorMat=Xcorr.corr()
CorMat.iloc[:,1:10]

Unnamed: 0,ID,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm
y,-0.009637,-0.269669,0.148842,-0.020612,0.081548,-0.153505,-0.035899,-0.09661,-0.048104
ID,1.0,0.232048,0.179268,0.108779,0.129577,-0.382682,-0.085912,0.014815,0.050653
Elevation,0.232048,1.0,-0.242644,0.305803,0.092733,0.365586,0.111728,0.205409,0.0593
Slope,0.179268,-0.242644,1.0,-0.010632,0.273814,-0.216485,-0.326418,-0.527273,-0.176525
Horizontal_Distance_To_Hydrology,0.108779,0.305803,-0.010632,1.0,0.606455,0.071627,-0.026603,0.046589,0.051869
Vertical_Distance_To_Hydrology,0.129577,0.092733,0.273814,0.606455,1.0,-0.046451,-0.164928,-0.1109,0.034052
Horizontal_Distance_To_Roadways,-0.382682,0.365586,-0.216485,0.071627,-0.046451,1.0,0.034929,0.189682,0.105823
Hillshade_9am,-0.085912,0.111728,-0.326418,-0.026603,-0.164928,0.034929,1.0,0.010051,-0.780379
Hillshade_Noon,0.014815,0.205409,-0.527273,0.046589,-0.1109,0.189682,0.010051,1.0,0.594089
Hillshade_3pm,0.050653,0.0593,-0.176525,0.051869,0.034052,0.105823,-0.780379,0.594089,1.0


In [32]:
CorMat.iloc[:,10:20]

Unnamed: 0,Horizontal_Distance_To_Fire_Points,WildernessArea
y,-0.109093,0.276099
ID,-0.367427,0.568213
Elevation,0.147488,-0.31587
Slope,-0.18564,0.286733
Horizontal_Distance_To_Hydrology,0.051572,0.056237
Vertical_Distance_To_Hydrology,-0.070542,0.189129
Horizontal_Distance_To_Roadways,0.331491,-0.446969
Hillshade_9am,0.132823,-0.232407
Hillshade_Noon,0.057899,-0.082224
Hillshade_3pm,-0.047815,0.103307


In [33]:
X.dtypes

y                                     int64
ID                                    int64
Elevation                             int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
WildernessArea                        int64
dtype: object

In [34]:
X.drop(['y'],inplace=True,axis=1)

In [37]:
X.shape

(464809, 11)

### Feature count is now 11, down from 56. With the exception of the one dropped column 'Aspect', this was completed without a loss of information.

In [35]:
#split training dataset into labeled test and training data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

In [36]:
from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
sc.fit(X_train)

#scale the split train/test sets
XTrainScaled=sc.transform(X_train)
XTestScaled=sc.transform(X_test)

#scale the test dataset
testscaled=sc.transform(cleanTest)

In [38]:
from sklearn.linear_model import Perceptron

#create a perceptron instance
Perceptron=Perceptron(eta0=.01,random_state=1)

#fit scaled training data
Perceptron.fit(XTrainScaled,y_train)

#predict with scaled test data
y_pred=Perceptron.predict(XTestScaled)

#check accuracy of model
print(f"Perceptron Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=Perceptron.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'#0':pred})
out.to_csv('PerceptronSubmission.csv',index=False)

Perceptron Accuracy:0.5775765007924385
F1: 0.4204453360262597


### Perceptron's F1 score is awful at 42.04%, meaning that it would not make a good model choice. However, it is very fast to train compared to the other models.

In [39]:
from sklearn.linear_model import LogisticRegression

#create a logistic regression instance
LogRes=LogisticRegression(C=1,solver='newton-cg',multi_class='ovr',max_iter=1000,penalty='l2')

#fit logistic regression model
LogRes.fit(XTrainScaled,y_train)

#Predict then test accuracy of predictions of logistic regression model using split test set
y_pred=LogRes.predict(XTestScaled)
print(f"Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=LogRes.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'#0':pred})
out.to_csv('LogResSubmission.csv',index=False)

Accuracy:0.6840859706116478
F1: 0.4321332341092529


### Logistic Regression provides a score of 43.2%.  This is slightly better but not much better than the perceptron. While its possible that hyperparameter tuning may improve this score, it is unlikely to more than double the current accuracy to reach the needed threshold.

In [48]:
from sklearn.svm import SVC

#create a support vector machine instance
svm=SVC(kernel='rbf',C=1,gamma='auto',random_state=1)


#fit support vector machine model
svm.fit(XTrainScaled,y_train)

#Predict then test accuracy of predictions of the support vector machine model using split test set
y_pred=svm.predict(XTestScaled)
print(f"Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=svm.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'#0':pred})
out.to_csv('SVMSubmission.csv',index=False)

Accuracy:0.8088035971687356
F1: 0.7092092212902158


### The support vector machine has an accuracy of 70.91 which is lower than desired, and this model took just under 3 hours to train. It is not a good candidate model for this data.

In [42]:
from sklearn.tree import DecisionTreeClassifier

#Create a decision tree classifier instance
Tree=DecisionTreeClassifier(criterion='gini',max_depth=160,random_state=1)

#Create a decision tree classifier instance
Tree.fit(XTrainScaled,y_train)

#Predict then test accuracy of predictions of logistic regression model using split test set
y_pred=Tree.predict(XTestScaled)
print(f"Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=Tree.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'#0':pred})
out.to_csv('TreeSubmission.csv',index=False)

Accuracy:0.9463149817488149
F1: 0.9044301907409092


### Decision Tree provides a score of 90.44%, this is much better than the perceptron or logistic regression. Training is also quick, in comparison to the more complex models used. This is a valid possible model, and depending on requirements could be used rather than the chosen random forest which, while a bit better in accuracy, takes a much longer time to train.

In [47]:
from sklearn.neighbors import KNeighborsClassifier

#Create a K nearest neighbors instance
knn=KNeighborsClassifier(n_neighbors=3,p=1,metric='minkowski')

#Fit K nearest neighbors model
knn.fit(XTrainScaled,y_train)

#Predict then test accuracy of predictions of logistic regression model using split test set
y_pred=knn.predict(XTestScaled)
print(f"Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=knn.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'#0':pred})
out.to_csv('KNNSubmission.csv',index=False)

Accuracy:0.932796913434163
F1: 0.8811344216347065


### K Nearest Neighbors provides a score of 88.11%, considerably higher than perceptron and logistic regression, but somewhat less than the decision tree model, and also has a longer training time and overhead. This is not the best model to use for this dataset.

In [46]:
from sklearn.ensemble import RandomForestClassifier

#Create a random forest instance
RandomForest=RandomForestClassifier(n_estimators=1000,random_state=1,n_jobs=2,max_features=5)
#Fit random forest model
RandomForest.fit(XTrainScaled,y_train)

#Predict then test accuracy of predictions of logistic regression model using split test set
y_pred=RandomForest.predict(XTestScaled)
print(f"Accuracy:{accuracy_score(y_test,y_pred)}")
print(f"F1: {f1_score(y_test,y_pred, average='macro')}")

#Predict the target column using the scaled test set
pred=RandomForest.predict(testscaled)

#Output the predictions into the format required (columns: ID #0) into CSV format
out=pd.DataFrame({'ID':test.ID,'Cover_Type':pred})
out.to_csv('RandomForestSubmission.csv',index=False)

Accuracy:0.9712714155604799
F1: 0.9435738063738042


### Random Forest provides a score of 94.36%, this is the best score found. Prior to hyperparameter tuning, it was slightly less at around 93%. For this project, the hyperparameter tuning was done manually. The training time for this model is long enough that a grid search, or larger random search would take a significant amount of time. With the F1 score achieved, a grid or random search is not necessary.

### The random forest model was chosen as the final model. This model achieved an accuracy score of 94.373% when submitted.   In production, depending on resources and time needed either the random forest or the decision tree models could be used.   The decision tree is much fater to train, but loses around 5% accuracy in doing so. It is possible that with a full grid search, or further data processing that a slightly higher score could be achieved, however it is also likely that doing so will create an overfit condition.   The score achieved is well above the benchmark submission of 92.406%, so no further work was necessary. 