#Importing relevant Libraries

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#Loading Testing and Training Dataset

In [0]:
PATH = '/content/drive/My Drive/Colab Notebooks/Capstone Project/'

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
test = pd.read_csv(PATH+'Data/model_testSmote.csv')
train = pd.read_csv(PATH+'Data/model_trainSmote.csv')

In [0]:
test.drop('Unnamed: 0', axis=1, inplace = True)
train.drop('Unnamed: 0', axis=1, inplace = True)

In [0]:
train.dtypes

Heigh_From_Sea_Level                 float64
Aspect                               float64
Slope                                float64
Distance_To_Water_Source             float64
Standing_Distance_To_Water_Source    float64
Distance_To_Road                     float64
Shadow_In_Morning                    float64
Shadow_In_Midday                     float64
Shadow_In_Evening                    float64
Distance_To_Fire                     float64
Turf                                   int64
Neighbourhood_Type                     int64
Cat_Plant_Type                         int64
dtype: object

In [0]:
train.columns

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire', 'Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'],
      dtype='object')

In [0]:
#Converting Categorical Variables into Category Type
catCols = train.columns[10:]
print(catCols)
train[catCols] = train[catCols].astype('category')
test[catCols] = test[catCols].astype('category')

Index(['Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'], dtype='object')


#Model Training

In [0]:
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from time import time
from sklearn.metrics import classification_report 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

##Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [0]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [0]:
n = 5

xCol = xTrain.columns[0:10]

model = RandomForestClassifier(n_jobs=-1)
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True  True False  True False  True False False False  True]
Feature Ranking: [1 1 6 1 3 1 2 4 5 1]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Slope',
 'Standing_Distance_To_Water_Source',
 'Shadow_In_Midday',
 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
random_forest = RandomForestClassifier(n_estimators = 100)

start = time()
random_forest.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = random_forest.predict(xTrain)

start = time()
yTestPred = random_forest.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))

print('The accuracy of the Random Forest classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Random Forest classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  253
Total testing time is  4
The accuracy of the Random Forest classifier on training data is 1.00
The accuracy of the Random Forest classifier on test data is 0.96


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     31159
           1       0.96      0.98      0.97     41693
           2       0.94      0.96      0.95      4061
           3       0.89      0.82      0.85       256
           4       0.92      0.86      0.89      1333
           5       0.92      0.93      0.92      2238
           6       0.96      0.98      0.97      2627

    accuracy                           0.96     83367
   macro avg       0.94      0.92      0.93     83367
weighted avg       0.96      0.96      0.96     83367



##Decision Tree

In [0]:
from sklearn import tree

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [0]:
n = 5

xCol = xTrain.columns[0:10]
print(xCol)
model = tree.DecisionTreeClassifier(criterion = 'gini')
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')
Num Features: 5
Selected Features: [ True False False  True False  True  True False False  True]
Feature Ranking: [1 4 6 1 3 1 1 2 5 1]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect', 'Slope', 'Standing_Distance_To_Water_Source', 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
decision_tree = tree.DecisionTreeClassifier(criterion = 'gini')

start = time()
decision_tree.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = decision_tree.predict(xTrain)

start=time()
yTestPred = decision_tree.predict(xTest)
end=time()

print('Total testing time is ', round(end-start))
print('The accuracy of the Decision Tree Classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Decision Tree classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  22
Total testing time is  0
The accuracy of the Decision Tree Classifier on training data is 1.00
The accuracy of the Decision Tree classifier on test data is 0.93


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92     31159
           1       0.93      0.95      0.94     41693
           2       0.91      0.91      0.91      4061
           3       0.80      0.73      0.76       256
           4       0.83      0.77      0.80      1333
           5       0.86      0.85      0.85      2238
           6       0.93      0.93      0.93      2627

    accuracy                           0.93     83367
   macro avg       0.88      0.86      0.87     83367
weighted avg       0.93      0.93      0.93     83367

