#Importing relevant Libraries

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#Loading Testing and Training Dataset

In [0]:
PATH = '/content/drive/My Drive/Colab Notebooks/Capstone Project/'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
test = pd.read_csv(PATH+'Data/model_test.csv')
train = pd.read_csv(PATH+'Data/model_trainUpDownsampled.csv')

In [0]:
test.drop('Unnamed: 0', axis=1, inplace = True)
train.drop('Unnamed: 0', axis=1, inplace = True)

In [6]:
train.dtypes

Heigh_From_Sea_Level                 float64
Aspect                               float64
Slope                                float64
Distance_To_Water_Source             float64
Standing_Distance_To_Water_Source    float64
Distance_To_Road                     float64
Shadow_In_Morning                    float64
Shadow_In_Midday                     float64
Shadow_In_Evening                    float64
Distance_To_Fire                     float64
Turf                                   int64
Neighbourhood_Type                     int64
Cat_Plant_Type                         int64
dtype: object

In [7]:
train.columns

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire', 'Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'],
      dtype='object')

In [8]:
#Converting Categorical Variables into Category Type
catCols = train.columns[10:]
print(catCols)
train[catCols] = train[catCols].astype('category')
test[catCols] = test[catCols].astype('category')

Index(['Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'], dtype='object')


#Model Training

In [0]:
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from time import time
from sklearn.metrics import classification_report 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

##Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [14]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [15]:
n = 5

xCol = xTrain.columns[0:10]

model = RandomForestClassifier()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True  True False  True False  True False False False  True]
Feature Ranking: [1 1 6 1 3 1 2 4 5 1]


In [16]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Slope',
 'Standing_Distance_To_Water_Source',
 'Shadow_In_Midday',
 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [19]:
random_forest = RandomForestClassifier(n_estimators = 100)

start = time()
random_forest.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = random_forest.predict(xTrain)

start = time()
yTestPred = random_forest.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))
print('The accuracy of the Random Forest classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Random Forest classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  37
Total testing time is  4
The accuracy of the Random Forest classifier on training data is 1.00
The accuracy of the Random Forest classifier on test data is 0.89


In [20]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89     31159
           1       0.93      0.87      0.90     41693
           2       0.84      0.97      0.90      4061
           3       0.88      0.87      0.87       256
           4       0.60      0.96      0.74      1333
           5       0.77      0.94      0.85      2238
           6       0.80      0.99      0.89      2627

    accuracy                           0.89     83367
   macro avg       0.82      0.93      0.86     83367
weighted avg       0.90      0.89      0.89     83367



###Hyperparameter Tuning

In [37]:
est = RandomForestClassifier(random_state=0, n_jobs = -1)

# Specify parameters and distributions to sample from
parameters = {
    "criterion": ['gini'],
    "max_depth": [None],
    "n_estimators" : [150,250]
}

random_search = RandomizedSearchCV(estimator = est,
                           param_distributions = parameters,
                           cv=2,
                           verbose = 2,
                           n_jobs = -1)

random_search = random_search.fit(xTrain, yTrain)
cv_results = pd.DataFrame.from_dict(random_search.cv_results_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.2min finished


In [38]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,22.899068,0.304381,3.152812,0.002635,150,,gini,"{'n_estimators': 150, 'max_depth': None, 'crit...",0.945064,0.942928,0.943996,0.001068,2
1,37.61486,0.870719,4.907988,0.452986,250,,gini,"{'n_estimators': 250, 'max_depth': None, 'crit...",0.945668,0.943301,0.944484,0.001183,1


In [39]:
rfc_best = random_search.best_estimator_
print(random_search.best_score_)
print(random_search.best_params_)

rfc_worst = random_search.cv_results_['params'][1]
print(rfc_worst)

0.9444843709796197
{'n_estimators': 250, 'max_depth': None, 'criterion': 'gini'}
{'n_estimators': 250, 'max_depth': None, 'criterion': 'gini'}


In [40]:
random_forest = RandomForestClassifier(n_estimators = 250)

start = time()
random_forest.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = random_forest.predict(xTrain)

start = time()
yTestPred = random_forest.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))
print('The accuracy of the Random Forest classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Random Forest classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  38
Total testing time is  6
The accuracy of the Random Forest classifier on training data is 1.00
The accuracy of the Random Forest classifier on test data is 0.90


We are having better metrics on the model with best params.

##Decision Tree

In [0]:
from sklearn import tree

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [44]:
n = 5

xCol = xTrain.columns[0:10]
print(xCol)
model = tree.DecisionTreeClassifier(criterion = 'gini')
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')
Num Features: 5
Selected Features: [ True False False  True False  True  True False False  True]
Feature Ranking: [1 4 5 1 3 1 1 2 6 1]


In [45]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect', 'Slope', 'Standing_Distance_To_Water_Source', 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [47]:
decision_tree = tree.DecisionTreeClassifier(criterion = 'gini')

start = time()
decision_tree.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = decision_tree.predict(xTrain)

start = time()
yTestPred = decision_tree.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))

print('The accuracy of the Decision Tree Classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Decision Tree classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  1
Total testing time is  0
The accuracy of the Decision Tree Classifier on training data is 1.00
The accuracy of the Decision Tree classifier on test data is 0.83


In [48]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83     31159
           1       0.88      0.81      0.84     41693
           2       0.80      0.93      0.86      4061
           3       0.78      0.75      0.77       256
           4       0.49      0.90      0.64      1333
           5       0.74      0.86      0.79      2238
           6       0.76      0.96      0.85      2627

    accuracy                           0.83     83367
   macro avg       0.75      0.86      0.80     83367
weighted avg       0.84      0.83      0.83     83367



##KNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [52]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [58]:
from sklearn.linear_model import LogisticRegression

n = 5

xCol = xTrain.columns[0:10]

model = LogisticRegression()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True False  True  True  True False False  True]
Feature Ranking: [1 2 1 1 1 3 4 1]


In [59]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Distance_To_Fire', 'Turf']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [61]:
knn = KNeighborsClassifier(n_neighbors = 7, p = 2, metric='minkowski')

start = time()
knn.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = knn.predict(xTrain)

start = time()
yTestPred = knn.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))
print('The accuracy of the KNN classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the KNN classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  3
Total testing time is  6
The accuracy of the KNN classifier on training data is 0.89
The accuracy of the KNN classifier on test data is 0.69


In [62]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.76      0.70      0.73     31159
           1       0.84      0.64      0.73     41693
           2       0.68      0.80      0.74      4061
           3       0.48      0.85      0.61       256
           4       0.17      0.93      0.29      1333
           5       0.46      0.83      0.59      2238
           6       0.42      0.97      0.59      2627

    accuracy                           0.69     83367
   macro avg       0.54      0.82      0.61     83367
weighted avg       0.77      0.69      0.71     83367



##Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###One-Hot Encoding

In [0]:
xTrain = pd.get_dummies(xTrain, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTest = pd.get_dummies(xTest, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTrain, xTest = xTrain.align(xTest, join = 'left', axis = 1)

###Feature Elimination

In [67]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [68]:
n = 5

xCol = xTrain.columns[0:10]

model = LogisticRegression()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True False False False False  True  True  True  True False]
Feature Ranking: [1 6 2 4 5 1 1 1 1 3]


In [69]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect',
 'Distance_To_Water_Source',
 'Standing_Distance_To_Water_Source',
 'Distance_To_Fire']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [71]:
logReg = LogisticRegression()

start = time()
logReg.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = logReg.predict(xTrain)
yTestPred = logReg.predict(xTest)
print('The accuracy of the Logistic Regression classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Logistic Regression classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))



Total training time is  4
The accuracy of the Logistic Regression classifier on training data is 0.68
The accuracy of the Logistic Regression classifier on test data is 0.55


In [72]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.67      0.63      0.65     31159
           1       0.78      0.47      0.58     41693
           2       0.56      0.55      0.56      4061
           3       0.22      0.94      0.35       256
           4       0.07      0.67      0.12      1333
           5       0.30      0.61      0.40      2238
           6       0.35      0.88      0.50      2627

    accuracy                           0.55     83367
   macro avg       0.42      0.68      0.45     83367
weighted avg       0.69      0.55      0.59     83367



##XGBoost

In [0]:
import xgboost as xgb

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###One-Hot Encoding

In [0]:
xTrain = pd.get_dummies(xTrain, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTest = pd.get_dummies(xTest, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTrain, xTest = xTrain.align(xTest, join = 'left', axis = 1)

###Feature Elimination

In [77]:
n = 5

xCol = xTrain.columns[0:10]

model = xgb.XGBClassifier()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

Num Features: 5
Selected Features: [ True  True False  True False False  True  True False False]
Feature Ranking: [1 1 6 1 5 3 1 1 4 2]


In [78]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Slope',
 'Standing_Distance_To_Water_Source',
 'Distance_To_Road',
 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [80]:
xTrain.dtypes

Heigh_From_Sea_Level        float64
Aspect                      float64
Distance_To_Water_Source    float64
Shadow_In_Morning           float64
Shadow_In_Midday            float64
Distance_To_Fire            float64
Turf__1                       uint8
Turf__2                       uint8
Turf__3                       uint8
Turf__4                       uint8
Turf__5                       uint8
Turf__6                       uint8
Turf__7                       uint8
Turf__8                       uint8
Turf__9                       uint8
Turf__10                      uint8
Turf__11                      uint8
Turf__12                      uint8
Turf__13                      uint8
Turf__14                      uint8
Turf__16                      uint8
Turf__17                      uint8
Turf__18                      uint8
Turf__19                      uint8
Turf__20                      uint8
Turf__21                      uint8
Turf__22                      uint8
Turf__23                    

In [81]:
col = xTrain.columns[6:]
print(col)

xTrain[col] = xTrain[col].astype(bool)
xTest[col] = xTest[col].astype(bool)

Index(['Turf__1', 'Turf__2', 'Turf__3', 'Turf__4', 'Turf__5', 'Turf__6',
       'Turf__7', 'Turf__8', 'Turf__9', 'Turf__10', 'Turf__11', 'Turf__12',
       'Turf__13', 'Turf__14', 'Turf__16', 'Turf__17', 'Turf__18', 'Turf__19',
       'Turf__20', 'Turf__21', 'Turf__22', 'Turf__23', 'Turf__24', 'Turf__25',
       'Turf__26', 'Turf__27', 'Turf__28', 'Turf__29', 'Turf__30', 'Turf__31',
       'Turf__32', 'Turf__33', 'Turf__34', 'Turf__35', 'Turf__36', 'Turf__37',
       'Turf__38', 'Turf__39', 'Turf__40', 'Neighbourhood_Type__1',
       'Neighbourhood_Type__2', 'Neighbourhood_Type__3',
       'Neighbourhood_Type__4'],
      dtype='object')


In [82]:
XGB = xgb.XGBClassifier(n_estimators = 100)

start = time()
XGB.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = XGB.predict(xTrain)
yTestPred = XGB.predict(xTest)
print('The accuracy of the XGB classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the XGB classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  95
The accuracy of the XGB classifier on training data is 0.76
The accuracy of the XGB classifier on test data is 0.57


In [83]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66     31159
           1       0.81      0.45      0.58     41693
           2       0.64      0.64      0.64      4061
           3       0.35      0.97      0.51       256
           4       0.09      0.89      0.17      1333
           5       0.35      0.71      0.47      2238
           6       0.34      0.95      0.50      2627

    accuracy                           0.57     83367
   macro avg       0.46      0.75      0.51     83367
weighted avg       0.71      0.57      0.60     83367

