#Importing relevant Libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#Loading Testing and Training Dataset

In [0]:
PATH = '/content/drive/My Drive/Colab Notebooks/Capstone Project/'

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
test = pd.read_csv(PATH+'Data/model_test.csv')
train = pd.read_csv(PATH+'Data/model_train.csv')

In [0]:
test.drop('Unnamed: 0', axis=1, inplace = True)
train.drop('Unnamed: 0', axis=1, inplace = True)

In [0]:
train.dtypes

Heigh_From_Sea_Level                 float64
Aspect                               float64
Slope                                float64
Distance_To_Water_Source             float64
Standing_Distance_To_Water_Source    float64
Distance_To_Road                     float64
Shadow_In_Morning                    float64
Shadow_In_Midday                     float64
Shadow_In_Evening                    float64
Distance_To_Fire                     float64
Turf                                   int64
Neighbourhood_Type                     int64
Cat_Plant_Type                         int64
dtype: object

In [0]:
train.columns

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire', 'Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'],
      dtype='object')

In [0]:
#Converting Categorical Variables into Category Type
catCols = train.columns[10:]
print(catCols)
train[catCols] = train[catCols].astype('category')
test[catCols] = test[catCols].astype('category')

Index(['Turf', 'Neighbourhood_Type', 'Cat_Plant_Type'], dtype='object')


#Model Training

In [0]:
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from time import time
from sklearn.metrics import classification_report 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

##Computing Class Weights

In [0]:
from sklearn.utils import class_weight

In [0]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train['Cat_Plant_Type']),
                                                 train['Cat_Plant_Type'])

In [0]:
train['Cat_Plant_Type'].value_counts()

1    166816
0    125089
2     16101
6     10276
5      9116
4      5110
3       959
Name: Cat_Plant_Type, dtype: int64

In [0]:
class_weights = {i : class_weights[i] for i in range(len(class_weights))}
class_weights

{0: 0.38083398905693433,
 1: 0.28557298374941764,
 2: 2.9587070900653907,
 3: 49.67481007001341,
 4: 9.322532848755941,
 5: 5.2257725819595064,
 6: 4.635864427514875}

##Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [0]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [0]:
n = 5

xCol = xTrain.columns[0:10]

model = RandomForestClassifier()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True  True False False  True  True False False False  True]
Feature Ranking: [1 1 6 2 1 1 4 3 5 1]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Slope', 'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
random_forest = RandomForestClassifier(n_estimators = 100, class_weight = class_weights, n_jobs = -1)

start = time()
random_forest.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = random_forest.predict(xTrain)

start=time()
yTestPred = random_forest.predict(xTest)
end=time()

print('The total testing time is ', round(end-start))

print('The accuracy of the Random Forest classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Random Forest classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  80
The total testing time is  3
The accuracy of the Random Forest classifier on training data is 1.00
The accuracy of the Random Forest classifier on test data is 0.96


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     31159
           1       0.96      0.98      0.97     41693
           2       0.95      0.96      0.95      4061
           3       0.91      0.84      0.87       256
           4       0.95      0.80      0.87      1333
           5       0.93      0.92      0.92      2238
           6       0.98      0.95      0.96      2627

    accuracy                           0.96     83367
   macro avg       0.95      0.92      0.93     83367
weighted avg       0.96      0.96      0.96     83367



###Hyperparameter Tuning

In [0]:
est = RandomForestClassifier(random_state=0, n_jobs = -1)

depth = [5,6,10,None]

# Specify parameters and distributions to sample from
parameters = {
    "criterion": ['gini', 'entropy'],
    "max_depth": depth,
    "n_estimators" : [25,50,100,150]
}

random_search = RandomizedSearchCV(estimator = est,
                           param_distributions = parameters,
                           cv=2,
                           verbose = 2,
                           n_jobs = -1)

random_search = random_search.fit(xTrain, yTrain)
cv_results = pd.DataFrame.from_dict(random_search.cv_results_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.2min finished


In [0]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,20.714918,0.013051,3.166929,0.029236,100,5.0,gini,"{'n_estimators': 100, 'max_depth': 5, 'criteri...",0.694743,0.699878,0.69731,0.002567,8
1,45.243411,2.18907,4.699298,0.021902,100,10.0,entropy,"{'n_estimators': 100, 'max_depth': 10, 'criter...",0.794476,0.794802,0.794639,0.000163,4
2,18.372512,0.360373,3.117419,0.205237,25,,gini,"{'n_estimators': 25, 'max_depth': None, 'crite...",0.942262,0.942489,0.942375,0.000113,2
3,38.749441,0.390175,5.67015,0.143248,150,5.0,gini,"{'n_estimators': 150, 'max_depth': 5, 'criteri...",0.695385,0.69842,0.696903,0.001518,9
4,25.30388,0.291843,2.928924,0.317169,25,,entropy,"{'n_estimators': 25, 'max_depth': None, 'crite...",0.94535,0.946045,0.945698,0.000347,1
5,45.235703,0.558505,5.684596,0.019284,100,10.0,gini,"{'n_estimators': 100, 'max_depth': 10, 'criter...",0.786613,0.788235,0.787424,0.000811,5
6,14.603195,0.141339,2.347,0.057625,50,5.0,entropy,"{'n_estimators': 50, 'max_depth': 5, 'criterio...",0.697604,0.7047,0.701152,0.003548,7
7,15.412894,0.108927,2.387153,0.015919,50,6.0,gini,"{'n_estimators': 50, 'max_depth': 6, 'criterio...",0.712382,0.716419,0.714401,0.002019,6
8,6.48721,0.035751,1.356995,0.040561,25,5.0,gini,"{'n_estimators': 25, 'max_depth': 5, 'criterio...",0.689339,0.697527,0.693433,0.004094,10
9,87.118874,0.327109,7.386545,1.002827,150,10.0,entropy,"{'n_estimators': 150, 'max_depth': 10, 'criter...",0.795286,0.795846,0.795566,0.00028,3


In [0]:
rfc_best = random_search.best_estimator_
print(random_search.best_score_)
print(random_search.best_params_)

rfc_worst = random_search.cv_results_['params'][1]
print(rfc_worst)

0.9456977751921479
{'n_estimators': 25, 'max_depth': None, 'criterion': 'entropy'}
{'n_estimators': 100, 'max_depth': 10, 'criterion': 'entropy'}


We are having better metrics on the model with default params.

##Decision Tree

In [0]:
from sklearn import tree

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [0]:
n = 5

xCol = xTrain.columns[0:10]
print(xCol)
model = tree.DecisionTreeClassifier(criterion = 'gini')
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')
Num Features: 5
Selected Features: [ True False False  True False  True  True False False  True]
Feature Ranking: [1 4 6 1 3 1 1 2 5 1]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect', 'Slope', 'Standing_Distance_To_Water_Source', 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
decision_tree = tree.DecisionTreeClassifier(criterion = 'gini')

start = time()
decision_tree.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = decision_tree.predict(xTrain)

start = time()
yTestPred = decision_tree.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))

print('The accuracy of the Decision Tree Classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Decision Tree classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  3
Total testing time is  0
The accuracy of the Decision Tree Classifier on training data is 1.00
The accuracy of the Decision Tree classifier on test data is 0.93


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     31159
           1       0.94      0.94      0.94     41693
           2       0.92      0.91      0.92      4061
           3       0.82      0.78      0.80       256
           4       0.83      0.80      0.82      1333
           5       0.87      0.88      0.87      2238
           6       0.94      0.94      0.94      2627

    accuracy                           0.93     83367
   macro avg       0.89      0.88      0.89     83367
weighted avg       0.93      0.93      0.93     83367



###Hyperparameter Tuning

In [0]:
est = tree.DecisionTreeClassifier(random_state=0)

depth = [5,6,10,None]

# Specify parameters and distributions to sample from
parameters = {
    "criterion": ['gini', 'entropy'],
    "max_depth": depth
}

random_search = RandomizedSearchCV(estimator = est,
                           param_distributions = parameters,
                           cv=2,
                           verbose = 2,
                           n_jobs = -1)

random_search = random_search.fit(xTrain, yTrain)
cv_results = pd.DataFrame.from_dict(random_search.cv_results_)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:   14.4s finished


In [0]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.768446,0.001667,0.133769,0.002845,5.0,gini,"{'max_depth': 5, 'criterion': 'gini'}",0.711788,0.716665,0.714227,0.002438,7
1,0.871851,0.00395,0.140527,0.002993,6.0,gini,"{'max_depth': 6, 'criterion': 'gini'}",0.73561,0.729026,0.732318,0.003292,5
2,1.34503,0.009349,0.145312,0.002057,10.0,gini,"{'max_depth': 10, 'criterion': 'gini'}",0.788917,0.783623,0.78627,0.002647,3
3,2.209194,0.004697,0.181654,0.000507,,gini,"{'max_depth': None, 'criterion': 'gini'}",0.907842,0.907492,0.907667,0.000175,2
4,0.808599,0.001014,0.144464,0.009599,5.0,entropy,"{'max_depth': 5, 'criterion': 'entropy'}",0.705485,0.710374,0.707929,0.002444,8
5,0.981163,0.003516,0.136265,0.001657,6.0,entropy,"{'max_depth': 6, 'criterion': 'entropy'}",0.723327,0.720018,0.721673,0.001655,6
6,1.617321,0.004974,0.145203,0.000752,10.0,entropy,"{'max_depth': 10, 'criterion': 'entropy'}",0.777599,0.778357,0.777978,0.000379,4
7,2.73438,0.034026,0.146216,0.020541,,entropy,"{'max_depth': None, 'criterion': 'entropy'}",0.914709,0.914887,0.914798,8.9e-05,1


In [0]:
rfc_best = random_search.best_estimator_
print(random_search.best_score_)
print(random_search.best_params_)

rfc_worst = random_search.cv_results_['params'][1]
print(rfc_worst)

0.9147981659354599
{'max_depth': None, 'criterion': 'entropy'}
{'max_depth': 6, 'criterion': 'gini'}


We are having better metrics on the model with default params.

##KNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###Feature Elimination

In [0]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [0]:
from sklearn.linear_model import LogisticRegression

n = 5

xCol = xTrain.columns[0:10]

model = LogisticRegression(n_jobs=-1)
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


Num Features: 5
Selected Features: [ True False False  True False  True False  True  True False]
Feature Ranking: [1 6 2 1 4 1 3 1 1 5]


As KNN doesn't uses parameters which can define feature importance, we have use Logistic Regression for RFE.

In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect',
 'Standing_Distance_To_Water_Source',
 'Shadow_In_Morning',
 'Distance_To_Fire']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
knn = KNeighborsClassifier(n_neighbors = 7, p = 2, metric='minkowski', n_jobs=-1)

start = time()
knn.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = knn.predict(xTrain)

start = time()
yTestPred = knn.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))
print('The accuracy of the Logistic Regression classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Logistic Regression classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  18
Total testing time is  12
The accuracy of the Logistic Regression classifier on training data is 0.91
The accuracy of the Logistic Regression classifier on test data is 0.88


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87     31159
           1       0.88      0.91      0.90     41693
           2       0.85      0.86      0.86      4061
           3       0.78      0.61      0.68       256
           4       0.77      0.59      0.67      1333
           5       0.77      0.75      0.76      2238
           6       0.89      0.87      0.88      2627

    accuracy                           0.88     83367
   macro avg       0.83      0.78      0.80     83367
weighted avg       0.88      0.88      0.88     83367



###Hyperparameter Tuning

In [0]:
est = KNeighborsClassifier(n_jobs = -1)

# Specify parameters and distributions to sample from
parameters = {
    "n_neighbors" : [3,7,101],
    "p": [1,2],
}

random_search = RandomizedSearchCV(estimator = est,
                           param_distributions = parameters,
                           cv=2,
                           verbose = 2,
                           n_jobs = -1)

random_search = random_search.fit(xTrain, yTrain)
cv_results = pd.DataFrame.from_dict(random_search.cv_results_)

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.4min finished


In [0]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_p,param_n_neighbors,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,8.151367,0.039788,52.109726,0.721,1,3,"{'p': 1, 'n_neighbors': 3}",0.868702,0.868909,0.868806,0.000104,1
1,8.050981,0.34203,32.622793,0.17261,2,3,"{'p': 2, 'n_neighbors': 3}",0.861283,0.861286,0.861285,2e-06,3
2,7.410452,0.230301,62.355628,0.101446,1,7,"{'p': 1, 'n_neighbors': 7}",0.862752,0.864237,0.863495,0.000742,2
3,7.600647,0.3772,38.78488,0.19244,2,7,"{'p': 2, 'n_neighbors': 7}",0.852862,0.854437,0.85365,0.000787,4
4,7.438841,0.203605,129.521923,0.617373,1,101,"{'p': 1, 'n_neighbors': 101}",0.791819,0.790196,0.791008,0.000812,5
5,8.340291,1.378098,77.98765,1.996786,2,101,"{'p': 2, 'n_neighbors': 101}",0.782475,0.780612,0.781544,0.000932,6


In [0]:
knn_best = random_search.best_estimator_
print(random_search.best_score_)
print(random_search.best_params_)

knn_worst = random_search.cv_results_['params'][1]
print(knn_worst)

0.8688056089508107
{'p': 1, 'n_neighbors': 3}
{'p': 2, 'n_neighbors': 3}


We are having better metrics on the model with default params.

##Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###One-Hot Encoding

In [0]:
xTrain = pd.get_dummies(xTrain, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTest = pd.get_dummies(xTest, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTrain, xTest = xTrain.align(xTest, join = 'left', axis = 1)

###Feature Elimination

In [0]:
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [0]:
n = 5

xCol = xTrain.columns[0:10]

model = LogisticRegression()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True False False  True False  True False  True  True False]
Feature Ranking: [1 6 2 1 4 1 3 1 1 5]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Aspect',
 'Standing_Distance_To_Water_Source',
 'Shadow_In_Morning',
 'Distance_To_Fire']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
logReg = LogisticRegression(n_jobs = -1)

start = time()
logReg.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = logReg.predict(xTrain)

start = time()
yTestPred = logReg.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))

print('The accuracy of the Logistic Regression classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the Logistic Regression classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Total training time is  30
Total testing time is  0
The accuracy of the Logistic Regression classifier on training data is 0.72
The accuracy of the Logistic Regression classifier on test data is 0.72


Got a lower accuracy of (68%) using class_weight.

In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.71      0.70      0.70     31159
           1       0.74      0.80      0.77     41693
           2       0.59      0.77      0.67      4061
           3       0.76      0.10      0.17       256
           4       0.22      0.01      0.02      1333
           5       0.43      0.18      0.25      2238
           6       0.79      0.47      0.59      2627

    accuracy                           0.72     83367
   macro avg       0.60      0.43      0.45     83367
weighted avg       0.71      0.72      0.71     83367



###Hyperparameter Tuning

In [0]:
est = LogisticRegression(random_state = 0)

# Specify parameters and distributions to sample from
parameters = {
    "penalty": ['l2'],
    "C": range(2,10),
    "solver": ['newton-cg', 'sag', 'saga','lbfgs'],
    "multi_class" : ['multinomial'],
    "max_iter" : [10, 50, 100, 200]
}

random_search = RandomizedSearchCV(estimator = est,
                           param_distributions = parameters,
                           cv=2,
                           verbose = 2,
                           n_jobs = -1)

random_search = random_search.fit(xTrain, yTrain)
cv_results = pd.DataFrame.from_dict(random_search.cv_results_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.5min finished


In [0]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_solver,param_penalty,param_multi_class,param_max_iter,param_C,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,31.241351,0.123836,0.150285,0.012798,sag,l2,multinomial,50,8,"{'solver': 'sag', 'penalty': 'l2', 'multi_clas...",0.723255,0.725278,0.724267,0.001011,5
1,7.926055,0.015881,0.149562,0.02687,saga,l2,multinomial,10,6,"{'solver': 'saga', 'penalty': 'l2', 'multi_cla...",0.723243,0.725584,0.724414,0.00117,4
2,90.139128,0.479549,0.119424,0.001312,saga,l2,multinomial,200,6,"{'solver': 'saga', 'penalty': 'l2', 'multi_cla...",0.723657,0.72538,0.724518,0.000861,2
3,47.723887,0.090245,0.151507,0.003153,sag,l2,multinomial,200,8,"{'solver': 'sag', 'penalty': 'l2', 'multi_clas...",0.723723,0.725404,0.724563,0.00084,1
4,3.187191,0.019574,0.144021,0.020259,lbfgs,l2,multinomial,10,9,"{'solver': 'lbfgs', 'penalty': 'l2', 'multi_cl...",0.689093,0.688428,0.688761,0.000333,9
5,320.641022,12.791149,0.142388,0.020665,newton-cg,l2,multinomial,50,7,"{'solver': 'newton-cg', 'penalty': 'l2', 'mult...",0.723663,0.725362,0.724512,0.000849,3
6,9.717816,0.14051,0.153886,0.008659,sag,l2,multinomial,10,5,"{'solver': 'sag', 'penalty': 'l2', 'multi_clas...",0.719627,0.713114,0.716371,0.003256,7
7,7.698993,0.352782,0.137082,0.017417,sag,l2,multinomial,10,8,"{'solver': 'sag', 'penalty': 'l2', 'multi_clas...",0.721696,0.666405,0.694051,0.027646,8
8,2.757857,0.064601,0.136798,0.00373,lbfgs,l2,multinomial,10,8,"{'solver': 'lbfgs', 'penalty': 'l2', 'multi_cl...",0.689087,0.688428,0.688758,0.00033,10
9,45.690735,0.306119,0.115718,0.029876,saga,l2,multinomial,100,2,"{'solver': 'saga', 'penalty': 'l2', 'multi_cla...",0.723171,0.724864,0.724018,0.000846,6


In [0]:
logReg_best = random_search.best_estimator_
print(random_search.best_score_)
print(random_search.best_params_)

logReg_worst = random_search.cv_results_['params'][1]
print(logReg_worst)

0.7245634500565273
{'solver': 'sag', 'penalty': 'l2', 'multi_class': 'multinomial', 'max_iter': 200, 'C': 8}
{'solver': 'saga', 'penalty': 'l2', 'multi_class': 'multinomial', 'max_iter': 10, 'C': 6}


We are having similar metrics on the model with best params.

##XGBoost

In [0]:
import xgboost as xgb

###Separating the data into target and predictor features.

In [0]:
yTest = test['Cat_Plant_Type']
xTest = test.drop('Cat_Plant_Type', axis=1)

In [0]:
yTrain = train['Cat_Plant_Type']
xTrain = train.drop('Cat_Plant_Type', axis=1)

###One-Hot Encoding

In [0]:
xTrain = pd.get_dummies(xTrain, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTest = pd.get_dummies(xTest, prefix_sep = '__', columns = ['Turf','Neighbourhood_Type'])
xTrain, xTest = xTrain.align(xTest, join = 'left', axis = 1)

###Feature Elimination

In [0]:
n = 5

xCol = xTrain.columns[0:10]

model = xgb.XGBClassifier()
rfe = RFE(model, n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
ranks = list(fit.ranking_)

Num Features: 5
Selected Features: [ True  True False  True False  True False  True False False]
Feature Ranking: [1 1 6 1 3 1 5 1 4 2]


In [0]:
dropCols = []

for i in range(len(ranks)):
  if ranks[i]>2:
    dropCols.append(xCol[i])
    
dropCols

['Slope',
 'Standing_Distance_To_Water_Source',
 'Shadow_In_Morning',
 'Shadow_In_Evening']

In [0]:
xTest.drop(dropCols, axis=1, inplace= True)
xTrain.drop(dropCols, axis=1, inplace= True)

###Training Model and Metrics

In [0]:
col = xTrain.columns[10:]
col

Index(['Turf__5', 'Turf__6', 'Turf__7', 'Turf__8', 'Turf__9', 'Turf__10',
       'Turf__11', 'Turf__12', 'Turf__13', 'Turf__14', 'Turf__16', 'Turf__17',
       'Turf__18', 'Turf__19', 'Turf__20', 'Turf__21', 'Turf__22', 'Turf__23',
       'Turf__24', 'Turf__25', 'Turf__26', 'Turf__27', 'Turf__28', 'Turf__29',
       'Turf__30', 'Turf__31', 'Turf__32', 'Turf__33', 'Turf__34', 'Turf__35',
       'Turf__36', 'Turf__37', 'Turf__38', 'Turf__39', 'Turf__40',
       'Neighbourhood_Type__1', 'Neighbourhood_Type__2',
       'Neighbourhood_Type__3', 'Neighbourhood_Type__4'],
      dtype='object')

In [0]:
xTrain[col] = xTrain[col].astype(bool)
xTest[col] = xTest[col].astype(bool)

In [0]:
XGB = xgb.XGBClassifier()

start = time()
XGB.fit(xTrain, yTrain)
end = time()

print('Total training time is ', round(end-start))

yTrainPred = XGB.predict(xTrain)

start = time()
yTestPred = XGB.predict(xTest)
end = time()
print('Total testing time is ', round(end-start))
print('The accuracy of the XGB classifier on training data is {:.2f}'.format(accuracy_score(yTrain, yTrainPred)))
print('The accuracy of the XGB classifier on test data is {:.2f}'.format(accuracy_score(yTest,yTestPred)))

Total training time is  393
Total testing time is  2
The accuracy of the XGB classifier on training data is 0.74
The accuracy of the XGB classifier on test data is 0.74


In [0]:
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73     31159
           1       0.76      0.82      0.79     41693
           2       0.69      0.78      0.73      4061
           3       0.78      0.59      0.67       256
           4       0.97      0.02      0.05      1333
           5       0.56      0.25      0.34      2238
           6       0.82      0.46      0.59      2627

    accuracy                           0.74     83367
   macro avg       0.76      0.52      0.56     83367
weighted avg       0.75      0.74      0.73     83367

