# Importing relevant libraries

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Loading Dataset

In [0]:
# Project root on the mounted Google Drive; the CSV files are read relative to it.
PATH = '/content/drive/My Drive/Colab Notebooks/Capstone Project/'

In [3]:
# Mount Google Drive at /content/drive so the paths under PATH become readable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Read the train and test CSV files from the project's Data folder.
train, test = (
    pd.read_csv(PATH + relative_csv)
    for relative_csv in ('Data/capstone_train.csv', 'Data/capstone_test.csv')
)

In [5]:
# (rows, columns) of the raw training frame.
train.shape

(464809, 13)

# Label Encoding on "Plant_Type"

In [0]:
from sklearn import preprocessing

# Learn the Plant_Type -> integer mapping from the training data only and reuse
# that same mapping for the test data. Re-fitting the encoder on test can assign
# different integers to the same label whenever the two files do not contain
# exactly the same set of classes, which would silently corrupt the evaluation.
label_encoder = preprocessing.LabelEncoder()
train['Cat_Plant_Type'] = label_encoder.fit_transform(train['Plant_Type'])
# transform() raises ValueError for a label never seen in train, surfacing the
# mismatch instead of mis-encoding it.
test['Cat_Plant_Type'] = label_encoder.transform(test['Plant_Type'])

In [0]:
# The string label is now redundant with Cat_Plant_Type; remove it from both frames.
for frame in (train, test):
  frame.drop(columns='Plant_Type', inplace=True)

# Training a Linear Regression Model on all features to impute "Shadow_In_Midday"

In [0]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics

In [0]:
# Rows whose "Shadow_In_Midday" is known; these are used to fit the imputation model.
midday_known = train['Shadow_In_Midday'].notna()
trainWithoutNan = train[midday_known]

In [0]:
# Target: the known "Shadow_In_Midday" values; predictors: every other column, as raw arrays.
# NOTE(review): the predictors include Cat_Plant_Type, the label the classifier
# later predicts — confirm that using it for imputation is intended.
y = trainWithoutNan['Shadow_In_Midday'].values
x = trainWithoutNan.drop(columns='Shadow_In_Midday').values

In [0]:
# 80/20 split with a fixed seed so the regression scores are reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, random_state=5, test_size=.2)

In [0]:
# Linear regression model used to predict the missing "Shadow_In_Midday" values.
reg = linear_model.LinearRegression()

In [13]:
# Fit on the training part of the split of rows with a known "Shadow_In_Midday".
reg.fit(xTrain, yTrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# score() on a regressor returns R^2 on the given data; here the held-out split.
print('Variance score: ', reg.score(xTest, yTest))

Variance score:  0.9787326800157982


In [0]:
# Predictions on both parts of the split, used to compare train and test fit quality.
yTrainPred, yTestPred = reg.predict(xTrain), reg.predict(xTest)

In [16]:
# Report R^2 for the training part first, then the held-out part.
for caption, actual, predicted in (('r2 score on Train: ', yTrain, yTrainPred),
                                   ('r2 score on Test: ', yTest, yTestPred)):
  print(caption, metrics.r2_score(actual, predicted))

r2 score on Train:  0.9742834615935139
r2 score on Test:  0.9787326800157982


# Imputing "Shadow_In_Midday"

## On Train

In [0]:
# Training rows with a missing "Shadow_In_Midday", and the predictor columns for them.
train_midday_missing = train['Shadow_In_Midday'].isnull()
trainWithNan = train[train_midday_missing]
inputFeatures = trainWithNan.drop(columns='Shadow_In_Midday')

In [0]:
# Predicted "Shadow_In_Midday" for the training rows where it is missing.
targetPred = reg.predict(inputFeatures)

In [19]:
# Write the rounded predictions into the missing slots with one .loc assignment.
# The chained form train[col][mask] = ... may write to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
missing_in_train = train['Shadow_In_Midday'].isnull()
train.loc[missing_in_train, 'Shadow_In_Midday'] = targetPred.round().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
# Shape check after imputation: the row and column counts should be unchanged.
train.shape

(464809, 13)

## On Test

In [0]:
# Test rows with a missing "Shadow_In_Midday", and the predictor columns for them.
test_midday_missing = test['Shadow_In_Midday'].isnull()
testWithNan = test[test_midday_missing]
inputFeatures = testWithNan.drop(columns='Shadow_In_Midday')

In [0]:
# Predicted "Shadow_In_Midday" for the test rows where it is missing (model fitted on train).
targetPred = reg.predict(inputFeatures)

In [23]:
# Write the rounded predictions into the missing slots with one .loc assignment.
# The chained form test[col][mask] = ... may write to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
missing_in_test = test['Shadow_In_Midday'].isnull()
test.loc[missing_in_test, 'Shadow_In_Midday'] = targetPred.round().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Transformations on Test and Train data

In [24]:
# Absolute skewness of every column except the last three, largest first.
skew_candidates = train.columns[:-3]
skew = train[skew_candidates].skew().abs().sort_values(ascending = False)
skew

Standing_Distance_To_Water_Source    1.793613
Distance_To_Fire                     1.288951
Shadow_In_Morning                    1.181143
Distance_To_Water_Source             1.140487
Shadow_In_Midday                     1.128504
Heigh_From_Sea_Level                 0.817019
Slope                                0.789069
Distance_To_Road                     0.711927
Aspect                               0.403099
Shadow_In_Evening                    0.274835
dtype: float64

In [0]:
# Column names ordered by decreasing absolute skew.
cols = skew.index

In [0]:
# One skew-reducing transform per column, declared once and applied to both
# frames so train and test always receive exactly the same treatment.
skew_fixes = {
    'Distance_To_Fire': np.cbrt,
    'Shadow_In_Morning': lambda values: values**3,
    'Distance_To_Water_Source': lambda values: np.sqrt(values+1000),
    'Shadow_In_Midday': lambda values: values**3,
    'Heigh_From_Sea_Level': lambda values: values**2,
    'Slope': np.sqrt,
    'Distance_To_Road': np.sqrt,
    'Aspect': np.sqrt,
}

# Train first, then test; each column is overwritten with its transformed values.
for frame in (train, test):
  for column, fix in skew_fixes.items():
    frame[column] = fix(frame[column])

In [27]:
# Summary statistics of the training frame after the transformations.
train.describe()

Unnamed: 0,Heigh_From_Sea_Level,Aspect,Slope,Distance_To_Water_Source,Standing_Distance_To_Water_Source,Distance_To_Road,Shadow_In_Morning,Shadow_In_Midday,Shadow_In_Evening,Distance_To_Fire,Turf,Neighbourhood_Type,Cat_Plant_Type
count,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0
mean,8836818.0,11.466793,3.615718,35.510599,46.390412,45.604191,9982988.0,11388250.0,142.518338,11.946007,24.370455,2.113563,1.051563
std,1596067.0,4.916701,1.013088,2.878893,58.240912,16.493568,3231641.0,2727363.0,38.27367,2.765204,9.478907,1.061051,1.397023
min,3455881.0,0.0,0.0,31.622777,-173.0,0.0,0.0,-216000.0,0.0,0.0,1.0,1.0,0.0
25%,7890481.0,7.615773,3.0,33.286634,7.0,33.256578,7762392.0,9663597.0,119.0,10.079368,20.0,1.0,0.0
50%,8976016.0,11.269428,3.605551,34.899857,30.0,44.698993,10360230.0,11543180.0,143.0,11.958188,29.0,2.0,1.0
75%,10004570.0,16.155494,4.242641,37.20215,69.0,57.714816,12326390.0,13312050.0,168.0,13.661972,31.0,3.0,1.0
max,14884160.0,18.973666,8.062258,48.887626,601.0,84.362314,16387060.0,16777220.0,254.0,19.28562,40.0,4.0,6.0


In [28]:
# Summary statistics of the test frame after the transformations.
test.describe()

Unnamed: 0,Heigh_From_Sea_Level,Aspect,Slope,Distance_To_Water_Source,Standing_Distance_To_Water_Source,Distance_To_Road,Shadow_In_Morning,Shadow_In_Midday,Shadow_In_Evening,Distance_To_Fire,Turf,Neighbourhood_Type,Cat_Plant_Type
count,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0
mean,8833901.0,11.465836,3.618493,35.519595,46.532628,45.524876,9974360.0,11391830.0,142.567963,11.948767,24.330396,2.118061,1.0511
std,1601082.0,4.916696,1.012916,2.887372,58.512119,16.465212,3226332.0,2740602.0,38.278104,2.754328,9.511329,1.062266,1.394434
min,3500641.0,0.0,0.0,31.622777,-166.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,7884864.0,7.615773,3.0,33.286634,7.0,33.18132,7762392.0,9663597.0,119.0,10.082648,20.0,1.0,0.0
50%,8982009.0,11.269428,3.605551,34.899857,29.0,44.620623,10360230.0,11543180.0,143.0,11.960518,29.0,3.0,1.0
75%,10010900.0,16.124515,4.242641,37.282704,69.0,57.576037,12326390.0,13312050.0,168.0,13.665543,31.0,3.0,1.0
max,14884160.0,18.973666,8.124038,48.959167,597.0,84.356387,16387060.0,16581380.0,252.0,19.260494,40.0,4.0,6.0


# Outlier Removal on Train Data

In [0]:
# Keep an independent snapshot of the training frame taken before outlier removal.
trainCopy = train.copy(deep=True)

In [30]:
# IQR-based trimming, one column at a time. Each column's fences are computed on
# the frame as already filtered by the previous columns, and only rows strictly
# inside (q25 - 1.5*IQR, q75 + 1.5*IQR) are kept.
for col in cols:
  lower_quartile, upper_quartile = train[col].quantile([.25, .75])

  IQR = upper_quartile - lower_quartile
  min_ = lower_quartile - 1.5*IQR
  max_ = upper_quartile + 1.5*IQR

  numRec = len(train)

  inside_fences = (train[col] > min_) & (train[col] < max_)
  train = train[inside_fences]

  print("Data Loss (%) in ", col, " is ", 100 - train.shape[0]*100/numRec, '; IQR is ', IQR)

Data Loss (%) in  Standing_Distance_To_Water_Source  is  5.486554692357501 ; IQR is  62.0
Data Loss (%) in  Distance_To_Fire  is  0.3630718381450748 ; IQR is  3.608116334747592
Data Loss (%) in  Shadow_In_Morning  is  0.22640457652519785 ; IQR is  4445792.0
Data Loss (%) in  Distance_To_Water_Source  is  1.1013896744145626 ; IQR is  3.787393613622953
Data Loss (%) in  Shadow_In_Midday  is  1.2414594673439723 ; IQR is  3511709.0
Data Loss (%) in  Heigh_From_Sea_Level  is  1.2221339166191996 ; IQR is  2093013.0
Data Loss (%) in  Slope  is  0.14050543983898933 ; IQR is  1.4142135623730945
Data Loss (%) in  Distance_To_Road  is  0.0 ; IQR is  24.407502574379635
Data Loss (%) in  Aspect  is  0.0 ; IQR is  8.35007379401279
Data Loss (%) in  Shadow_In_Evening  is  0.929306181430988 ; IQR is  47.0


In [31]:
# Overall share of training rows removed, relative to the snapshot in trainCopy.
rows_before, rows_after = len(trainCopy), len(train)
print("Total Data Loss (%) : ", 100 - rows_after*100/rows_before)

Total Data Loss (%) :  10.32144386188736


# Separating into Target and Predictor Features

In [0]:
# Test set: predictors are every column except the encoded label, which is the target.
xTest = test.drop(columns='Cat_Plant_Type')
yTest = test['Cat_Plant_Type']

In [0]:
# Training set: predictors are every column except the encoded label, which is the target.
xTrain = train.drop(columns='Cat_Plant_Type')
yTrain = train['Cat_Plant_Type']

# Training Model

In [0]:
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from time import time
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

### Computing Class Weights

In [0]:
# 'balanced' weights are inversely proportional to class frequency in the
# training labels. All arguments are passed by keyword: current scikit-learn
# makes class_weight/classes/y keyword-only and rejects the positional call.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(train['Cat_Plant_Type']),
                                                 y=train['Cat_Plant_Type'])

In [36]:
# Re-key the weights by integer class label 0..n-1, the form passed to the classifier.
n_classes = len(class_weights)
class_weights = {label: class_weights[label] for label in range(n_classes)}
class_weights

{0: 0.3811102496397668,
 1: 0.28558822058383226,
 2: 2.9534626666855615,
 3: 49.01046443268665,
 4: 9.24223409680495,
 5: 5.24464631721986,
 6: 4.615028620143709}

### Feature Elimination

In [37]:
# All predictor columns currently in the training matrix.
xTrain.columns

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire', 'Turf', 'Neighbourhood_Type'],
      dtype='object')

In [38]:
# First ten predictor columns; the same slice is fed to RFE.
xTrain.columns[0:10]

Index(['Heigh_From_Sea_Level', 'Aspect', 'Slope', 'Distance_To_Water_Source',
       'Standing_Distance_To_Water_Source', 'Distance_To_Road',
       'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening',
       'Distance_To_Fire'],
      dtype='object')

In [39]:
# Number of features RFE should keep.
n = 5

# Only the first ten predictor columns take part in the elimination.
xCol = xTrain.columns[0:10]

# NOTE(review): the forest is unseeded, so the ranking can vary between runs;
# consider passing random_state for reproducibility.
model = RandomForestClassifier()
# n_features_to_select is keyword-only in current scikit-learn; the positional
# form RFE(model, n) raises a TypeError there.
rfe = RFE(model, n_features_to_select=n)
fit = rfe.fit(xTrain[xCol], yTrain)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
# Rank 1 marks a selected feature; larger ranks were eliminated earlier.
ranks = list(fit.ranking_)



Num Features: 5
Selected Features: [ True  True False False  True  True False False False  True]
Feature Ranking: [1 1 6 2 1 1 4 3 5 1]


In [40]:
# Columns whose RFE rank is worse than 2 are dropped from the model inputs.
dropCols = [column for column, rank in zip(xCol, ranks) if rank > 2]

dropCols

['Slope', 'Shadow_In_Morning', 'Shadow_In_Midday', 'Shadow_In_Evening']

In [0]:
# Remove the low-ranked columns from both predictor matrices so they stay aligned.
for predictors in (xTest, xTrain):
  predictors.drop(columns=dropCols, inplace=True)

### Training Model and Metrics

In [42]:
# Class-weighted forest of 100 trees, fitted on all available cores.
random_forest = RandomForestClassifier(n_jobs = -1, class_weight = class_weights, n_estimators = 100)

# Wall-clock time of the fit, rounded to whole seconds.
start = time()
random_forest.fit(xTrain, yTrain)
end = time()
train_seconds = round(end-start)

print('Total training time is ', train_seconds)

# Training-set predictions are only used for the accuracy comparison below.
yTrainPred = random_forest.predict(xTrain)

# Wall-clock time of predicting the test set, rounded to whole seconds.
start=time()
yTestPred = random_forest.predict(xTest)
end=time()
test_seconds = round(end-start)

print('The total testing time is ', test_seconds)

train_accuracy = accuracy_score(yTrain, yTrainPred)
print('The accuracy of the Random Forest classifier on training data is {:.2f}'.format(train_accuracy))
test_accuracy = accuracy_score(yTest,yTestPred)
print('The accuracy of the Random Forest classifier on test data is {:.2f}'.format(test_accuracy))

Total training time is  109
The total testing time is  6
The accuracy of the Random Forest classifier on training data is 1.00
The accuracy of the Random Forest classifier on test data is 0.95


In [43]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(yTest,yTestPred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     42353
           1       0.96      0.97      0.97     56596
           2       0.91      0.94      0.93      7266
           3       0.90      0.76      0.83       563
           4       0.94      0.79      0.86      1895
           5       0.91      0.84      0.87      3436
           6       0.97      0.90      0.93      4094

    accuracy                           0.95    116203
   macro avg       0.94      0.88      0.91    116203
weighted avg       0.95      0.95      0.95    116203

