Import Libaries

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score

from scipy.stats import pearsonr
from scipy import stats

import math

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as ltb

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten

import warnings
from joblib import dump, load

  from pandas import MultiIndex, Int64Index


# Training

Load Dataset

In [2]:
data = pd.read_csv("wetSSN.csv")
data.head()

Unnamed: 0,FID,Eastings,Northings,No_of_Larv,LST,NDVI,NDMI
0,0,667759.1603,1410456.418,70,25.5909,0.134806,-0.051258
1,1,667970.2328,1410159.893,79,25.389299,0.188839,-0.042621
2,2,668100.4408,1410251.599,80,26.0898,0.289951,0.026311
3,3,668075.4149,1410344.91,75,26.104,0.14367,-0.032491
4,4,668321.5382,1409718.525,75,25.640499,0.197746,-0.105824


In [3]:
data.corr()

Unnamed: 0,FID,Eastings,Northings,No_of_Larv,LST,NDVI,NDMI
FID,1.0,-0.80312,-0.740688,-0.452733,0.291489,0.126604,-0.036504
Eastings,-0.80312,1.0,0.953737,0.460898,-0.186046,-0.248028,-0.056659
Northings,-0.740688,0.953737,1.0,0.444846,-0.160568,-0.269592,-0.044405
No_of_Larv,-0.452733,0.460898,0.444846,1.0,-0.153043,-0.159321,0.019477
LST,0.291489,-0.186046,-0.160568,-0.153043,1.0,-0.197172,-0.691377
NDVI,0.126604,-0.248028,-0.269592,-0.159321,-0.197172,1.0,0.558894
NDMI,-0.036504,-0.056659,-0.044405,0.019477,-0.691377,0.558894,1.0


* Drop unecessary columns
* Split Dataset

In [4]:
target = "No_of_Larv"
y = data[target]
X = data.drop(columns = ["FID", "Eastings", "Northings", target])

# cutoff = int(len(X) * 0.8)

# X_train, y_train = X.iloc[:cutoff], y.iloc[:cutoff]
# X_test, y_test = X.iloc[cutoff:], y.iloc[cutoff:]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.20, random_state = 7)

X_train.head()

Unnamed: 0,LST,NDVI,NDMI
79,25.621401,0.242078,-0.019283
386,23.2934,0.395142,0.244493
28,26.513901,0.220413,-0.085882
41,24.960699,0.548239,0.181547
206,26.8918,0.29077,0.067642


Get Baseline

In [5]:
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", round(y_train.mean(), 2))
print("Baseline MAE:", round(mae_baseline, 2))

Mean P2 Reading: 55.1
Baseline MAE: 28.58


FOREST MODEL

In [6]:
ForestModel = RandomForestRegressor(criterion = "absolute_error", n_estimators = 250, min_samples_split = 8, min_samples_leaf = 2)
ForestModel.fit(X_train, y_train)

y_pred = ForestModel.predict(X_test)


print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print ('R.Sqd is:', r2_score(y_test, y_pred))
#print ('Cor is:', math.sqrt(r2_score(y_test, y_pred)))

MAE is: 23.598652173913045
MSE is: 894.6939618260869
RMSE is: 29.911435302005938
R.Sqd is: -0.2634164783581221


GBoost

In [7]:
GB = GradientBoostingRegressor(n_estimators = 300, learning_rate = 1.0,
    max_depth = 1, random_state = 0).fit(X_train, y_train)
GB.fit(X_train, y_train)
y_pred = GB.predict(X_test)

print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE is: 27.25617178097075
MSE is: 1310.8002926181184
RMSE is: 36.20497607537006


XGBoost

In [8]:
xg_reg = XGBRegressor()
xg_reg.fit(X_train, y_train)

y_pred = xg_reg.predict(X_test)

print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print ('R.Sqd is:', r2_score(y_test, y_pred))
#print ('Cor is:', math.sqrt(r2_score(y_test, y_pred)))

MAE is: 28.639390380486198
MSE is: 1287.4874684166073
RMSE is: 35.88157561223597
R.Sqd is: -0.8180885897085255


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


CatBoost

In [9]:
model = CatBoostRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print ('R.Sqd is:', r2_score(y_test, y_pred))
#print ('Cor is:', math.sqrt(r2_score(y_test, y_pred)))

Learning rate set to 0.034961
0:	learn: 39.9106342	total: 139ms	remaining: 2m 18s
1:	learn: 39.7130897	total: 144ms	remaining: 1m 11s
2:	learn: 39.5869287	total: 148ms	remaining: 49.1s
3:	learn: 39.4653625	total: 152ms	remaining: 37.8s
4:	learn: 39.3653193	total: 156ms	remaining: 31s
5:	learn: 39.2347301	total: 160ms	remaining: 26.5s
6:	learn: 39.1351710	total: 164ms	remaining: 23.2s
7:	learn: 38.9976189	total: 168ms	remaining: 20.8s
8:	learn: 38.9017346	total: 172ms	remaining: 18.9s
9:	learn: 38.7734790	total: 176ms	remaining: 17.4s
10:	learn: 38.6462686	total: 180ms	remaining: 16.2s
11:	learn: 38.5572190	total: 181ms	remaining: 14.9s
12:	learn: 38.4136819	total: 182ms	remaining: 13.8s
13:	learn: 38.2994495	total: 183ms	remaining: 12.9s
14:	learn: 38.1892669	total: 184ms	remaining: 12.1s
15:	learn: 38.0791012	total: 185ms	remaining: 11.3s
16:	learn: 37.9918744	total: 185ms	remaining: 10.7s
17:	learn: 37.9017575	total: 186ms	remaining: 10.2s
18:	learn: 37.7931556	total: 187ms	remaining

Light Boosting

In [10]:
ltbmodel = ltb.LGBMRegressor()
ltbmodel.fit(X_train, y_train)

y_pred = ltbmodel.predict(X_test)

print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print ('R.Sqd is:', r2_score(y_test, y_pred))
#print ('Cor is:', math.sqrt(r2_score(y_test, y_pred)))

MAE is: 26.82407095984951
MSE is: 1296.82622928761
RMSE is: 36.01147357839734
R.Sqd is: -0.8312760536630035


warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category = DeprecationWarning)


NN_model = Sequential()

NN_model.add(Dense(128, kernel_initializer = 'normal', input_dim = X.shape[1], activation = 'relu'))

NN_model.add(Dense(256, kernel_initializer = 'normal', activation = 'relu'))
NN_model.add(Dense(256, kernel_initializer = 'normal', activation = 'relu'))
NN_model.add(Dense(256, kernel_initializer = 'normal', activation = 'relu'))

NN_model.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))

NN_model.summary()

NN_model.compile(loss = 'mean_absolute_error', optimizer = 'adam', metrics = ['mean_absolute_error'])

checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor = 'val_loss', verbose = 1, save_best_only = True, mode = 'auto')
callbacks_list = [checkpoint]

NN_model.fit(X_train, y_train, epochs = 700, batch_size = 32, validation_split = 0.2, callbacks = callbacks_list)

y_pred = NN_model.predict(X_test)

print ('MAE is:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE is:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE is:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print ('R.Sqd is:', r2_score(y_test, y_pred))
#print ('Cor is:', math.sqrt(r2_score(y_test, y_pred)))

# Prediction with best model

Load dataset

In [11]:
test = pd.read_csv("drySSN.csv")
test.head(4)

Unnamed: 0,FID,Eastings,Northings,LST,NDVI,NDMI
0,0,667759.1603,1410456.418,31.897301,0.064563,-0.080691
1,1,667970.2328,1410159.893,32.245602,0.173336,-0.09512
2,2,668100.4408,1410251.599,31.908501,0.096174,-0.063759
3,3,668075.4149,1410344.91,31.6427,0.174357,-0.080416


* Saving Coordinates
* Dropping unecessary columns

In [12]:
Easting = test.Eastings
Northing = test.Northings

test = test.drop(['Eastings', 'Northings', 'FID'], axis = 1)
test.head(4)

Unnamed: 0,LST,NDVI,NDMI
0,31.897301,0.064563,-0.080691
1,32.245602,0.173336,-0.09512
2,31.908501,0.096174,-0.063759
3,31.6427,0.174357,-0.080416


tst = test.values
print(tst.shape)

In [13]:
NoL = ForestModel.predict(test)
NoL

array([52.256, 47.618, 57.058, 49.04 , 50.914, 50.922, 53.606, 25.868,
       47.448, 50.914, 50.664, 50.182, 49.714, 50.914, 50.914, 50.182,
       49.464, 57.058, 44.37 , 50.672, 50.004, 50.914, 50.914, 50.914,
       50.914, 50.664, 50.664, 51.418, 50.914, 47.726, 50.272, 27.524,
       46.06 , 49.714, 49.714, 50.914, 57.804, 50.664, 50.914, 50.914,
       32.462, 50.664, 40.148, 67.558, 37.508, 30.434, 57.08 , 50.914,
       50.922, 60.07 , 30.05 , 30.148, 30.022, 50.914, 53.54 , 54.   ,
       49.464, 50.914, 50.914, 50.914, 57.258, 40.776, 50.672, 50.914,
       27.75 , 50.15 , 47.092, 50.914, 52.622, 50.914, 49.648, 50.914,
       50.914, 50.298, 50.004, 27.868, 57.734, 45.66 , 50.914, 25.208,
       49.714, 34.852, 41.312, 32.998, 50.672, 50.914, 71.69 , 50.914,
       50.914, 32.55 , 57.516, 47.238, 49.714, 49.47 , 52.15 , 49.464,
       50.914, 50.914, 41.66 , 29.714, 59.222, 67.816, 30.196, 57.996,
       31.524, 57.542, 53.122, 50.672, 54.842, 53.258, 32.448, 34.298,
      

In [14]:
stats.describe(NoL)

DescribeResult(nobs=460, minmax=(17.878, 78.332), mean=46.941, variance=88.82047838779957, skewness=-1.040713498270715, kurtosis=0.9262401863376972)

Adding Coordinates back to the predicted value

In [15]:
Easting = Easting.values
Northing = Northing.values

txt = {'Eastings' : Easting, 'Northings' : Northing, 'No_of_Larv' : NoL}
txt = pd.DataFrame(txt)
txt.describe()

Unnamed: 0,Eastings,Northings,No_of_Larv
count,460.0,460.0,460.0
mean,644415.861669,1385394.0,46.941
std,21510.078131,26502.75,9.424462
min,597125.1607,1334935.0,17.878
25%,621721.7958,1361346.0,47.0365
50%,655559.16245,1398171.0,50.664
75%,662318.908525,1407335.0,50.914
max,668335.4245,1412135.0,78.332


Save data to csv

In [16]:
txt.to_csv("Kebbi_DSS_Pred.csv")