# Model Calibration 

<b>Import packages and set working directory</b>

In [1]:
import sys
import os
import numpy as np
import pandas as pd
from time import time
#from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LassoCV
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import seaborn as sb
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Directory holding the prepared CSV inputs. It can be overridden through the
# HEATWAVES_DATA_DIR environment variable so the notebook is not tied to one
# machine; the original author's location remains the default.
wdir = os.environ.get(
    "HEATWAVES_DATA_DIR",
    "/Users/emmanuel_mj/Documents/GitHub/mda_heatwaves-cameroon/DataPrep/Data",
)
# The read_csv calls below use bare file names, so they resolve against this directory.
os.chdir(wdir)

<b>Get data and explore</b>

In [None]:
# Load the three pipe-delimited variants of the main data set:
# scaled, not scaled, and scaled with outliers removed (per the file names).
data_clean, data_clean1, data_clean2 = (
    pd.read_csv(csv_name, sep="|", header=0)
    for csv_name in (
        'MainData_Scaled.csv',
        'MainData_NotScaled.csv',
        'MainData_Scaled_OutliersRemoved.csv',
    )
)


In [None]:
# Preview the first five rows of the scaled data.
data_clean.head(n=5)

In [None]:
# Preview the first five rows of the unscaled data.
data_clean1.head(n=5)

In [None]:
# Keep only the label-based column range from "CDD" through "CITY_REL_WATER"
# (both ends inclusive) in each of the three loaded frames.
df_main, df_main1, df_main2 = (
    frame.loc[:, "CDD":"CITY_REL_WATER"]
    for frame in (data_clean, data_clean1, data_clean2)
)

In [None]:
# Count the missing values in each column of the scaled data.
df_main.isnull().sum()

In [None]:
# Count the missing values in each column of the unscaled data.
df_main1.isnull().sum()

In [None]:
#df_2 = df_main.drop(['URB_AREA_HINTER', 'GDP_PC_REAL_PPP','POP_TOT_GI'], axis=1)
#df_2.head()

In [None]:
# Replace every missing entry with the mean of its own column; observed values
# are kept as they are.
# NOTE(review): the means are computed on the full frames, i.e. before the
# train/test splits made further down — confirm this is intended.
df_main = df_main.where(df_main.notna(), other=df_main.mean(), axis="columns")
df_main1 = df_main1.where(df_main1.notna(), other=df_main1.mean(), axis="columns")
df_main2 = df_main2.where(df_main2.notna(), other=df_main2.mean(), axis="columns")

In [None]:
# Re-count missing values after the mean imputation.
df_main1.isnull().sum()

In [None]:
# Pairwise Pearson correlations between the columns of the unscaled data.
df_main1.corr(method="pearson")

In [None]:
# Distribution of CDD in the unscaled data.
fig = px.histogram(data_frame=data_clean1, x="CDD")
fig.show()

In [None]:
# Distribution of CDD in the scaled data with outliers removed.
fig = px.histogram(data_frame=data_clean2, x="CDD")
fig.show()

# Column-wise preprocessing: drop three columns, mean-impute five others
# (one SimpleImputer per column), and pass every remaining column through.
# NOTE(review): `pre_process` is reassigned in the "Models" section before any
# active code uses this definition — confirm whether this cell is still needed.
pre_process = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', ['GDP_PC_REAL_PPP', 'URB_AREA_HINTER', 'POP_TOT_GI']),
    ] + [
        (f'impute_{column}', SimpleImputer(strategy='mean'), [column])
        for column in (
            'FRAGMENTATION',
            'T_Y0_14_SH_NAT',
            'T_Y15_64_SH_NAT',
            'T_Y65_MAX_SH_NAT',
            'PWM_EX_CORE',
        )
    ],
    remainder='passthrough',
)

# Variable importance/selection

In [None]:
# Variable-importance inputs from the scaled data (outliers kept):
# predictors are the columns "URB_AREA".."CITY_REL_WATER", the target is "CDD".
X_1 = df_main.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_1 = df_main.loc[:, "CDD"]
# Default train/test proportions, fixed seed for a repeatable split.
X_1train, X_1test, y_1train, y_1test = train_test_split(
    X_1, y_1, random_state=0
)

In [None]:
#lasso = LassoCV().fit(pre_process.fit_transform(X_1train), y_train)
#lasso_pipeline = Pipeline(steps=[('pre_processing',pre_process),
#                                ('lasso', LassoCV(cv=5, random_state=0))
#                                 ])
#lasso_pipeline.fit(X_1train,y_2train)


In [None]:
# Names of the predictor columns, as a NumPy array.
X_1train.columns.to_numpy()

In [None]:
# Fit a cross-validated Lasso (10 folds) on the training split and use the
# absolute coefficient of each predictor as its importance.
lasso = LassoCV(cv=10, random_state=0, max_iter=10000)
lasso.fit(X_1train, y_1train)
importance = np.absolute(lasso.coef_)
feature_names = np.array(X_1train.columns)

# One bar per predictor; labels rotated so the column names stay readable.
plt.figure(figsize=(15, 8))
plt.bar(x=feature_names, height=importance)
plt.xticks(rotation=90)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
# Variable-importance inputs from the scaled data with outliers removed:
# predictors are the columns "URB_AREA".."CITY_REL_WATER", the target is "CDD".
X_2 = df_main2.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_2 = df_main2.loc[:, "CDD"]
# Default train/test proportions, fixed seed for a repeatable split.
X_2train, X_2test, y_2train, y_2test = train_test_split(
    X_2, y_2, random_state=0
)

In [None]:
# Same cross-validated Lasso as above, fitted on the outliers-removed training
# split; absolute coefficients serve as importances.
lasso_2 = LassoCV(cv=10, random_state=0, max_iter=10000)
lasso_2.fit(X_2train, y_2train)
importance_2 = np.absolute(lasso_2.coef_)
feature_names = np.array(X_2train.columns)

# One bar per predictor; labels rotated so the column names stay readable.
plt.figure(figsize=(15, 8))
plt.bar(x=feature_names, height=importance_2)
plt.xticks(rotation=90)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
# Calibration data, outliers kept: predictors "URB_AREA".."CITY_REL_WATER", target "CDD".
X = df_main.loc[:, "URB_AREA":"CITY_REL_WATER"]
y = df_main.loc[:, "CDD"]
rng = np.random.RandomState(seed=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# Calibration data, outliers removed.
# NOTE(review): y_1 / X_1 / X_1train / X_1test / y_1train / y_1test were bound
# to the outliers-kept frame in the variable-importance section; from here on
# they refer to df_main2, so re-running earlier cells out of order uses
# different data.
X_1 = df_main2.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_1 = df_main2.loc[:, "CDD"]
# Fresh generator with the same seed as the split above.
rng = np.random.RandomState(seed=0)
X_1train, X_1test, y_1train, y_1test = train_test_split(X_1, y_1, random_state=rng)

# Models

In [None]:
# Preprocessing for the outliers-kept models: drop four predictors and pass
# every other column through unchanged.
pre_process = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', ['T_Y0_14_SH_NAT', 'URB_AREA', 'T_Y65_MAX_SH_NAT', 'POP_DEN']),
    ],
    remainder='passthrough',
)

In [None]:
# Preprocessing for the outliers-removed models: drop five predictors and pass
# every other column through unchanged.
pre_process_2 = ColumnTransformer(
    transformers=[
        (
            'drop_columns',
            'drop',
            [
                'T_Y0_14_SH_NAT',
                'URB_AREA',
                'T_Y15_64_SH_NAT',
                'CITY_REL_ROADS',
                'TREECOVER_SHARE_CORE',
            ],
        ),
    ],
    remainder='passthrough',
)

In [None]:
# Principal-component regression on the outliers-kept training data.
# The retained predictors are projected onto their principal components
# (PCA itself does not rescale the inputs; the CSV loaded here is the
# pre-scaled variant).
pca = PCA()
X_reduced = pca.fit_transform(pre_process.fit_transform(X_train))

# 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

regr = LinearRegression()
mse = []

# Baseline: a constant column as the only regressor, i.e. an intercept-only fit.
score = -model_selection.cross_val_score(
    regr,
    np.ones((X_reduced.shape[0], 1)),
    y_train,
    cv=cv,
    scoring='neg_mean_squared_error',
).mean()
mse.append(score)

# Cross-validated MSE using the first 1, 2, ..., 5 principal components.
for i in range(1, 6):
    score = -model_selection.cross_val_score(
        regr,
        X_reduced[:, :i],
        y_train,
        cv=cv,
        scoring='neg_mean_squared_error',
    ).mean()
    mse.append(score)

# Position 0 on the x-axis is the intercept-only baseline.
plt.plot(mse)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('CDD')

In [None]:
# Cumulative share of variance explained by the components, in percent.
(np.round(pca.explained_variance_ratio_, decimals=4) * 100).cumsum()

In [None]:
# Principal-component regression on the outliers-removed training data.
# The retained predictors are projected onto their principal components.
pca_2 = PCA()

X_1reduced = pca_2.fit_transform(pre_process_2.fit_transform(X_1train))


# Cross-validation scheme for this data set: 10 folds, repeated 3 times.
cv_1 = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

regr_1 = LinearRegression()
mse_1 = []

# Baseline MSE with only the intercept (a constant column as sole regressor).
# Uses cv_1, this cell's own splitter, rather than `cv` from the previous cell.
score_1 = -1*model_selection.cross_val_score(regr_1,
           np.ones((len(X_1reduced),1)), y_1train, cv=cv_1,
           scoring='neg_mean_squared_error').mean()
mse_1.append(score_1)

# Cross-validated MSE, adding one principal component at a time (1..5).
for i in np.arange(1, 6):
    score_1 = -1*model_selection.cross_val_score(regr_1,
               X_1reduced[:,:i], y_1train, cv=cv_1, scoring='neg_mean_squared_error').mean()
    # Record this data set's score; the original appended `score`, a stale
    # value left over from the outliers-kept cell, which flattened the curve.
    mse_1.append(score_1)

# Plot cross-validation results; position 0 is the intercept-only baseline.
plt.plot(mse_1)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE_1')
plt.title('CDD')

In [None]:
# Cumulative share of variance explained by the components, in percent.
(np.round(pca_2.explained_variance_ratio_, decimals=4) * 100).cumsum()

In [None]:
## Candidate regressors, fitted on the data with outliers
########################################################################
model_1 = RandomForestRegressor(max_depth=15, random_state=0)
model_2 = LinearRegression(fit_intercept=True)
model_3 = Ridge(alpha=5)
model_4 = Lasso(alpha=10)
model_5 = SVR(C=2.5, epsilon=0.5)
model_6 = GradientBoostingRegressor(random_state=0)

# One entry per model, in the order listed above.
# MSE holds the square root of the mean squared error (i.e. RMSE).
# NOTE(review): both metrics are computed on the training split itself, so
# they measure in-sample fit; X_test / y_test are not used here.
MSE = []
R2 = []
for mymodels in (model_1, model_2, model_3, model_4, model_5, model_6):
    model_pipeline = Pipeline(steps=[
        ('pre_processing', pre_process),
        ('model', mymodels),
    ])
    model_pipeline.fit(X_train, y_train)
    train_pred = model_pipeline.predict(X_train)
    MSE.append(mean_squared_error(y_train, train_pred) ** 0.5)
    R2.append(r2_score(y_train, train_pred))

print(np.round(MSE, 2))
print(np.round(R2, 2))

In [None]:
# Parameters of the last pipeline fitted in the loop above, including those of its steps.
model_pipeline.get_params(deep=True)

In [None]:
#models = model_pipeline.steps[1][0]
# Estimator in the 'model' step (the second step) of the last fitted pipeline.
model_pipeline.named_steps['model']

In [None]:
## Candidate regressors, fitted on the data with outliers removed
############################################################################
model_1_1 = RandomForestRegressor(max_depth=15, random_state=0)
model_2_1 = LinearRegression(fit_intercept=True)
model_3_1 = Ridge(alpha=5)
model_4_1 = Lasso(alpha=10)
model_5_1 = SVR(C=2.5, epsilon=0.5)
model_6_1 = GradientBoostingRegressor(random_state=0)

# One entry per model, in the order listed above.
# MSE_1 holds the square root of the mean squared error (i.e. RMSE).
# NOTE(review): both metrics are computed on the training split itself, so
# they measure in-sample fit; X_1test / y_1test are not used here.
MSE_1 = []
R2_1 = []
for mymodels_1 in (model_1_1, model_2_1, model_3_1, model_4_1, model_5_1, model_6_1):
    model_pipeline_1 = Pipeline(steps=[
        ('pre_processing', pre_process_2),
        ('model_1', mymodels_1),
    ])
    model_pipeline_1.fit(X_1train, y_1train)
    train_pred_1 = model_pipeline_1.predict(X_1train)
    MSE_1.append(mean_squared_error(y_1train, train_pred_1) ** 0.5)
    R2_1.append(r2_score(y_1train, train_pred_1))

print(np.round(MSE_1, 2))
print(np.round(R2_1, 2))