# Model Calibration 

<b>Import packages</b>

In [1]:
import sys
import os
import numpy as np
import pandas as pd
from time import time
#from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LassoCV
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import seaborn as sb
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
%matplotlib inline

<b>Get data and explore</b>

In [2]:
# Load the three prepared datasets (pipe-delimited, first row is the header):
# scaled, not scaled, and scaled with outliers removed.
# The paths are relative, so they resolve against the current working directory.
data_clean, data_clean1, data_clean2 = (
    pd.read_csv(csv_path, sep="|", header=0)
    for csv_path in (
        'DataPrep/MainData_Scaled.csv',
        'DataPrep/MainData_NotScaled.csv',
        'DataPrep/MainData_Scaled_OutliersRemoved.csv',
    )
)


FileNotFoundError: [Errno 2] No such file or directory: 'DataPrep/MainData_Scaled.csv'

In [None]:
data_clean.head()

In [None]:
data_clean1.head()

In [None]:
# Keep only the modelling columns of each dataset: the label-based slice
# from "CDD" through "CITY_REL_WATER" (both ends inclusive).
df_main, df_main1, df_main2 = (
    frame.loc[:, "CDD":"CITY_REL_WATER"]
    for frame in (data_clean, data_clean1, data_clean2)
)

In [None]:
# check for the null values
df_main.isna().sum()

In [None]:
df_main1.isna().sum()

In [None]:
#df_2 = df_main.drop(['URB_AREA_HINTER', 'GDP_PC_REAL_PPP','POP_TOT_GI'], axis=1)
#df_2.head()

In [None]:
# Mean imputation: keep every non-missing value and replace each missing one
# with the mean of its own column (the means are aligned along the columns).
df_main, df_main1, df_main2 = (
    frame.where(frame.notna(), frame.mean(), axis="columns")
    for frame in (df_main, df_main1, df_main2)
)

In [None]:
df_main1.isna().sum()

In [None]:
df_main1.corr()

In [None]:
fig = px.histogram(data_clean1, x="CDD")
fig.show()

In [None]:
fig = px.histogram(data_clean2, x="CDD")
fig.show()

# Early preprocessing draft: drop three columns, mean-impute five columns
# (one imputer each) and pass every remaining column through unchanged.
# NOTE: the name `pre_process` is bound again in the "Models" section.
pre_process = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', ['GDP_PC_REAL_PPP', 'URB_AREA_HINTER', 'POP_TOT_GI']),
        ('impute_FRAGMENTATION', SimpleImputer(strategy='mean'), ['FRAGMENTATION']),
        ('impute_T_Y0_14_SH_NAT', SimpleImputer(strategy='mean'), ['T_Y0_14_SH_NAT']),
        ('impute_T_Y15_64_SH_NAT', SimpleImputer(strategy='mean'), ['T_Y15_64_SH_NAT']),
        ('impute_T_Y65_MAX_SH_NAT', SimpleImputer(strategy='mean'), ['T_Y65_MAX_SH_NAT']),
        ('impute_PWM_EX_CORE', SimpleImputer(strategy='mean'), ['PWM_EX_CORE']),
    ],
    remainder='passthrough',
)

# Variable importance/selection

In [None]:
# Variable importance, scaled data with outliers:
# predictors are the columns "URB_AREA".."CITY_REL_WATER", the target is "CDD".
X_1 = df_main.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_1 = df_main.loc[:, "CDD"]
X_1train, X_1test, y_1train, y_1test = train_test_split(
    X_1,
    y_1,
    random_state=0,  # fixed seed keeps the split reproducible
)

In [None]:
#lasso = LassoCV().fit(pre_process.fit_transform(X_1train), y_train)
#lasso_pipeline = Pipeline(steps=[('pre_processing',pre_process),
#                                ('lasso', LassoCV(cv=5, random_state=0))
#                                 ])
#lasso_pipeline.fit(X_1train,y_2train)


In [None]:
np.array(X_1train.columns)

In [None]:
# Fit a cross-validated Lasso (10 folds) and use the absolute coefficient
# of each predictor as its importance.
lasso = LassoCV(cv=10, random_state=0, max_iter=10000)
lasso.fit(X_1train, y_1train)
importance = np.abs(lasso.coef_)
feature_names = np.array(X_1train.columns)

# One bar per predictor; labels are rotated so long column names stay readable.
plt.figure(figsize=(15, 8))
plt.bar(x=feature_names, height=importance)
plt.xticks(rotation=90)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
# Variable importance, scaled data with outliers removed:
# predictors are the columns "URB_AREA".."CITY_REL_WATER", the target is "CDD".
X_2 = df_main2.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_2 = df_main2.loc[:, "CDD"]
X_2train, X_2test, y_2train, y_2test = train_test_split(
    X_2,
    y_2,
    random_state=0,  # fixed seed keeps the split reproducible
)

In [None]:
# Same importance analysis as for the data with outliers, now on the
# outlier-free split: absolute LassoCV coefficients per predictor.
lasso_2 = LassoCV(cv=10, random_state=0, max_iter=10000)
lasso_2.fit(X_2train, y_2train)
importance_2 = np.abs(lasso_2.coef_)
feature_names = np.array(X_2train.columns)

# One bar per predictor; labels are rotated so long column names stay readable.
plt.figure(figsize=(15, 8))
plt.bar(x=feature_names, height=importance_2)
plt.xticks(rotation=90)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
# Calibration split, data with outliers.
rng = np.random.RandomState(0)
X = df_main.loc[:, "URB_AREA":"CITY_REL_WATER"]
y = df_main.loc[:, "CDD"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# Calibration split, data with outliers removed.
# NOTE: this rebinds y_1 / X_1 / X_1train / X_1test / y_1train / y_1test, which
# the variable-importance section derived from df_main; from here on they
# refer to df_main2.
rng = np.random.RandomState(0)  # fresh generator so this split does not depend on the one above
X_1 = df_main2.loc[:, "URB_AREA":"CITY_REL_WATER"]
y_1 = df_main2.loc[:, "CDD"]
X_1train, X_1test, y_1train, y_1test = train_test_split(X_1, y_1, random_state=rng)

# Models

In [None]:
# Preprocessing paired with the with-outliers training split further down:
# drop four predictors and pass every other column through unchanged.
pre_process = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', ['T_Y0_14_SH_NAT', 'URB_AREA', 'T_Y65_MAX_SH_NAT', 'POP_DEN']),
    ],
    remainder='passthrough',
)

In [None]:
# Preprocessing paired with the outliers-removed training split further down:
# drop five predictors and pass every other column through unchanged.
pre_process_2 = ColumnTransformer(
    transformers=[
        (
            'drop_columns',
            'drop',
            [
                'T_Y0_14_SH_NAT',
                'URB_AREA',
                'T_Y15_64_SH_NAT',
                'CITY_REL_ROADS',
                'TREECOVER_SHARE_CORE',
            ],
        ),
    ],
    remainder='passthrough',
)

In [None]:
 # Repeated 10-fold cross-validation (3 repeats); the fixed random_state makes the folds reproducible.
 cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


In [None]:
 # Same splitter configuration as `cv` (10 folds, 3 repeats, random_state=1), kept as a separate object.
 cv_1 = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
class Scale_predictor_variables:
    """Principal-component regression diagnostics for one training set.

    Despite its name the class does not scale anything itself: it applies
    ``pre_process`` to the training predictors, projects the result onto
    principal components and cross-validates a linear regression on a
    growing number of those components.
    """

    def __init__(self,X_train,y_train,pre_process,cv):
        """Store the inputs used by ``Plot_cross_validation_results``.

        Parameters
        ----------
        X_train : training predictors, handed to ``pre_process.fit_transform``.
        y_train : training target, aligned row-by-row with ``X_train``.
        pre_process : transformer exposing ``fit_transform``.
        cv : value passed as ``cv`` to ``cross_val_score``.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.pre_process = pre_process
        self.cv = cv

    def Plot_cross_validation_results(self, max_components=5):
        """Plot cross-validated MSE against the number of principal components.

        Point 0 of the curve is an intercept-only model; point ``i`` is a
        linear regression on the first ``i`` principal components. The
        cumulative explained variance (in percent) of all components is
        printed as well.

        Parameters
        ----------
        max_components : int, default 5
            Largest number of components to evaluate. It is capped at the
            number of components PCA actually produced.
        """
        pca = PCA()

        # NOTE(review): PCA is fitted on the whole training set before the
        # cross-validation below, so the validation folds are not fully unseen.
        X_reduced = pca.fit_transform(self.pre_process.fit_transform(self.X_train))

        # Slicing past the last column would silently re-score the full model
        # and draw a misleading flat tail, so never exceed what is available.
        n_components = min(max_components, X_reduced.shape[1])

        regr = LinearRegression()

        def cv_mse(features):
            # The scorer returns negated MSE; flip the sign after averaging.
            scores = model_selection.cross_val_score(
                regr, features, self.y_train, cv=self.cv,
                scoring='neg_mean_squared_error')
            return -1*scores.mean()

        # Baseline: a constant column means only the intercept is fitted.
        mse = [cv_mse(np.ones((len(X_reduced),1)))]

        # Add one component at a time.
        mse.extend(cv_mse(X_reduced[:,:i]) for i in range(1, n_components + 1))

        # Plot cross-validation results
        plt.plot(mse)
        plt.xlabel('Number of Principal Components')
        plt.ylabel('MSE')
        plt.title('CDD')
        variance_ratio = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
        print(variance_ratio)
        
            

In [None]:
scale_predictor_variables = Scale_predictor_variables(X_train,y_train,pre_process,cv)

In [None]:
scale_predictor_variables.Plot_cross_validation_results()

In [None]:
scale_predictor_variables = Scale_predictor_variables(X_1train,y_1train,pre_process_2,cv_1)

In [None]:
scale_predictor_variables.Plot_cross_validation_results()

In [None]:
class Scoring:
    """Fit a fixed set of regressors and report their in-sample fit quality."""

    def __init__(self,pre_process,X_train,y_train):
        """Store the preprocessing step and the training data.

        Parameters
        ----------
        pre_process : transformer used as the first step of every pipeline.
        X_train : training predictors.
        y_train : training target.
        """
        self.pre_process = pre_process
        self.X_train = X_train
        self.y_train = y_train

    def Score(self):
        """Fit each candidate model and print its training RMSE and R2.

        Both metrics are computed on the same data the models were fitted
        on, so they describe goodness of fit, not generalisation.

        Two arrays are printed, rounded to two decimals: first the RMSE
        values, then the R2 values. Both follow the model order random
        forest, linear regression, ridge, lasso, SVR, gradient boosting.
        The unrounded values are kept in ``self.rmse_scores`` and
        ``self.r2_scores``.
        """
        candidates = [
            RandomForestRegressor(max_depth=15,random_state=0),
            LinearRegression(fit_intercept=True),
            Ridge(alpha=5),
            Lasso(alpha=10),
            SVR(C=2.5, epsilon=0.5),
            GradientBoostingRegressor(random_state=0),
        ]

        rmse = []
        r2 = []
        for estimator in candidates:
            model_pipeline = Pipeline(steps=[('pre_processing',self.pre_process),
                                             ('model', estimator)
                                             ])
            model_pipeline.fit(self.X_train,self.y_train)
            # Predict once and reuse the result for both metrics.
            fitted_values = model_pipeline.predict(self.X_train)
            # Square root of the MSE, i.e. RMSE in the units of the target.
            rmse.append(mean_squared_error(self.y_train,fitted_values)**0.5)
            r2.append(r2_score(self.y_train,fitted_values))

        # Keep the raw numbers so callers do not have to parse the printout.
        self.rmse_scores = rmse
        self.r2_scores = r2

        print(np.round(rmse,2))
        print(np.round(r2,2))

In [None]:
Scoring1 = Scoring(pre_process,X_train,y_train)

In [None]:
Scoring1.Score()

In [None]:
Scoring2 = Scoring(pre_process_2,X_1train,y_1train)

In [None]:
Scoring2.Score()

In [None]:
class Model_select:
    """Grid-search a fixed set of regressors and record their test-set scores."""

    def __init__(self,X_train,y_train,X_test, y_test):
        """Store the train/test split used by ``model_selection``.

        Parameters
        ----------
        X_train, y_train : data every grid search is fitted on.
        X_test, y_test : held-out data the best estimator of each search is scored on.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def model_selection(self):
        """Tune every candidate model with 5-fold ``GridSearchCV`` and score it.

        Three families are handled differently:

        * Ridge / Lasso / LinearRegression run inside a scaler -> PCA ->
          regressor pipeline; the number of components, the scaler and
          (except for LinearRegression) ``alpha`` are searched.
        * RandomForestRegressor / GradientBoostingRegressor are searched
          directly over tree hyper-parameters, using all cores.
        * SVR is searched over ``gamma`` and ``C``.

        For each model the best estimator, best cross-validation score, best
        parameters and the score on the test set are printed. The test
        scores are stored in ``self.models_score`` keyed by model name.
        Scores use each estimator's default scorer (R2 for regressors).
        """
        models_1 = ["Ridge","Lasso","LinearRegression"]
        models_2 = ["RandomForestRegressor","GradientBoostingRegressor"]
        model_3 = ["SVR"]
        models = models_1 + models_2 + model_3
        models_dictionary = {"Ridge":Ridge(),"Lasso":Lasso(),"LinearRegression":LinearRegression(fit_intercept=True),
                             "RandomForestRegressor":RandomForestRegressor(random_state=0),"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
                            "SVR":SVR(epsilon=0.5)}
        models_score = {}

        # Tuning of parameters for regression by cross-validation
        K = 5               # Number of cross validations

        # PCA cannot return more components than there are input features, so
        # cap the searched range (at most 12) instead of letting the
        # out-of-range candidates fail during the search.
        n_features = np.shape(self.X_train)[1]
        n_features_to_test = np.arange(1, min(12, n_features) + 1)
        alpha_to_test = 2.0**np.arange(-6, +6)

        for model in models:
            if model in models_1:

                # No fit here: GridSearchCV clones the pipeline and fits the clones.
                pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('reduce_dim', PCA()),
                ('regressor', models_dictionary[model])
                ])

                if model == "LinearRegression":
                    # Plain least squares has no regularisation strength to tune.
                    params = {'reduce_dim__n_components': n_features_to_test,
                    'scaler' : [StandardScaler(), RobustScaler()]}
                else:
                    params = {'reduce_dim__n_components': n_features_to_test,
                    'regressor__alpha': alpha_to_test,
                    'scaler' : [StandardScaler(), RobustScaler()]}
                gridsearch = GridSearchCV(pipe, params, verbose=1,cv = K).fit(self.X_train, self.y_train)

            elif model in models_2:

                model_estimator = models_dictionary[model]

                if model == "RandomForestRegressor":
                    params={'n_estimators':[20,30,40,60,100], 'max_depth':
                    [5,10,15,20],'max_features':[2,5,8]}
                else:
                    # 4*4*4*4 = 256 combinations, each fitted K times: the slowest search here.
                    params = {'learning_rate': [0.01,0.02,0.03,0.04],
                    'subsample'    : [0.9, 0.5, 0.2, 0.1],
                    'n_estimators' : [100,500,1000, 1500],
                    'max_depth'    : [4,6,8,10]
                     }

                gridsearch = GridSearchCV(estimator = model_estimator,param_grid = params, cv = K, n_jobs=-1).fit(self.X_train, self.y_train)
            else:
                params = {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 2.5, 5, 10, 100, 1000, 10000]}
                gridsearch = GridSearchCV(models_dictionary[model], params, cv = K).fit(self.X_train, self.y_train)

            # Score the refitted best estimator on the held-out data once and
            # reuse the value for both the printout and the stored result.
            test_score = gridsearch.score(self.X_test, self.y_test)

            print(" Results from Grid Search:",model)
            print("\n The best estimator across ALL searched params:\n",gridsearch.best_estimator_)
            print("\n The best score across ALL searched params:\n",gridsearch.best_score_)
            print("\n The best parameters across ALL searched params:\n",gridsearch.best_params_)
            print('\n Final score is: ', test_score)
            print("")
            models_score[model] = test_score
        self.models_score = models_score

        

In [None]:
model_select = Model_select(X_train,y_train,X_test, y_test)

In [None]:
model_select.model_selection()

In [None]:
model_select.models_score