In [1]:
############################################################
### Code to run ridge regression model for one city only ###
############################################################

CITY_COUNTRY = 'BIKANER_INDIA'

#Read in the packages to use
import numpy as np
import pandas as pd
import math
import datetime
import os
import pathlib
import matplotlib.pyplot as plt
import earthpy as et
import pandas as pd
import pickle
import random                                                        #for test city selection

#for displaying the cities on map
#import cartopy
#import cartopy.crs as ccrs
#import matplotlib as mpl
#from matplotlib.ticker import ScalarFormatter
#import matplotlib.ticker as ticker                                   #for setting axes ticks to whole numbers

#models
#from sklearn.linear_model import LinearRegression 
import statsmodels.api as sm                                          #stats models to get linear reg p-values
from statsmodels.tools.eval_measures import rmse                      #calc rmse from stats models
import itertools                                                      #for calculating possible combinations of variables
from sklearn.preprocessing import PolynomialFeatures                  #for polynomial regression
from sklearn.pipeline import make_pipeline                            #pipeline to create polynomial regression
from sklearn.linear_model import Ridge

#use grid search to find hyperparameters (pg 73 Geron) RFR
from sklearn.model_selection import GridSearchCV                      #for cross validation
from sklearn.feature_selection import RFE                             #for selecting features for the linear reg
from sklearn.model_selection import cross_val_score                   #for cross validation
from sklearn.model_selection import KFold                            
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler                      #for normalising the data
from sklearn.metrics import r2_score                                  #metrics for assessing model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

#for plotting
from matplotlib import ticker 
from matplotlib.ticker import MaxNLocator                             #to keep whole numbers on the x-axis
from matplotlib.lines import Line2D                                   #for manual legend creation

import math
#Read in the data
#read in the city info table
#os.chdir(os.path.join(et.io.HOME, 'Documents', 'Python_Scripts', 'PROJECT', 'CITY_ADD_2022_09'))
#CITY_COUNTRY_lat_lon = pd.read_excel('CITY_COUNTRY_lat_lon.xlsx', index_col=None)
os.chdir(os.path.join(et.io.HOME, 'Documents', 'Python_Scripts', 'PROJECT', 'MEAN_SUHI'))
CITY_COUNTRY_lat_lon = pd.read_excel('CITY_COUNTRY_lat_lon_mean.xlsx', index_col=None)

#read the table with all variables in as pickle
os.chdir(os.path.join(et.io.HOME, 'Documents', 'Python_Scripts', 'PROJECT','UHI_Project_Pickle_Files','All_cities'))   
with open('aqua_all_monthly_data_df2.pkl', 'rb') as f:
    all_monthly_data_df = pickle.load(f)

# filter out data for the required city

CITY_COUNTRY_lat_lon = CITY_COUNTRY_lat_lon[CITY_COUNTRY_lat_lon.CITY_COUNTRY == CITY_COUNTRY].reset_index(drop = True)
all_monthly_data_df = all_monthly_data_df[all_monthly_data_df.CITY_COUNTRY == CITY_COUNTRY].reset_index(drop = True)
    
# Additional variables
# add evi difference variable
#all_monthly_data_df['EVI_D'] = all_monthly_data_df['EVI_U'] - all_monthly_data_df['EVI_R'] 

all_monthly_data_df['LOG_AREA'] = np.log10(all_monthly_data_df.Monthly_Area.values.astype(float))
# add sqrt values of climate variables (evi range is -1 to 1)
all_monthly_data_df['sqrt_EF'] = np.sqrt(all_monthly_data_df.EF.values.astype(float))
all_monthly_data_df['sqrt_RH'] = np.sqrt(all_monthly_data_df.RH.values.astype(float))
all_monthly_data_df['sqrt_TP'] = np.sqrt(all_monthly_data_df.TP.values.astype(float))
all_monthly_data_df['sqrt_T2M'] = np.sqrt(all_monthly_data_df.T2M.values.astype(float))
all_monthly_data_df['sqrt_SSR'] = np.sqrt(all_monthly_data_df.SSR.values.astype(float))
# add one to the sqrt (evi variables +1)
#all_monthly_data_df['sqrt_EVI_R_p1'] = np.sqrt(all_monthly_data_df.EVI_R.values.astype(float)+1)
#all_monthly_data_df['sqrt_EVI_U_p1'] = np.sqrt(all_monthly_data_df.EVI_U.values.astype(float)+1)

#all_monthly_data_df['cube_EVI_D'] = pow(all_monthly_data_df.EVI_D.values.astype(float), 3)

all_monthly_data_df['CROPLAND'] = all_monthly_data_df['CROPLAND_RAIN'] + all_monthly_data_df['CROPLAND_IRR']
# elevation diff
all_monthly_data_df['ELEVATION_D'] = all_monthly_data_df['ELEVATION_U'] - all_monthly_data_df['ELEVATION_R']
    

#define the overpass time 
overpass_time = '13:30'

#define the predictor and target values for training and test data
#what predictors to use?
predictor_variables_all = ['EF', 'RH', 'TP','T2M','SSR','EVI_U','EVI_R','EVI_D',
                           'ECC', 'LOG_AREA', 'sqrt_EF','sqrt_RH','sqrt_TP','sqrt_T2M',
                           'sqrt_SSR','sqrt_EVI_R_p1','sqrt_EVI_U_p1','sqrt_EVI_D_p1','CROPLAND_RAIN',
                           'CROPLAND_IRR','GRASSLAND','BARE','WATER','cube_EVI_D',
                         'ELEVATION_D','ROUGHNESS_LENGTH_R']


predictor_variables_2 = ['EF','SSR', 'ECC', 'LOG_AREA', 
                         'sqrt_EF','sqrt_SSR', #'sqrt_EVI_R_p1','sqrt_EVI_U_p1','cube_EVI_D','EVI_U','EVI_R','EVI_D',
                         'ELEVATION_D','ROUGHNESS_LENGTH_R']


#'GRASSLAND',
predictor_variables = predictor_variables_2

#target_variable = 'SUHI_PEAK_GSA'
#target_variable = 'SUHI_PEAK_QUANTILE'
target_variable = 'SUHI_MEAN'
#target_variable = 'SUHI_FP'

#clean the data - first remove columns which are not the target or predictor variables, then remove nans
variables = predictor_variables.copy()
variables.append(target_variable)
variables.append('Overpass')
variables.append('month')
variables.append('year')
all_monthly_data_df2 = all_monthly_data_df[variables]
all_monthly_data_df2 = all_monthly_data_df2.dropna().reset_index(drop = True)

all_monthly_data_df3 = all_monthly_data_df[['EF','SSR','EVI_U','EVI_R','EVI_D', 'ECC', 'LOG_AREA', 'sqrt_EF','sqrt_SSR',
            'sqrt_EVI_R_p1','sqrt_EVI_U_p1', 'CROPLAND','WATER','BARE','GRASSLAND','ROUGHNESS_LENGTH_R','month','City',
                                            'Overpass']].dropna().reset_index(drop = True)


AttributeError: 'DataFrame' object has no attribute 'EF'

In [None]:
#create training and test datasets
#create the datasets
training_data = all_monthly_data_df2[((all_monthly_data_df['year'] < 2016) 
                                    & (all_monthly_data_df2['Overpass'] == overpass_time))].copy()
test_data = all_monthly_data_df2[((all_monthly_data_df['year'] >= 2016)
                                  & (all_monthly_data_df2['Overpass'] == overpass_time))].copy()
 
print('Train_percent', 100* len(training_data)/len(all_monthly_data_df2[all_monthly_data_df2['Overpass'] == overpass_time]))
print('Test_percent', 100* len(test_data)/len(all_monthly_data_df2[all_monthly_data_df2['Overpass'] == overpass_time]))
print('Total Datapoints',len(all_monthly_data_df2[all_monthly_data_df2['Overpass'] == overpass_time]))

#split the data into training and test
X_train = training_data[predictor_variables]  #predictors
y_train = training_data[target_variable]      #target

X_test = test_data[predictor_variables]       #predictors
y_test = test_data[target_variable]           #target

# get polynomail terms and interactions
poly = PolynomialFeatures(2)
X_train_poly = pd.DataFrame(poly.fit_transform(X_train), columns = poly.get_feature_names_out(X_train.columns))
X_train_poly = X_train_poly.drop('1', axis=1)

X_test_poly = pd.DataFrame(poly.fit_transform(X_test), columns = poly.get_feature_names_out(X_test.columns))
X_test_poly = X_test_poly.drop('1', axis=1)
