# Notebook to estimate IMD scores from style metrics with a Random Forest

In [None]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
import itertools
import time

%matplotlib inline
plt.style.use('ggplot')

### Importing style metrics and IMDs

In [None]:
# Load the style-metrics table; its `ward` column is the join key used for the
# merge with the IMD scores further down.
metrics = pd.read_csv("../data/london_metrics.csv")

In [None]:
# Load the deprivation table, keep the ward code plus the four scores of
# interest, and give the score columns short names.
imd_per_ward = (
    pd.read_csv("../data/imd_per_ward.csv")
    .loc[:, [
        'WD17CD',
        'Index of Multiple Deprivation (IMD) Score',
        'Education, Skills and Training Score',
        'Employment Score (rate)',
        'Income Score (rate)',
    ]]
    .rename(columns={
        "Index of Multiple Deprivation (IMD) Score": "IMD",
        "Education, Skills and Training Score": "IMD_Edu",
        'Employment Score (rate)': 'IMD_Emp',
        'Income Score (rate)': 'IMD_Inc',
    })
)

In [None]:
# Join the metrics with the deprivation scores on the ward code (pandas'
# default inner join), then discard both key columns so only numeric
# features and scores remain.
metrics_imd = pd.merge(metrics, imd_per_ward, left_on="ward", right_on="WD17CD")
metrics_imd = metrics_imd.drop(columns=['ward', 'WD17CD'])

In [None]:
# Features: every column except the four deprivation scores.
X = metrics_imd.drop(columns=['IMD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc'])
# Target: the overall IMD score, kept as a one-column DataFrame.
y = metrics_imd.loc[:, ['IMD']]

### Running the estimations

In [None]:
# Work on copies so the original feature/target frames stay untouched.
X_, y_ = X.copy(), y.copy()

In [None]:
def cross_validate(inputs, labels, n, params, random_state=None):
    """Score a random forest on ``n`` random 80/20 train/validation splits.

    Args:
        inputs: Feature table handed to ``train_test_split``.
        labels: DataFrame holding the regression target in an ``'IMD'`` column.
        n: Number of random splits to draw.
        params: Sequence whose first two items are the ``n_estimators`` and
            ``max_depth`` arguments of ``RandomForestRegressor``.
        random_state: Optional integer base seed. Split ``i`` and the forest
            fitted on it are seeded with ``random_state + i`` so a run can be
            repeated exactly. ``None`` (the default) leaves everything
            unseeded, as before.

    Returns:
        DataFrame with one row per split and the columns ``'RMSE'``,
        ``'MAE'`` and ``'Spearman Correlation'``. The mean of each metric is
        also printed.
    """
    n_estimators, max_depth = params[0], params[1]
    rmse_scores = []
    mae_scores = []
    spearman_scores = []

    for split in range(n):
        # A different seed per split keeps the splits distinct while still
        # making the whole run repeatable.
        seed = None if random_state is None else random_state + split
        X_train, X_valid, y_train, y_valid = train_test_split(
            inputs, labels, test_size=0.2, random_state=seed
        )

        forest = RandomForestRegressor(
            n_estimators=n_estimators, max_depth=max_depth, random_state=seed
        )
        forest.fit(X_train, y_train['IMD'])
        y_pred = forest.predict(X_valid)

        # Evaluate against the same 1-D target the model was trained on.
        y_true = y_valid['IMD'].to_numpy()

        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and is
        # rejected by later releases.
        rmse_scores.append(np.sqrt(mean_squared_error(y_true, y_pred)))
        mae_scores.append(mean_absolute_error(y_true, y_pred))
        # spearmanr returns (correlation, p-value); only the former is kept.
        spearman_scores.append(stats.spearmanr(y_true, y_pred)[0])

    df = pd.DataFrame({
        'RMSE': rmse_scores,
        'MAE': mae_scores,
        'Spearman Correlation': spearman_scores,
    })

    print("Mean RMSE : {}".format(np.mean(rmse_scores)))
    print("Mean MAE : {}".format(np.mean(mae_scores)))
    print("Mean Spearman Correlation : {}".format(np.mean(spearman_scores)))

    return df

In [None]:
# Grid search over forest size and tree depth: every combination is scored
# with 200 random splits and cross_validate prints its mean metrics.
estimators = [20, 50, 100, 200]
depths = [10, 20, 30, None]
for e, d in itertools.product(estimators, depths):
    # Label each run with both hyperparameters; max_depth alone is shared by
    # four different runs and would make the output ambiguous.
    print("Model n_estimators={}, max_depth={}".format(e, d))
    cross_validate(X_, y_, 200, [e, d])
    print()

In [None]:
# Final run: put the selected hyperparameters in the last argument, given as
# [n_estimators, max_depth]. Here that is 100 trees with max_depth=None,
# evaluated over 200 random splits.
results = cross_validate(X_, y_, 200, [100, None])

### Saving the results to .csv

In [None]:
# Write the per-split metrics to CSV without the row index.
# NOTE: to_csv does not create missing folders, so ../data/temp_results must
# already exist.
results.to_csv("../data/temp_results/london_style_rfr.csv", index=False)