# Notebook to make Style estimations with Linear Regression

In [None]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.api as sm
from scipy import stats
import itertools
import time

%matplotlib inline
plt.style.use('ggplot')

### Importing style metrics and IMDs

In [None]:
# Load the per-ward style metrics table (one row per ward, keyed by "ward").
metrics = pd.read_csv(filepath_or_buffer="../data/london_metrics.csv")

In [None]:
# Load the per-ward deprivation table, keep only the ward code and the four
# deprivation scores, and give the scores short column names.
imd_per_ward = (
    pd.read_csv("../data/imd_per_ward.csv")[
        [
            'WD17CD',
            'Index of Multiple Deprivation (IMD) Score',
            'Education, Skills and Training Score',
            'Employment Score (rate)',
            'Income Score (rate)',
        ]
    ]
    .rename(
        columns={
            "Index of Multiple Deprivation (IMD) Score": "IMD",
            "Education, Skills and Training Score": "IMD_Edu",
            'Employment Score (rate)': 'IMD_Emp',
            'Income Score (rate)': 'IMD_Inc',
        }
    )
)

In [None]:
# Join style metrics to deprivation scores on the ward code, then discard both
# key columns so only numeric predictors and targets remain.
metrics_imd = pd.merge(
    metrics,
    imd_per_ward,
    left_on="ward",
    right_on="WD17CD",
).drop(['ward', 'WD17CD'], axis=1)

In [None]:
# Predictors: every column except the four deprivation scores.
X = metrics_imd.drop(columns=['IMD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc'])
# Target: the overall IMD score, kept as a one-column DataFrame.
y = metrics_imd[['IMD']]

### Normalizing the inputs

In [None]:
# Standardise every predictor column and wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns.tolist(),
)

# Last expression of the cell: display the first rows of the scaled frame.
X_scaled.head()

### Fitting the full model, then doing forward stepwise selection

In [None]:
# Fit an OLS model on all scaled predictors (plus an intercept column) and
# print the statsmodels summary table.
X2 = sm.add_constant(data=X_scaled)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

In [None]:
def stepwiseSelection(X, y):
    """Greedy forward stepwise feature selection driven by adjusted R².

    Starting from an empty selection, each round fits one OLS model (with an
    intercept column) per remaining candidate and keeps the candidate whose
    model has the highest adjusted R². Selection stops as soon as no candidate
    improves on the best adjusted R² reached so far. A ``selected/total``
    progress counter is printed after every round.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one column per feature.
    y : array-like
        Regression target, passed to ``sm.OLS`` as the dependent variable.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    remaining = list(X.columns)
    selected = []
    total_features = len(remaining)
    overall_best_score = 0.0

    while True:
        best_score = 0.0
        best_feature = None

        for candidate in remaining:
            # Index the X argument so the function works on whatever frame the
            # caller passes in, not only on a notebook-level variable.
            design = sm.add_constant(X[selected + [candidate]])
            score = sm.OLS(y, design).fit().rsquared_adj
            if score > best_score:
                best_score = score
                best_feature = candidate

        # Strict comparison: a candidate is kept only if it raises adjusted R².
        improved = best_feature is not None and best_score > overall_best_score
        if improved:
            remaining.remove(best_feature)
            selected.append(best_feature)
            overall_best_score = best_score

        print("{}/{}".format(len(selected), total_features), end='\r')

        if not improved:
            break

    return selected

In [None]:
# Run the forward selection on the scaled predictors, then report how many
# features were kept and which ones.
selected_features = stepwiseSelection(X=X_scaled, y=y)
print(
    "{} selected features :".format(len(selected_features)),
    selected_features,
    sep="\n",
)

### Estimating IMD over repeated random train/test splits

In [None]:
# Independent copies of the scaled predictors and the target; these are the
# frames cross_validate() reads.
X_, y_ = X_scaled.copy(), y.copy()

In [None]:
def cross_validate(n, features):
    """Evaluate an OLS model over ``n`` random 80/20 train/test splits.

    Reads the notebook-level frames ``X_`` (predictors) and ``y_`` (target).
    For every split, an OLS model with an intercept is fitted on the training
    rows restricted to ``features`` and scored on the held-out rows. The mean
    of every metric is printed once all splits are done.

    Parameters
    ----------
    n : int
        Number of random splits to draw.
    features : list
        Column names of ``X_`` to use as predictors.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``'Adjusted R2'`` (from the
        training fit), ``'RMSE'``, ``'MAE'`` and ``'Spearman Correlation'``
        (all three on the test rows). The Spearman p-values are only printed
        as a mean; they are not part of the returned frame.
    """
    fit_rsquareds = []
    RMSEs = []
    MAEs = []
    SCorrs = []
    pVals = []
    print("\nResults ({} features) :".format(len(features)))
    for _ in range(n):
        # Splitting the data
        X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2)
        # has_constant='add' always inserts the intercept column, so the train
        # and test design matrices keep the same columns even if a feature
        # happens to be constant within one split.
        train_design = sm.add_constant(X_train[features], has_constant='add')
        test_design = sm.add_constant(X_test[features], has_constant='add')
        # Fitting the model
        model = sm.OLS(y_train, train_design).fit()
        fit_rsquareds.append(model.rsquared_adj)
        # Making predictions
        y_pred = model.predict(test_design)
        # Storing the results
        # RMSE is taken as sqrt(MSE) so it does not rely on the `squared`
        # keyword of mean_squared_error.
        RMSEs.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        MAEs.append(mean_absolute_error(y_test, y_pred))
        # Flatten the one-column target without assuming a test-set size, and
        # run the rank correlation once for both statistic and p-value.
        corr, p_value = stats.spearmanr(y_test.to_numpy().ravel(), y_pred.to_numpy())
        SCorrs.append(corr)
        pVals.append(p_value)
    # Storing and printing the results
    df = pd.DataFrame()
    df['Adjusted R2'] = fit_rsquareds
    df['RMSE'] = RMSEs
    df['MAE'] = MAEs
    df['Spearman Correlation'] = SCorrs
    print()
    print("Mean Adjusted R2 when fitting : {}".format(np.mean(fit_rsquareds)))
    print()
    print("Mean RMSE : {}".format(np.mean(RMSEs)))
    print("Mean MAE : {}".format(np.mean(MAEs)))
    print("Mean Spearman Correlation : {}".format(np.mean(SCorrs)))
    print("Mean P-Value : {}".format(np.mean(pVals)))

    return df

In [None]:
# Score the stepwise-selected feature set over 200 random splits.
results = cross_validate(n=200, features=selected_features)

### Saving the results to .csv

In [None]:
# Write the per-split metrics to disk without the row index.
results.to_csv(
    path_or_buf="../data/temp_results/london_style_linear.csv",
    index=False,
)