In [16]:
import pandas as pd
import numpy as np
import matplotlib
from scipy import stats
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

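Load the data into a DataFrame and drop outlier rows (any row with a value more than 3 standard deviations from its column mean).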

In [17]:
# Read the flood/health table, then keep only the rows in which every column
# lies within 3 standard deviations of that column's mean. The z-scores are
# computed over the whole frame, so every column takes part in the filter.
san_francisco = (
    pd.read_csv("./datasets/Flood_Health_Decintile_Sort.csv")
    .loc[lambda frame: (np.abs(stats.zscore(frame)) < 3).all(axis=1)]
)
san_francisco.head()

Unnamed: 0,Census Blockgroup,Children,Elderly,NonWhite,Poverty,Education,English,Elevation,SeaLevelRise,Precipitation,Diabetes,MentalHealth,Asthma,Disability,HousingViolations,Homeless,LivAlone,FloodHealthIndex,Decintiles,FloodHealthIndex_Quintiles
1,60750201001,0.079893,0.124,0.651,0.509706,0.724,0.232,7.83292,0.0,0.020516,16.0115,62.0535,10.5741,0.238,10.2406,15.8921,0.522,83.486,10,5
3,60750178021,0.084215,0.088,0.584,0.327362,0.843,0.119,-5.96629,0.064579,0.024166,14.9856,54.0699,9.5802,0.229,7.04366,19.2683,0.427,82.4925,10,5
6,60750201003,0.079893,0.124,0.651,0.509706,0.724,0.232,5.81684,0.0,0.028468,15.9985,61.8836,10.5438,0.238,9.68738,3.2195,0.522,81.506,10,5
9,60750201002,0.079893,0.124,0.650999,0.509706,0.724001,0.232,13.9179,0.0,0.013759,16.0125,62.05,10.5693,0.238,11.5778,3.2195,0.522001,80.8327,10,5
15,60750176015,0.109834,0.141,0.733,0.550862,0.806,0.117,10.0642,0.0,0.004025,16.0068,62.0689,10.5959,0.227,6.98324,19.2683,0.609,79.1461,10,5


Finding Significant Metrics

In [19]:
from scipy.stats import spearmanr
from scipy.stats import pearsonr
# Candidate feature columns: everything except the first column and the last
# three columns of the frame.
columns = list(san_francisco.columns)[1:-3]
fhi = san_francisco[['FloodHealthIndex']]
# pearsonr is specified for 1-D inputs. Feeding it the single-column DataFrame
# above only works on SciPy versions that happen to return a length-1 array for
# an (n,) vs (n, 1) pair, so the correlations use this 1-D Series instead.
# `fhi` itself is left as a DataFrame for any later cell that relies on it.
fhi_values = fhi['FloodHealthIndex']

# Minimum |correlation| with FloodHealthIndex for a feature to be kept.
MIN_ABS_CORRELATION = .3

correlationType = dict()       # column -> "linear" / "nonlinear" (whichever coefficient won)
significant_metrics = {}       # column -> winning coefficient, only for |coefficient| >= cut-off
for column in columns:
    feature_values = san_francisco[column]
    # p1 / p2 are the p-values; they are not used for the selection below.
    non_linear_corr, p1 = spearmanr(fhi_values, feature_values)
    linear_corr, p2 = pearsonr(feature_values, fhi_values)

    # Keep whichever coefficient has the larger magnitude: Spearman (rank-based)
    # is labelled "nonlinear", Pearson is labelled "linear". Ties go to Pearson.
    if abs(non_linear_corr) > abs(linear_corr):
        larger_corr = float(non_linear_corr)
        correlationType[column] = "nonlinear"
    else:
        larger_corr = float(linear_corr)
        correlationType[column] = "linear"

    if abs(larger_corr) >= MIN_ABS_CORRELATION:
        significant_metrics[column] = larger_corr

    print(f"{column}: {larger_corr} ({correlationType[column]})")
significant_metrics

Children: -0.2857895487684715 (linear)
Elderly: -0.07932683793647982 (nonlinear)
NonWhite: 0.42777884442768427 (nonlinear)
Poverty: 0.6940349145675028 (linear)
Education: -0.5185598506634179 (linear)
English: 0.48954459987910764 (nonlinear)
Elevation: -0.6248862353664245 (linear)
SeaLevelRise: 0.20161734735146142 (linear)
Precipitation: 0.30381718961991616 (linear)
Diabetes: 0.5999321498483321 (linear)
MentalHealth: 0.6330094331732357 (linear)
Asthma: 0.641611209836375 (linear)
Disability: 0.5759663051153813 (linear)
HousingViolations: 0.22729048797961457 (linear)
Homeless: 0.5297459087845907 (nonlinear)
LivAlone: 0.2508361644595813 (linear)


{'NonWhite': 0.42777884442768427,
 'Poverty': 0.6940349145675028,
 'Education': -0.5185598506634179,
 'English': 0.48954459987910764,
 'Elevation': -0.6248862353664245,
 'Precipitation': 0.30381718961991616,
 'Diabetes': 0.5999321498483321,
 'MentalHealth': 0.6330094331732357,
 'Asthma': 0.641611209836375,
 'Disability': 0.5759663051153813,
 'Homeless': 0.5297459087845907}

In [20]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def model_permutation_importance(clf, X_test, y_test, significant_metrics):
    """Plot the mean permutation importance of each feature as horizontal bars.

    Parameters
    ----------
    clf : fitted estimator
        Passed straight to ``permutation_importance``.
    X_test, y_test :
        Held-out features and labels the importances are computed on.
    significant_metrics : mapping
        Its keys are used as the bar labels. Presumably they are in the same
        order as the columns of ``X_test`` -- verify against the caller.

    Returns
    -------
    None
        The bars are drawn through the pyplot interface; nothing is returned.
    """
    result = permutation_importance(clf, X_test, y_test)

    labels = np.array(list(significant_metrics.keys()))
    mean_importances = result.importances_mean

    # argsort is ascending, so the bars are drawn in order of increasing importance.
    order = mean_importances.argsort()
    plt.barh(labels[order], mean_importances[order])
    plt.xlabel("Permutation Importance")

# Features: the columns selected by the correlation screening. Target: decile label.
X = san_francisco[list(significant_metrics.keys())]
y = san_francisco[['Decintiles']]

N_RESAMPLES = 1000     # number of independent random train/test splits
TEST_FRACTION = 0.20   # share of rows held out in each split
BASE_SEED = 0          # split i uses seed BASE_SEED + i, so a re-run gives the same number

accuracy = []
for i in range(N_RESAMPLES):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, random_state=BASE_SEED + i
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    clf = LinearDiscriminantAnalysis()
    clf.fit(X_train, np.ravel(y_train, order='C'))
    y_pred = clf.predict(X_test)

    # A prediction counts as correct when it is the true decile or one decile
    # away from it. For integer labels this equals the sum of the exact,
    # "+1" and "-1" accuracies (assumes Decintiles holds integers -- TODO confirm).
    decile_error = np.abs(np.ravel(y_test, order='C') - y_pred)
    accuracy.append(float(np.mean(decile_error <= 1)))
# Mean within-one-decile accuracy over all splits.
print("Accuracy:", sum(accuracy) / len(accuracy))


Accuracy: 0.7689599999999966
