In [14]:
import pandas as pd
import numpy as np

# Let pandas render up to 500 rows and 500 columns before truncating output.
display_limit = 500
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, display_limit)

In [3]:
# Load the merged CSV into `data` and display the resulting frame.
# NOTE(review): the displayed frame contains 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, presumably row indexes written out by earlier to_csv calls —
# confirm and consider dropping them where the file is produced.
merged_csv_path = 'output/merged.csv'
data = pd.read_csv(merged_csv_path)
data

Unnamed: 0.1,Unnamed: 0,2013 CBSA,CBSA Code,Population,Age Adjusted Accidental Death Rate per 100000,Median AQI,Estimate; Total,Year,Smoking rate,Binge drinking rate,"Property crime rate per 100,000","Violent crime rate per 100,000"
0,0,"Akron, OH",10420.0,12625850.0,36.8,37,703505,2015,27.80,20.38,2541.0,277.3
1,1,"Albany-Schenectady-Troy, NY",10580.0,15466849.0,22.5,40,886188,2017,15.77,15.62,1954.7,291.2
2,2,"Atlanta-Sandy Springs-Roswell, GA",12060.0,90449335.0,38.0,42,5882450,2017,14.93,13.61,2865.7,367.6
3,3,"Baltimore-Columbia-Towson, MD",12580.0,48206323.0,27.3,44,2808175,2017,15.42,17.11,2732.9,782.5
4,4,"Baton Rouge, LA",12940.0,13890020.0,48.1,46,834159,2017,21.69,20.49,3937.5,546.8
5,5,"Buffalo-Cheektowaga-Niagara Falls, NY",15380.0,20634485.0,28.1,41,1136856,2017,20.38,26.45,2185.8,375.7
6,6,"Cedar Rapids, IA",16300.0,4545321.0,35.8,43,270293,2017,18.08,22.91,2410.0,217.4
7,7,"Charlotte-Concord-Gastonia, NC-SC",16740.0,37458020.0,41.8,44,2525305,2017,14.57,17.86,2800.4,416.2
8,8,"Chattanooga, TN-GA",16860.0,9249007.0,45.5,40,554876,2016,21.11,11.42,3394.1,557.3
9,9,"Cincinnati, OH-KY-IN",17140.0,37467088.0,44.2,45,2180746,2017,23.49,21.44,2440.8,264.9


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Columns of `data` that feed the safety index.
feature_columns = [
    'Age Adjusted Accidental Death Rate per 100000',
    'Median AQI',
    'Smoking rate',
    'Binge drinking rate',
    'Property crime rate per 100,000',
    'Violent crime rate per 100,000',
]
features = data[feature_columns]

# Standardize every column: subtract its mean, divide by its standard deviation.
features_normalized = features.sub(features.mean()).div(features.std())

# Flip the sign of every column except 'Median AQI', so that a larger value of
# a harmful feature (deaths, smoking, drinking, crime) lowers the safety index
# once the columns are summed.
# NOTE(review): 'Median AQI' keeps its original sign; if a higher AQI is meant
# to count against a city it should presumably be flipped too — confirm.
airq = features_normalized[['Median AQI']]
features_normalized = features_normalized.drop(columns=['Median AQI']).mul(-1)

# Reassemble: city label first, flipped features next, un-flipped AQI last.
features_normalized = pd.concat(
    [data['2013 CBSA'], features_normalized, airq],
    axis=1,
)
features_normalized

Unnamed: 0,2013 CBSA,Age Adjusted Accidental Death Rate per 100000,Smoking rate,Binge drinking rate,"Property crime rate per 100,000","Violent crime rate per 100,000",Median AQI
0,"Akron, OH",0.452605,-1.902641,-0.901331,0.106833,0.678837,-0.105737
1,"Albany-Schenectady-Troy, NY",2.084266,0.475151,0.347179,0.860566,0.600270,0.166159
2,"Atlanta-Sandy Springs-Roswell, GA",0.315683,0.641182,0.874386,-0.310593,0.168431,0.347423
3,"Baltimore-Columbia-Towson, MD",1.536576,0.544330,-0.043636,-0.139869,-2.176723,0.528687
4,"Baton Rouge, LA",-0.836749,-0.694968,-0.930183,-1.688473,-0.844468,0.709951
5,"Buffalo-Cheektowaga-Niagara Falls, NY",1.445294,-0.436039,-2.493443,0.563470,0.122647,0.256791
6,"Cedar Rapids, IA",0.566707,0.018568,-1.564929,0.275244,1.017412,0.438055
7,"Charlotte-Concord-Gastonia, NC-SC",-0.117906,0.712337,-0.240355,-0.226645,-0.106272,0.528687
8,"Chattanooga, TN-GA",-0.540084,-0.580328,1.448806,-0.989891,-0.903817,0.166159
9,"Cincinnati, OH-KY-IN",-0.391751,-1.050747,-1.179360,0.235648,0.748926,0.619319


In [5]:
#Run principal component analysis against data
from sklearn.decomposition import PCA

# Select the PCA inputs by column name rather than by position. The city label
# is not a numeric input, and 'Scores' is excluded in case this cell is re-run
# after the scoring cell has added that column; errors='ignore' makes the drop
# a no-op for any listed column that is not present yet.
pca_inputs = features_normalized.drop(columns=['2013 CBSA', 'Scores'], errors='ignore')

# Project the normalized features onto their first two principal components.
pca = PCA(n_components=2, whiten=False)
features_pca = pca.fit_transform(pca_inputs)

In [6]:
# Fraction of the total variance captured by each fitted principal component.
pca.explained_variance_ratio_

array([0.37649217, 0.19748192])

In [7]:
import matplotlib.pyplot as plt

# Scatter the rows of `features_pca` in the plane of the two principal
# components. Axis labels and a title are set so the figure can be read on its
# own; the scatter call stays last so the cell still displays its return value.
fig, ax = plt.subplots()
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('Normalized features projected onto the first two principal components')
ax.scatter(features_pca[:, 0], features_pca[:, 1])

<matplotlib.collections.PathCollection at 0x1a71b6518d0>

How we're weighting our different values:
- 'Age Adjusted Accidental Death Rate per 100000' : medium
- 'Smoking rate' : low
- 'Binge drinking rate' : medium
- 'Property crime rate per 100,000' : high
- 'Violent crime rate per 100,000' : high
- 'Median AQI' : low (currently given a weight of 0 in the code below, so it does not affect the score)

In [8]:
# Weight of each normalized feature in the safety index. The weights are paired
# with explicit column names, so the result does not depend on column order and
# the cell can be re-run after 'Scores' has already been added to the frame.
weighted_columns = [
    'Age Adjusted Accidental Death Rate per 100000',
    'Smoking rate',
    'Binge drinking rate',
    'Property crime rate per 100,000',
    'Violent crime rate per 100,000',
    'Median AQI',  # weight 0: currently has no influence on the score
]
weights = [10, 5, 15, 30, 40, 0]
weight_by_column = pd.Series(weights, index=weighted_columns)

# Weighted sum of the normalized features for every row, computed in one
# vectorized product. No constant offset is added: the min-max rescale below
# would cancel it.
raw_scores = features_normalized[weighted_columns].dot(weight_by_column)

# Rescale linearly so the lowest raw score maps to 0 and the highest to 100.
features_normalized['Scores'] = (
    MinMaxScaler().fit_transform(raw_scores.to_frame('Scores')).ravel() * 100
)

In [16]:
# Show the normalized features ordered from the lowest to the highest 'Scores'.
features_normalized.sort_values(by='Scores', ascending=True)

Unnamed: 0,2013 CBSA,Age Adjusted Accidental Death Rate per 100000,Smoking rate,Binge drinking rate,"Property crime rate per 100,000","Violent crime rate per 100,000",Median AQI,Scores
32,"Memphis, TN-MS-AR",-0.677006,-0.374766,1.265201,-2.062832,-4.357395,-0.196369,0.0
29,"Little Rock-North Little Rock-Conway, AR",-0.197777,-1.313628,-0.319042,-1.882209,-2.267726,0.075527,16.01828
36,"Mobile, AL",-1.190466,-1.489542,0.397015,-2.482572,-1.252566,-1.918377,21.54156
4,"Baton Rouge, LA",-0.836749,-0.694968,-0.930183,-1.688473,-0.844468,0.709951,28.344331
55,"Shreveport-Bossier City, LA",-0.665596,-1.351183,0.399638,-1.553359,-1.184173,0.075527,30.512215
61,"Tucson, AZ",-0.916621,0.348652,-0.138061,-1.676517,-0.574286,0.075527,35.078002
34,"Milwaukee-Waukesha-West Allis, WI",-0.688416,0.72815,-1.07182,0.100534,-1.600185,-0.015105,35.67831
62,"Tulsa, OK",-1.38444,-0.351048,0.585865,-1.037843,-0.951297,0.800583,36.735726
38,"New Orleans-Metairie, LA",-1.122005,-0.700897,-0.67576,-0.416524,-0.941688,0.528687,36.974665
3,"Baltimore-Columbia-Towson, MD",1.536576,0.54433,-0.043636,-0.139869,-2.176723,0.528687,37.266901


In [10]:
# Report both ends of the rescaled score range.
scores = features_normalized['Scores']
for label, pick in (("Minimum score: ", min), ("Maximum score: ", max)):
    print(label, pick(scores))

Minimum score:  0.0
Maximum score:  100.0


In [15]:
# Show the raw data ordered from the lowest to the highest violent crime rate.
data.sort_values(by='Violent crime rate per 100,000', ascending=True)

Unnamed: 0.1,Unnamed: 0,2013 CBSA,CBSA Code,Population,Age Adjusted Accidental Death Rate per 100000,Median AQI,Estimate; Total,Year,Smoking rate,Binge drinking rate,"Property crime rate per 100,000","Violent crime rate per 100,000"
47,47,"Provo-Orem, UT",39340.0,8716146.0,35.2,36,617678,2017,4.2,4.77,1530.0,79.8
45,45,"Portland-South Portland, ME",38860.0,9189623.0,37.9,43,532083,2017,13.7,18.73,1476.9,127.4
39,39,"Ogden-Clearfield, UT",36260.0,10172408.0,35.7,44,666547,2017,9.51,10.73,2095.7,162.3
14,14,"Duluth, MN-WI",20260.0,5004136.0,44.7,39,278782,2017,21.54,23.52,3007.3,201.8
24,24,"Kennewick-Richland, WA",28420.0,4251915.0,34.9,22,290296,2015,13.52,18.92,2045.4,205.0
6,6,"Cedar Rapids, IA",16300.0,4545321.0,35.8,43,270293,2017,18.08,22.91,2410.0,217.4
31,31,"Madison, WI",31540.0,10617266.0,39.0,35,654230,2014,11.33,23.54,1826.0,228.6
49,49,"Roanoke, VA",40220.0,5436453.0,42.0,6,312688,2014,22.56,14.49,2389.5,229.8
16,16,"Fort Collins, CO",22660.0,5225574.0,37.4,46,343976,2012,15.74,17.99,2242.9,239.8
19,19,"Greeley, CO",24540.0,4258691.0,45.3,44,304633,2012,20.56,17.77,1739.3,256.3
