In [53]:
import numpy as np
import pandas as pd
import json
from shapely.geometry import shape, Point



Unnamed: 0,INCIDENT_TYPE_DESCRIPTION,FROMDATE,WEAPONTYPE,Shooting,DOMESTIC,Year,Month,DAY_WEEK,Location
0,RESIDENTIAL BURGLARY,07/08/2012 06:00:00 AM,Other,No,No,2012,7,Sunday,"(42.34638135, -71.10379454)"
1,AGGRAVATED ASSAULT,07/08/2012 06:03:00 AM,Firearm,Yes,No,2012,7,Sunday,"(42.31684135, -71.07458456)"
2,ROBBERY,07/08/2012 06:26:00 AM,Firearm,No,No,2012,7,Sunday,"(42.34284135, -71.09698955)"
3,COMMERCIAL BURGLARY,07/08/2012 06:56:00 AM,Other,No,No,2012,7,Sunday,"(42.3164411, -71.06582908)"
4,ROBBERY,07/08/2012 07:15:00 AM,Firearm,No,No,2012,7,Sunday,"(42.27051636, -71.11989955)"
5,ROBBERY,07/08/2012 07:32:00 AM,Firearm,Yes,No,2012,7,Sunday,"(42.31328183, -71.0530059)"
6,ROBBERY,07/08/2012 07:50:00 AM,Firearm,No,No,2012,7,Sunday,"(42.32425136, -71.08620956)"
7,SIMPLE ASSAULT,07/08/2012 07:50:00 AM,Unarmed,No,No,2012,7,Sunday,"(42.34924634, -71.06378456)"
8,MedAssist,07/08/2012 07:53:00 AM,Unarmed,No,No,2012,7,Sunday,"(42.35174635, -71.16590953)"
9,MedAssist,07/08/2012 08:05:00 AM,Unarmed,No,No,2012,7,Sunday,"(42.25938275, -71.11729354)"


In [169]:
# somerville happiness data
happy = pd.read_csv('somerville_happy.csv')
happy.columns = [
            'ID', 'Year', 'Current Happy',
           'Overall Life Satisfaction',
           'Somerville Satisfaction',
           'Individual Similarity To Acquaitances 2011',
           'Rely on advice or self for decision making 2011',
           'Neighborhood satisfaction',
           'Proud to be Somerville resident 2015',
           'City Services Availability Rating 2015',
           'Availability of affordable housing 2011',
           'Cost of Housing Rating',
           'Rate cost of public schools 2011',
           'Rate overall cost of public schools 2011',
           'Rate beauty or physical setting 2011',
           'Rate beauty or physical setting 2013',
           'Rate effectiveness of local police 2011_2013',
           'Trust in local police 2015',
           'Rate maintenance of streets, sidewalks and squares 2013',
           'Rate maintenance of streets, sidewalks and squares 2015',
           'Availability of social communities and events',
           'Safety walking in neighborhood at night 2013',
           'Safety walking in community at night 2015',
           'Rate beauty or physical setting of neighborhood 2015',
           'Satisfaction with appearance of parks and squares 2013',
           'Satisfaction with local parks and squares',
           'Gender', 'Gender 2011', 'Age',
           'marital status 2011',
           'Household languages (non-english) 2015',
           'Ethnicity 2011_2013',
           'Is Hispanic 2013',
           'Ethnicity 2015',
           'Lives with Minors',
           'Housing Status',
           'Plans to leave Somerville within 2 years',
           'Years Lived in Somerville',
           'Annual Household Income',
           'Neighborhood', 'Is Student', 'Ward',
           'Precinct'
    ]

def normalize_income(income):
    if income == 'Less than $10,000':
        return income
    
    if income in ['$25,000 to $49,999', '40,000 - $49,999',
                  '$10,000 to $24,999', '30,000 - $39,999',
                  '20,000 - $29,999', '10,000 - $19,999'
                  ]:
        return '<50k'
    elif income in ['50,000 - $59,999', '60,000 - $69,999',
                   '70,000 - $79,999', '80,000 - $89,999',
                   '90,000 - $99,999', '$50,000 to $74,999',
                   '$75,000 to $99,999']:
        return '50k-100k'
    else:
        return '>100k'
    
def norm_eth(eth):
    ethnicity = eth.split(',')[0].split('/')[0].strip()
    if 'white' in ethnicity or 'asian' in ethnicity:
        if 'black' in ethnicity or 'African-American' in ethnicity:
            return 'black'
        return 'white/asian'
    elif 'Black' in ethnicity or 'African-American' in ethnicity:
        return 'Black'
    elif 'R' == ethnicity:
        return None
    elif 'American Indian' == ethnicity:
        return 'Native American'
    else:
        return ethnicity

    

happy['Income Bracket'] = happy['Annual Household Income'].apply(normalize_income)
happy['ETHNICITY'] = (happy[['Ethnicity 2011_2013', 'Ethnicity 2015']]
                      .fillna('')
                      .sum(axis=1)
                      .apply(norm_eth))



def get_income_data():
    incomes = pd.DataFrame(happy[happy['Year'] == 2011]['Income Bracket'].value_counts())
    incomes['2015'] = happy[happy['Year'] == 2015]['Income Bracket'].value_counts().sort_index()
    incomes.columns = ['2011', '2015']
    incomes['2011 %'] = incomes['2011'].apply(lambda x: x/float(incomes['2011'].sum()))
    incomes['2015 %'] = incomes['2015'].apply(lambda x: x/float(incomes['2015'].sum()))
    return incomes

def get_happy_data(year, income_bracket):
    # happiness data
    # happy[(happy['Year'] == 2015) & (happy['Income Bracket'] == '<50k')]['Current Happy'].mean()
    current_happy = (happy
        .where((happy['Year'] == year) & (happy['Income Bracket'] == income_bracket))
        ['Overall Life Satisfaction']
        .value_counts()
        .sort_index()
        .reset_index())
    current_happy['index'] = pd.to_numeric(current_happy['index'], 
                                           errors='coerce')
    current_happy['Overall Life Satisfaction'] = pd.to_numeric(current_happy['Overall Life Satisfaction'], 
                                           errors='coerce')

    current_happy = (current_happy
     .dropna()
     .where(current_happy['index'] < 11)
     .dropna(how='any')
     .sort_index())

    current_happy['Less than 6'] = current_happy['index'].apply(lambda x: x < 6)
    current_happy.groupby('Less than 6').sum()
    
writer = pd.ExcelWriter('happy_cleaned.xls')
happy.to_excel(writer, 'Sheet1')
get_income_data().to_excel(writer, 'Sheet2')
writer.save()
    




In [197]:
def get_age_bracket(age):
    if age < 13 or age > 80:
        return None
    if age < 18:
        return '<18'
    if age < 30:
        return '18 - 30'
    
    return ' 30+'
        

fio = pd.read_csv('bpd_fio.csv')

# init columns
fio['FRISKED'] = fio['OUTCOME'].astype(str).apply(lambda x: x.find('F') != -1)
fio['SEIZED'] = fio['OUTCOME'].astype(str).apply(lambda x: x.find('S') != -1)
fio['AGE_BRACKET'] = fio['AGE_AT_FIO_CORRECTED'].apply(get_age_bracket)
fio['YEAR'] = fio['FIO_DATE'].apply(lambda x: x[6:10])
fio['MONTH'] = fio['FIO_DATE'].apply(lambda x: x[0:2])
fio['DAY'] = fio['FIO_DATE'].apply(lambda x: x[3:5])

#total stop breakdown
fio[fio['YEAR'] == '2015']['CITY'].value_counts()
# % black
# fio.RACE_DESC.value_counts() / len(fio.index)

# # % black by neighborhood
# (fio.groupby(['CITY', 'RACE_DESC']).size() / fio.groupby(['CITY']).size()).sort_values(ascending=False)

# # city-wide makeup w/ no priors
# no_priors = fio[fio['PRIORS'] == 'NO']
# no_priors.RACE_DESC.value_counts() / len(no_priors.index)
# # neighborhood dist with no priors
# (no_priors.groupby(['CITY', 'RACE_DESC']).size() / no_priors.groupby(['CITY']).size())
# # city dist w/ priors
# has_priors = fio[fio['PRIORS'] == 'YES']
# has_priors.RACE_DESC.value_counts() / len(has_priors.index)
# # stop reasons
# (fio.groupby(['RACE_DESC', 'BASIS']).size() / fio.groupby('RACE_DESC').size())
# # terrorism
# t = fio[fio.TERRORISM == 'YES']
# t.groupby(['RACE_DESC', 'STOP_REASONS']).size()
# # % w/ priors by race *** black people that were stopped more likely to have priors
# (fio.groupby(['RACE_DESC', 'PRIORS']).size() / fio.groupby('RACE_DESC').size())
# # age distribution - median age is 25
# fio.AGE_AT_FIO_CORRECTED.median()
# # median age by race - blacks have lowest median age (24) & whites have the highest (31)
# fio[(~fio['AGE_BRACKET'].isnull()) & (fio['YEAR'] == '2015')].groupby('RACE_DESC').agg('median')['AGE_AT_FIO_CORRECTED']

NO DATA ENTERED    4808
Boston             1315
Dorchester          950
OTHER               701
Roxbury             479
Hyde Park           216
East Boston         211
South Boston        207
Mattapan            167
Roslindale          154
Jamaica Plain       148
Brighton             55
West Roxbury         40
Charlestown          22
South End            10
Allston               9
Downtown              1
Beacon Hill           1
Name: CITY, dtype: int64

In [193]:
# crime_data = pd.read_csv('crime_july2012-august2015.csv', dtype='unicode')

# with open('site/public/json/neighborhoods.json') as f:
#     js = json.load(f)

# polygons = [shape(feature['geometry']) for feature in js['features']]
# def get_neighborhood(x):
#     lat, lon, count = x[0], x[1], x[2]
#     if count % 500 == 0:
#         print(count)
#     point = Point(lon, lat)
#         # check each polygon to see if it contains the point
#     for i, polygon in enumerate(polygons):
#         if polygon.contains(point):
#             return js['features'][i]['properties']['Name']
        
#     return None
            
# crime_data['location_lat'] = crime_data['Location'].apply(lambda x: float(x.split(',')[0][1:]))
# crime_data['location_lon'] = crime_data['Location'].apply(lambda x: float(x.split(',')[-1][:-1]))
# crime_data['count'] = crime_data.reset_index().index
# crime_data['Neighborhood'] = crime_data[['location_lat', 'location_lon', 'count']].apply(get_neighborhood, axis=1)
# crime_data.columns[:]
shootings_2015 = crime_data[(crime_data['Year'] == '2015') & (crime_data['Shooting'] == 'Yes')]
shootings_2015 = pd.DataFrame(shootings_2015.Neighborhood.value_counts()).sort_index()
shootings_2015 = shootings_2015.reset_index()
shootings_2015.columns = ['Neighborhood', 'Shootings2015']
shootings_2015 = shootings_2015.set_index('Neighborhood')
shootings_2015

    

Unnamed: 0_level_0,Shootings2015
Neighborhood,Unnamed: 1_level_1
Back Bay,1
Dorchester,41
Downtown,1
East Boston,2
Hyde Park,5
Jamaica Plain,6
Longwood Medical Area,2
Mattapan,8
Mission Hill,3
Roslindale,3


In [183]:

def is_violent(row):
    desc, weapon = row
    if 'Assault' in desc or 'ASSAULT' in desc:
        return True
    elif weapon not in ['None', 'Unarmed', 'Other']:
        return True
    return False
        
crime_data['violent'] = crime_data[['INCIDENT_TYPE_DESCRIPTION', 'WEAPONTYPE']].apply(is_violent, axis=1)
crime_data[(crime_data['Year'] == '2015') & (crime_data['Neighborhood'] == 'Back Bay') & (crime_data['violent'] == True)]

crime_data.to_csv('crime_data_cleaned.csv', sep='@')

In [264]:
crime_data_2015 = pd.read_csv('crime_data_cleaned.csv', sep='@').where(crime_data['Year'] == '2015')
violent_crimes = (pd.DataFrame(crime_data_2015[crime_data_2015['violent'] == True]
                              .Neighborhood
                              .value_counts())
                            .reset_index())


# violent_crimes.columns = ['Neighborhood', 'Violent Crimes']
# violent_crimes = violent_crimes.set_index('Neighborhood').sort_index()
neighborhoods = ['Roxbury', 'Brighton', 'Fenway']
hood_cmp = crime_data_2015[(crime_data_2015['violent'] == True) & (crime_data_2015['Neighborhood'].isin(neighborhoods))]

hood_cmp['date_str'] = hood_cmp['FROMDATE'].astype(str).apply(lambda x: x[0:10])



hood_cmp = hood_cmp[['Neighborhood', 'date_str', 'violent']].groupby(['Neighborhood', 'date_str']).count()
hood_cmp = hood_cmp.reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0                            NaN
1                            NaN
2                            NaN
3                            NaN
4                            NaN
5                            NaN
6                            NaN
7                            NaN
8                            NaN
9                            NaN
10                           NaN
11                           NaN
12                           NaN
13                           NaN
14                           NaN
15                           NaN
16                           NaN
17                           NaN
18                           NaN
19                           NaN
20                           NaN
21                           NaN
22                           NaN
23                           NaN
24                           NaN
25                           NaN
26                           NaN
27                           NaN
28                           NaN
29                           NaN
          

In [260]:
# hood_cmp.to_csv('site/public/csv/neighborhood_crime_cmp.csv')


<pandas.core.indexing._LocIndexer at 0x7f3c6497c7d0>

In [230]:
census = pd.read_csv('ma_census_2015.csv', dtype='unicode', skiprows=1)
tract_data = pd.read_csv('tract_data.csv', delimiter=' ')
# pop = pd.read_csv('pop_total.csv', dtype='unicode', skiprows=1)
race = pd.read_csv('race_hispanic.csv', skiprows=1)


def get_tract_id(label):
    s = label.split(',')
    return s[0][12:]

def wavg(group, avg_name, weight_name):
    """ 
    http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
    """
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()

def pct(group, num):
    n = group[num]
    d = group['pop']
    return n.sum() / d.sum()
    
census['tract'] = census['Geography'].apply(get_tract_id)
census = (census
          .rename(columns = {'Id2':'GEOID'})
          .set_index('GEOID'))

# population data
# pop = (pop
#           .rename(columns = {'Id2':'GEOID'})
#           .set_index('GEOID'))
# pop = pd.DataFrame(pop['Estimate; Total'])

#race data
race = (race
          .rename(columns = {'Id2':'GEOID'})
          .set_index('GEOID'))

race.columns
race = race[[
        'Estimate; Total:',
       u'Estimate; Not Hispanic or Latino: - White alone',
       u'Estimate; Not Hispanic or Latino: - Black or African American alone',
       u'Estimate; Not Hispanic or Latino: - American Indian and Alaska Native alone',
       u'Estimate; Not Hispanic or Latino: - Asian alone',
       u'Estimate; Not Hispanic or Latino: - Native Hawaiian and Other Pacific Islander alone',
       u'Estimate; Not Hispanic or Latino: - Some other race alone',
       u'Estimate; Not Hispanic or Latino: - Two or more races:',
         'Estimate; Hispanic or Latino:'
    ]]

race_columns = ['pop', 'White', 'Black', 'AmericanIndianNativeAlaskan', 'Asian', 'NativeHawaiianPacificIslander',
                'OtherRace', 'TwoOrMoreRaces', 'HispanicLatino']
race.columns = race_columns

race = race.set_index(race.index.astype(str))

# census tract -> neighhborhood map
tract_data['GEOID'] = tract_data['GEOID'].astype(str)
tract_data = tract_data.set_index('GEOID')

# join above
census = census.join(tract_data, how='inner')
census
census = census[['Neighborhood', 'tract',
                 'Percent; EMPLOYMENT STATUS - Population 16 years and over - In labor force - Civilian labor force - Unemployed', 
                 'Percent; PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL - All families'
                ]]
census.columns = ['Neighborhood', 'tract', 'unemployment rate', 'percent families below poverty level']
census = census.join(race, how='inner').replace('-', np.nan)

census[['pop', 'unemployment rate', 'percent families below poverty level']] = census[['pop', 'unemployment rate', 'percent families below poverty level']].astype(float)

# weighted avg percents to get neighborhood percents
employment_by_neighborhood = census.groupby('Neighborhood').apply(wavg, 'unemployment rate', 'pop')
poverty_by_neighborhood = census.groupby('Neighborhood').apply(wavg, 'percent families below poverty level', 'pop')


# calculate percentage

avgs = pd.DataFrame(employment_by_neighborhood, columns=['unemploymentPct'])
avgs['povertyPct'] = poverty_by_neighborhood
avgs['pop'] = census.groupby('Neighborhood').sum()['pop']
avgs = avgs.reset_index()
avgs['Neighborhood'] = avgs['Neighborhood'].apply(lambda x: x.replace('_', ' '))
avgs = avgs.set_index('Neighborhood').sort_index()

avgs = avgs.join(violent_crimes, how='inner')
avgs['violentCrimePerCapita'] = avgs['Violent Crimes'] / avgs['pop']
avgs['violentCrimesPerThousandPpl'] = avgs['violentCrimePerCapita'] * 1000
avgs['shootings'] = shootings_2015['Shootings2015']


race_totals = census.groupby('Neighborhood').sum()[race_columns].reset_index()
race_totals['Neighborhood'] = race_totals['Neighborhood'].apply(lambda x: x.replace('_', ' '))
race_totals = race_totals.set_index('Neighborhood').sort_index()

for race in race_columns:
    if race == 'pop':
        continue
    col_name = race.lower() + 'Pct'
    avgs[col_name] = (race_totals[race] / race_totals['pop']) * 100

ranked_cols = ['unemploymentPct', 'povertyPct', 'violentCrimePerCapita']

for col in ranked_cols:
    avgs["{}Rank".format(col)] = avgs[col].rank(ascending=False)
  

# avgs.to_csv('site/public/csv/averages_2015.csv')

neighborhood_stats = avgs[['unemploymentPct', 'povertyPct', 'pop', 'violentCrimesPerThousandPpl', 'shootings']]
neighborhood_stats.columns = ['unemploymentPct', 'povertyPct', 'pop', 'vCrime1000', 'shootings']

neighborhood_stats = neighborhood_stats.fillna(value=0).transpose().to_dict()


Unnamed: 0_level_0,unemploymentPct,povertyPct,pop,Violent Crimes,violentCrimePerCapita,violentCrimesPerThousandPpl,shootings,whitePct,blackPct,americanindiannativealaskanPct,asianPct,nativehawaiianpacificislanderPct,otherracePct,twoormoreracesPct,hispaniclatinoPct,unemploymentPctRank,povertyPctRank,violentCrimePerCapitaRank
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Allston,5.044062,15.478766,19761.0,125,0.006326,6.325591,,57.274429,5.804362,0.065786,16.841253,0.0,0.921006,2.945195,16.147968,9.0,9.0,7.0
Back Bay,2.515839,5.51754,17577.0,136,0.007737,7.737384,1.0,76.418046,4.613984,0.07965,9.13694,0.0,0.364112,1.268703,8.118564,20.0,19.0,5.0
Beacon Hill,3.016497,2.129113,9305.0,27,0.002902,2.901666,,87.243418,1.171413,0.0,5.792585,0.0,0.214938,1.429339,4.148307,19.0,22.0,18.0
Brighton,3.669086,14.905479,47768.0,109,0.002282,2.281862,,66.877407,4.084324,0.301457,15.340814,0.050243,0.724334,2.18975,10.43167,16.0,10.0,20.0
Charlestown,4.702719,21.56226,18058.0,84,0.004652,4.651678,,69.636726,8.622217,0.0,8.738509,0.0,0.465168,1.57271,10.964669,12.0,3.0,15.0
Dorchester,8.755388,20.177067,124489.0,1188,0.009543,9.543012,41.0,22.073436,44.142053,0.399232,10.608969,0.021689,3.94573,2.478131,16.33076,2.0,4.0,3.0
East Boston,6.78082,16.34861,44989.0,232,0.005157,5.156816,2.0,32.007824,2.367245,0.062237,3.585321,0.0,0.302296,3.743137,57.931939,5.0,8.0,10.0
Fenway,5.506101,18.82882,32210.0,132,0.004098,4.098106,,61.400186,5.02018,0.149022,18.674325,0.043465,0.381869,2.877988,11.452965,7.0,6.0,16.0
Hyde Park,6.351196,8.96349,35585.0,169,0.004749,4.749192,5.0,25.926654,45.117325,0.115217,2.096389,0.0,0.986371,2.222847,23.535197,6.0,15.0,12.0
Jamaica Plain,4.697646,13.495486,39207.0,188,0.004795,4.795062,6.0,54.21991,11.610172,0.0,5.521973,0.0,0.566225,2.629632,25.452088,13.0,11.0,11.0


In [227]:
# combine dataframe and geojson
path_to_geojson = 'site/public/json/neighborhoods.json'
with open(path_to_geojson) as f:
    js = json.load(f)
    for neighborhood in js['features']:
        props = neighborhood['properties']
        name = props['Name']
        stats = neighborhood_stats[name] if name in neighborhood_stats else {}
        for k,v in stats.iteritems():
            if k == 'pop':
                v = int(v)
            elif type(v) == np.float64:
                v = np.round(v,1)
            print("adding {}=>{} to {}".format(k,v,name))
            props[k] = v

with open(path_to_geojson, 'w') as f:          
    json.dump(js, f)

        

adding shootings=>3.0 to Roslindale
adding unemploymentPct=>6.8 to Roslindale
adding pop=>28644 to Roslindale
adding povertyPct=>12.0 to Roslindale
adding vCrime1000=>3.7 to Roslindale
adding shootings=>6.0 to Jamaica Plain
adding unemploymentPct=>4.7 to Jamaica Plain
adding pop=>39207 to Jamaica Plain
adding povertyPct=>13.5 to Jamaica Plain
adding vCrime1000=>4.8 to Jamaica Plain
adding shootings=>3.0 to Mission Hill
adding unemploymentPct=>4.8 to Mission Hill
adding pop=>16700 to Mission Hill
adding povertyPct=>31.1 to Mission Hill
adding vCrime1000=>4.7 to Mission Hill
adding shootings=>2.0 to Longwood Medical Area
adding unemploymentPct=>5.5 to Longwood Medical Area
adding pop=>5266 to Longwood Medical Area
adding povertyPct=>8.2 to Longwood Medical Area
adding vCrime1000=>5.3 to Longwood Medical Area
adding shootings=>0.0 to Leather District
adding unemploymentPct=>3.8 to Leather District
adding pop=>5668 to Leather District
adding povertyPct=>8.7 to Leather District
adding vCrim