In [23]:
# SET UP
import pandas as pd
import altair as alt
import folium, os
# never truncate the column list when a dataframe is displayed
pd.set_option('display.max_columns', None)

## lookback period: the years the analysis covers (kept as text)
lookback_years = ['2016', '2017', '2018', '2019']
## first year of the lookback period; only used for chart titles later on
min_year = min(lookback_years)
## one-column ('YEAR') dataframe of the lookback years, used later to repeat records per year
years = pd.DataFrame({'YEAR': lookback_years})


# ASSESSMENT DATA
### TODO: ADD DROP ROW WITH EMPTY, 'NA' OR 'UNKNOWN' NEIGHBORHOOD OR ADDRESS ###
## read the raw csv (every column as text), keep only the needed columns,
## remove duplicate rows, and shorten the column headers
asmt = (
    pd.read_csv('/home/dan/Python/QueenCityCounts/llrd_code/data/2019-2020_Assessment_Roll.csv', dtype=object)
    .loc[:, ['PRINT KEY','PROPERTY CLASS','NEIGHBORHOOD', 'HOUSE NUMBER', 'STREET']]
    .drop_duplicates()
    .rename(columns={'PRINT KEY':'SBL',
                     'PROPERTY CLASS':'PROP_TYPE',
                     'NEIGHBORHOOD':'NBHD',
                     'HOUSE NUMBER':'NUMBER'})
)
## build 'ADDRESS' as "<NUMBER> <STREET>"; both parts are converted to text first,
## so a missing part shows up as the literal text 'nan'
asmt['ADDRESS'] = [' '.join(parts) for parts in asmt[['NUMBER','STREET']].values.astype(str)]


# CODE VIOLATIONS DATA
### TODO: ADD DROP ROW WITH EMPTY, 'NA' OR 'UNKNOWN' ADDRESS ###
## read the raw csv (every column as text), keep only the needed columns, remove duplicate rows
vios = pd.read_csv('/home/dan/Python/QueenCityCounts/llrd_code/data/Code_Violations.csv', dtype=object)
vios = vios.loc[:, ['DATE', 'UNIQUEKEY', 'ADDRESS']].drop_duplicates()
## keep only the text before the first space of 'DATE', then parse it as a datetime
date_text = vios['DATE'].map(lambda raw: str(raw).split(' ')[0])
vios['DATE'] = pd.to_datetime(date_text)
## 'YEAR' is kept as text so it can be matched against the lookback years
vios['YEAR'] = [str(stamp.year) for stamp in vios['DATE']]
## split 'ADDRESS' at its first space: the head becomes 'NUMBER', the remainder 'STREET'
split_addresses = [str(addr).partition(' ') for addr in vios['ADDRESS']]
vios['NUMBER'] = [head for head, _, _ in split_addresses]
vios['STREET'] = [tail for _, _, tail in split_addresses]
## count violations per address, per year (the count of non-missing 'UNIQUEKEY' values)
vios = vios.rename(columns={'UNIQUEKEY':'VIOLATIONS'})
vios = vios.groupby(['NUMBER', 'STREET', 'YEAR'], as_index=False)['VIOLATIONS'].count()


# RENTAL PROPERTY DATA
## read in raw csv (every column as text)
rent = pd.read_csv('/home/dan/Python/QueenCityCounts/llrd_code/data/Rental_Registry.csv', dtype=object)
### TODO: DROP ROWS WITH EMPTY, 'NA' OR 'UNKNOWN' SBL
## drop unnecessary columns, remove duplicate rows
rent = rent[['Print Key', 'Address', 'License Status', 'Issued Datetime', 'Expiration Datetime']].drop_duplicates()
## only proceed with 'Active' records
## (.copy() makes the filtered rows an independent frame, so the column assignments
##  below do not write to a slice of the unfiltered data / raise SettingWithCopyWarning)
rent = rent[rent['License Status']=='Active'].copy()
## split 'Address' at its first space into a 'NUMBER' and a 'STREET' column
rent['NUMBER'] = rent['Address'].apply(lambda x: str(x).split(' ')[0])
rent['STREET'] = rent['Address'].apply(lambda x: ' '.join(str(x).split(' ')[1:]))
## rename column headers
rent = rent.rename(columns={'License Status':'STATUS','Issued Datetime':'ISSUED',
                            'Expiration Datetime':'EXPIRES','Print Key':'SBL','Address':'ADDRESS'})
## change date columns from string to datetime datatypes
rent['ISSUED'] = pd.to_datetime(rent['ISSUED'])
rent['EXPIRES'] = pd.to_datetime(rent['EXPIRES'])
## flag column: every row that is left is an active rental registration
rent['IS_RENTAL'] = 1


# JOIN RENTAL REGISTRY AND VIOLATIONS
# IMPORTANT: OpenDataBuffalo reports Rental Status is by SBL, but Code Violations are reported by address
## a constant 'key' column on both frames turns the merge below into a cross join
years = years.assign(key=1)
asmt = asmt.assign(key=1)
## repeat asmt dataframe for each year (one row per assessment record, per lookback year)
df = asmt.merge(years, on='key',how='inner').drop(columns=['key','ADDRESS'])
## asmt <- rental (on sbl as key)
## note: this assumes every residence on the rental registry has always been a rental,
## back to the start of the analysis period, since rental registry is not a panel dataset
df = df.merge(rent[['SBL', 'IS_RENTAL']].drop_duplicates(), on='SBL', how='left')
## sbls with no registry match are not rentals
## (assigned back rather than `df[col].fillna(..., inplace=True)`: the in-place call on a
##  column selection is deprecated and has no effect under pandas Copy-on-Write)
df['IS_RENTAL'] = df['IS_RENTAL'].fillna(0)
## asmt+rental <- vios (on number, street, and year, as keys)
df = df.merge(vios, on=['NUMBER','STREET','YEAR'], how='left')
## address/year combinations with no match have zero violations
df['VIOLATIONS'] = df['VIOLATIONS'].fillna(0)
## sort dataframe (which now is asmt*year+rental+viols) and reset index
df = df.sort_values(['YEAR','NUMBER','STREET'], ascending=True).reset_index(drop=True)


# AGGREGATE BY NEIGHBORHOOD
## filter by 'PROP_TYPE': keep rows whose property class code starts with '2'
## (na=False: a missing class is excluded instead of raising, as indexing x[0] would)
pt = df[df['PROP_TYPE'].str.startswith('2', na=False)]
## drop unnecessary columns, remove duplicate rows
## (the sums below are only tested for zero / non-zero, so collapsing identical yearly rows is harmless)
pt = pt[['SBL','NBHD','IS_RENTAL','VIOLATIONS']].drop_duplicates()
## for each sbl, are they on the rental registry, and have they had any citations during the lookback period
## ('sum' as a string: passing the builtin `sum` as aggfunc is deprecated)
pt = pd.pivot_table(pt, index=['NBHD', 'SBL'], values=['VIOLATIONS','IS_RENTAL'],
                    aggfunc={'IS_RENTAL':'sum','VIOLATIONS':'sum'}).reset_index()
pt['IS_RENTAL'] = pt['IS_RENTAL'].apply(lambda x: 'NO' if x==0.0 else 'YES')
pt['VIOLATIONS'] = pt['VIOLATIONS'].apply(lambda x: 'NO' if x==0.0 else 'YES')
## for each neighborhood, count sbls by rental/nonrental, and vios/no-vios
## (missing combinations are filled with 0 *before* the percentages are computed, so a group
##  with no violations gets NO_VIOS_PCT = 1 rather than NaN -> 0; the reindex guarantees
##  that both the 'NO' and the 'YES' column exist)
nbhd_counts = (
    pt.groupby(['NBHD', 'IS_RENTAL', 'VIOLATIONS'])
      .size()
      .unstack('VIOLATIONS', fill_value=0)
      .reindex(columns=['NO', 'YES'], fill_value=0)
)
## totals per neighborhood / rental flag (never zero: a group only exists if it has at least one sbl)
nbhd_totals = nbhd_counts['NO'] + nbhd_counts['YES']
## assemble the result with named, numerically typed columns
## (raw counts are kept as floats, as in the previously displayed table; 'TOTAL' is an integer)
pt = pd.DataFrame({
    'NO_VIOS_RAW': nbhd_counts['NO'].astype(float),
    'VIOS_RAW': nbhd_counts['YES'].astype(float),
    'NO_VIOS_PCT': nbhd_counts['NO'] / nbhd_totals,
    'VIOS_PCT': nbhd_counts['YES'] / nbhd_totals,
    'TOTAL': nbhd_totals.astype(int),
}).reset_index()
pt

Unnamed: 0,NBHD,IS_RENTAL,NO_VIOS_RAW,VIOS_RAW,NO_VIOS_PCT,VIOS_PCT,TOTAL
0,Allentown,NO,486.0,22.0,0.956693,0.043307,508
1,Allentown,YES,99.0,6.0,0.942857,0.057143,105
2,Black Rock,NO,457.0,94.0,0.829401,0.170599,551
3,Black Rock,YES,210.0,160.0,0.567568,0.432432,370
4,Broadway Fillmore,NO,1678.0,360.0,0.823356,0.176644,2038
...,...,...,...,...,...,...,...
67,Upper West Side,YES,505.0,367.0,0.579128,0.420872,872
68,West Hertel,NO,597.0,64.0,0.903177,0.096823,661
69,West Hertel,YES,153.0,68.0,0.692308,0.307692,221
70,West Side,NO,1019.0,268.0,0.791764,0.208236,1287


In [25]:
# CHARTS
## data prep
## drop the raw counts and reshape pt from wide to long:
## one row per neighborhood / rental flag / percent column
alt_pt = (
    pt.drop(columns=['VIOS_RAW','NO_VIOS_RAW'])
      .melt(id_vars=['NBHD','IS_RENTAL','TOTAL'])
)
## recast rental flag to 'RENT' for rental property and 'OO' for owner-occupied
alt_pt['TYPE'] = ['RENT' if flag=='YES' else 'OO' for flag in alt_pt['IS_RENTAL']]
## the flag itself is no longer needed; give the melted columns meaningful names; nans to zeros
alt_pt = (
    alt_pt.drop(columns=['IS_RENTAL'])
          .rename(columns={'value':'PERCENT','variable':'STATUS'})
          .fillna(0)
)

## iterate through prepared data, and make one bar chart per neighborhood
## charts: neighborhood name -> altair chart of the share of residences that received a citation
charts = {}
for nbhd in alt_pt['NBHD'].unique():
    ## only the 'VIOS_PCT' rows of this neighborhood are plotted
    tmp = alt_pt[(alt_pt['NBHD']==nbhd) & (alt_pt['STATUS']=='VIOS_PCT')]
    ## number of residences per type for the subtitle
    ## (taken with .iloc[0] because int() on a Series is deprecated; a neighborhood
    ##  that lacks one of the two types reports 0 instead of raising)
    totals = {}
    for kind in ('OO', 'RENT'):
        matched = tmp.loc[tmp['TYPE']==kind, 'TOTAL']
        totals[kind] = int(matched.iloc[0]) if len(matched) else 0
    ## two-line title: neighborhood name, then the counts with thousands separators
    title = [nbhd, 'OO: ' + format(totals['OO'], ',d') + ' | RENT: ' + format(totals['RENT'], ',d')]
    chart = alt.Chart(tmp[['NBHD','STATUS','TYPE','PERCENT']], title = title).mark_bar().encode(
        x=alt.X('TYPE', sort=['OO','RENT'], title = 'type of residence'),
        ## fixed 0-100% scale so the charts are comparable across neighborhoods
        y=alt.Y('PERCENT:Q',
                axis=alt.Axis(format='%', title = '% that received a citation since ' + min_year),
                scale=alt.Scale(domain=(0, 1))),
        color=alt.Color('STATUS:N', legend = None,
                        scale=alt.Scale(
                            domain=['NO_VIOS_PCT','VIOS_PCT'],
                            range=['lightblue','red'])),
        order=alt.Order('TYPE')
    )
    charts[nbhd] = chart

## read out charts dictionary for html outfile
## layout of the page: each inner list is one row of charts, left to right
chart_rows = [
    ['Allentown', 'Black Rock', 'Broadway Fillmore', 'Central', 'Central Park'],
    ['Delavan Grider', 'Ellicott', 'Elmwood Bidwell', 'Elmwood Bryant', 'Fillmore-Leroy'],
    ['First Ward', 'Fruit Belt', 'Genesee-Moselle', 'Grant-Amherst', 'Hamlin Park'],
    ['Hopkins-Tifft', 'Kaisertown', 'Kenfield', 'Kensington-Bailey', 'Lovejoy'],
    ['Lower West Side', 'Masten Park', 'MLK Park', 'North Park', 'Parkside'],
    ['Pratt-Willert', 'Riverside', 'Schiller Park', 'Seneca Babcock', 'Seneca-Cazenovia'],
    ['South Park', 'University Heights', 'Upper West Side', 'West Hertel', 'West Side'],
]
## `|` places charts side by side, `&` stacks the finished rows
page = None
for row_names in chart_rows:
    row = charts[row_names[0]]
    for name in row_names[1:]:
        row = row | charts[name]
    page = row if page is None else page & row
page.save('/home/dan/Python/QueenCityCounts/llrd_code/charts.html')

In [114]:
# MAP
## location of the neighborhoods geojson file (relative to the current working directory)
geojson = os.path.join('data','Neighborhoods.geojson')
## initialize map at the given latitude/longitude and zoom level
m = folium.Map([42.900155, -78.8485], zoom_start=12)
## tooltip that shows the 'nbhdname' field of a shape, without a field label
nbhd_tooltip = folium.GeoJsonTooltip(fields=['nbhdname'], labels=False)
## overlay geojson to map object
nbhd_layer = folium.GeoJson(geojson, tooltip=nbhd_tooltip)
nbhd_layer.add_to(m)
## outfile to html
m.save('/home/dan/Python/QueenCityCounts/llrd_code/map.html')