# Cleaning Campaign Finance Data

In [30]:
import geocoder
import pandas as pd

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [35]:
filename_campaign_fin = '~/git/greenburgh-town-supervisor-2019/src/data/campaign_finance/Opponent Campaign Contributions.xlsx'
output_filename_campaign_fin = '~/git/greenburgh-town-supervisor-2019/output/campaign_finance/campaign_contributions_cleaned.csv'

In [34]:
def geocode(address):
    """
    Geocode using Open Street Maps taking in Address as an argument
    """
    print(address)
    g = geocoder.osm(address)
    if g.status == 'ERROR - No results found':
        return None
    else:
        return (g.json['lat'], g.json['lng'])

In [10]:
df = pd.read_excel(filename_campaign_fin)

In [11]:
df.head()

Unnamed: 0,contributor,amount,contribution_date,report,schedule,year,schedule_description,company,relevant_links,comments
0,"CANNON, DONALD W\n40 PINERIDGE ROAD\nWHITE PLA...",2000.0,2019-06-21,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,Another Nine LLC (CFO),https://www.mylife.com/donald-cannon/e98303177148,
1,"GUGGENHEIMER, PETER \n15 CHEDWORTH ROAD\nSCARS...",1000.0,2019-06-19,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,Guggenheimer Architects,https://www.mylife.com/peter-guggenheimer/e340...,
2,"GRALLA, YVETTE \n1017 SAW MILL RIVER ROAD\nARD...",1000.0,2019-06-17,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,,
3,", CONSTRUCTION INDUSTRY COUNCIL NYS PAC\n629 O...",300.0,2019-06-24,2019 10 Post Primary,C,2019,Monetary Contributions/All Other,,,
4,"HERMANN, ROBERT \n46 TOPLAND RD\nHARTSDALE, NY...",250.0,2019-06-13,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,https://www.mylife.com/robert-hermann/e3124970...,


In [50]:
x = df.contributor[0]

In [51]:
x.split('\n')

['CANNON, DONALD W', '40 PINERIDGE ROAD', 'WHITE PLAINS, NY 10603']

In [52]:
# Street Address
x.split('\n')[1]

'40 PINERIDGE ROAD'

In [53]:
# City
x.split('\n')[2].split(',')[0]

'WHITE PLAINS'

In [54]:
# State
x.split('\n')[2].split(',')[1].strip().split(' ')[0]

'NY'

In [55]:
# Zip
x.split('\n')[2].split(',')[1].strip().split(' ')[1]

'10603'

## Cleaning Up Address Field

In [70]:
df['address'] = df.contributor.apply(lambda x: ' '.join(x.split('\n')[1:]))

In [71]:
df.address.value_counts(dropna=False).head()

 ,                                       30
120 CARTHAGE ROAD SCARSDALE, NY 10583    10
317 ARDSLEY ROAD SCARSDALE, NY 10583     10
70 OLD ARMY ROAD SCARSDALE, NY 10583      9
55 GRASSLANDS ROAD VALHALLA, NY 10595     8
Name: address, dtype: int64

### Extracting Street Address

In [72]:
df['street_address'] = df.contributor.apply(lambda x: x.split('\n')[1] if(len(x.split('\n')) > 2) else None)

### Extracting City

In [73]:
df['city'] = df.contributor.apply(lambda x: x.split('\n')[2].split(',')[0] if(len(x.split('\n')) > 2) else None)

### Extracting State

In [74]:
df['state'] = df.contributor.apply(lambda x: x.split('\n')[2].split(',')[1].strip().split(' ')[0] if(len(x.split('\n')) > 2) else None)

In [89]:
df.state.value_counts(dropna=False, normalize=True)

NY     0.947779
       0.028839
CT     0.008574
FL     0.006235
NC     0.002338
GA     0.001559
NJ     0.001559
TN     0.000779
DC     0.000779
CN     0.000779
NaN    0.000779
Name: state, dtype: float64

In [80]:
df[['state', 'year']].groupby(['year']).state.value_counts(normalize=True)

year  state
2006  NY       0.915493
               0.042254
      FL       0.028169
      DC       0.014085
2007  NY       0.950530
               0.028269
      FL       0.010601
      CT       0.007067
      NC       0.003534
2008           0.500000
      NY       0.500000
2009  NY       0.895238
               0.047619
      CT       0.028571
      FL       0.019048
      NC       0.009524
2011  NY       0.936937
               0.027027
      CT       0.027027
      NC       0.009009
2013  NY       0.940789
               0.046053
      CT       0.013158
2014  NY       0.750000
               0.250000
2015  NY       0.962457
               0.020478
      CN       0.003413
      CT       0.003413
      FL       0.003413
      GA       0.003413
      NJ       0.003413
2016  NY       1.000000
2017  NY       0.978417
               0.007194
      NJ       0.007194
      TN       0.007194
2018  NY       1.000000
2019  NY       0.981308
               0.009346
      GA       0.009346
Name

### Extracting Zip

In [76]:
df['zipcode'] = df.contributor.apply(lambda x: x.split('\n')[2].split(',')[1].strip().split(' ')[1] if(len(x.split('\n')) > 2 and len(x.split('\n')[2].split(',')) > 1 and len(x.split('\n')[2].split(',')[1].strip().split(' ')) > 1) else None)

In [82]:
df['zipcode_in_greenburgh'] = df.zipcode.isin(['10502', '10522', '10530', '10583', '10595', '10606', '10706', '10503', '10523', '10533', '10591', '10603', '10607'])

In [86]:
df.zipcode_in_greenburgh.value_counts(normalize=True, dropna=False)

True     0.787997
False    0.212003
Name: zipcode_in_greenburgh, dtype: float64

In [88]:
df.zipcode.value_counts(dropna=False)
df.zipcode.shape

(1283,)

In [84]:
df[['zipcode_in_greenburgh', 'zipcode']].groupby(['zipcode_in_greenburgh']).zipcode.value_counts(normalize=True)

zipcode_in_greenburgh  zipcode   
False                  10601         0.068670
                       10605         0.060086
                       10536         0.055794
                       10577         0.051502
                       10025         0.038627
                       10709         0.038627
                       10023         0.030043
                       10506         0.030043
                       10956         0.030043
                       10504         0.025751
                       10520         0.025751
                       06776         0.021459
                       10001         0.021459
                       10514         0.017167
                       10549         0.017167
                       10570         0.017167
                       10960         0.017167
                       11374         0.017167
                       06903         0.012876
                       10019         0.012876
                       10036         0.012876


In [77]:
df.head()

Unnamed: 0,contributor,amount,contribution_date,report,schedule,year,schedule_description,company,relevant_links,comments,address,street_address,city,state,zipcode
0,"CANNON, DONALD W\n40 PINERIDGE ROAD\nWHITE PLA...",2000.0,2019-06-21,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,Another Nine LLC (CFO),https://www.mylife.com/donald-cannon/e98303177148,,"40 PINERIDGE ROAD WHITE PLAINS, NY 10603",40 PINERIDGE ROAD,WHITE PLAINS,NY,10603
1,"GUGGENHEIMER, PETER \n15 CHEDWORTH ROAD\nSCARS...",1000.0,2019-06-19,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,Guggenheimer Architects,https://www.mylife.com/peter-guggenheimer/e340...,,"15 CHEDWORTH ROAD SCARSDALE, NY 10583",15 CHEDWORTH ROAD,SCARSDALE,NY,10583
2,"GRALLA, YVETTE \n1017 SAW MILL RIVER ROAD\nARD...",1000.0,2019-06-17,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,,,"1017 SAW MILL RIVER ROAD ARDSLEY, NY 10502",1017 SAW MILL RIVER ROAD,ARDSLEY,NY,10502
3,", CONSTRUCTION INDUSTRY COUNCIL NYS PAC\n629 O...",300.0,2019-06-24,2019 10 Post Primary,C,2019,Monetary Contributions/All Other,,,,"629 OLD WHITE PLAINS ROAD TARRYTOWN, NY 10591",629 OLD WHITE PLAINS ROAD,TARRYTOWN,NY,10591
4,"HERMANN, ROBERT \n46 TOPLAND RD\nHARTSDALE, NY...",250.0,2019-06-13,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,https://www.mylife.com/robert-hermann/e3124970...,,"46 TOPLAND RD HARTSDALE, NY 10530",46 TOPLAND RD,HARTSDALE,NY,10530


In [36]:
reports = []
for index, row in df.iterrows():
    report = row.to_frame().transpose()
    result = geocode(row['address'])
    if result:
        lat, long = result
        report['latitude'] = lat
        report['longitude'] = long
        reports.append(report)
        final_output = pd.concat(reports)
        final_output.to_csv(output_filename_campaign_fin, index=False)

40 PINERIDGE ROAD WHITE PLAINS, NY 10603
15 CHEDWORTH ROAD SCARSDALE, NY 10583
1017 SAW MILL RIVER ROAD ARDSLEY, NY 10502
629 OLD WHITE PLAINS ROAD TARRYTOWN, NY 10591
46 TOPLAND RD HARTSDALE, NY 10530
80 HIGH POINT ROAD SCARSDALE, NY 10583
95 STONE AVE. WHITE PLAINS, NY 10603
268 SOUTH MAIN STREET NEW CITY, NY 10956
14 VERNE PLACE HARTSDALE, NY 10530
317 ARDSLEY ROAD SCARSDALE, NY 10583
353 MOUNTAIN ROAD IRVINGTON, NY 10533
18 FAIRVIEW ROAD SCARSDALE, NY 10583
120 CARTHAGE ROAD SCARSDALE, NY 10583
28 RIDGE ROAD ARDSLEY, NY 10602
4 UXBRIDGE ROAD SCARSDALE, NY 10583
10 WOODS END LANE HARTSDALE, NY 10530
17 BOULDER RIDGE ROAD SCARSDALE, NY 10583
34 CENTURY RIDGE ROAD PURCHASE, NY 10577
166 HUNTLEY DRIVE HARTSDALE, NY 10530
12 WOODBINE ROAD IRVINGTON, NY 10533
89 PARKVIEW ROAD ELMSFORD, NY 10523
1 ANNE'S WOOD LANE MOUNT KISCO, NY 10549
15 RIDGE ROAD HARTSDALE, NY 10530
22 BOULDER RIDGE ROAD SCARSDALE, NY 10583
55 GRASSLANDS ROAD - B121 VALHALLA, NY 10595
99 COURT STREET WHITE PLAINS, NY 1

169 STONEOA OAKS DR. HARTSDALE, NY 10530
25 WYLDWOOD DRIVE TARRYTOWN, NY 10591
14 BONNIE BRIAR RD WHITE PLAINS, NY 10607
53 STANLEY AVE. HASTINGS ON HUDSON, NY 10706
BOX 742 ARDSLEY, NY 10502
355 OLD TARRYTOWN ROAD WHITE PLAINS, NY 10603
414 BENEDICT AVE, APT 3B TARRYTOWN, NY 10591
72 WOODLANDS AVE. WHITE PLAINS, NY 10607
17 PHEASANT RUN SCARSDALE, NY 10583
39 VICTORIA ARDSLEY, NY 10502
18 WILSON HARTSDALE, NY 10530
35 CLAREWOOD DRIVE HASTINGS ON HUDSON, NY 10706
2 DELLWOOD LANE ARDSLEY, NY 10502
565 BROADWAY HASTINGS ON HUDSON, NY 10706
271 OLD ARMY ROAD SCARSDALE, NY 10583
83 COLUMBIA HARTSDALE, NY 10530
22 BALMORAL CRESC WHITE PLAINS, NY 10607
200 HIGH POINT DRIVE, APT. 215 HARTSDALE, NY 10530
47 BEECHWOOD ROAD HARTSDALE, NY 10530
48 STONEWALL CIRCLE WHITE PLAINS, NY 10607
37 VICTORIA ROAD ARDSLEY, NY 10502
88 SUMMIT HASTINGS ON HUDSON,, NY 10706
40 REVERE ARDSLEY, NY 10502
116 HEATH PLACE HASTINGS-ON-HUDSON, NY 10706
90 EDGEWOLD WHITE PLAINS, NY 10607
120 NORTH BROADWAY IRVINGTON, 

1 CHESTER TER HASTINGS, NY 10706
125 BELLAIR DOBBS FERRY, NY 10522
26 COTTONTAIL LANE IRVINGTON, NY 10533
13 VALLEYVIEW ROAD ELMSFORD, NY 10523
241 S BUCKHOUT IRVINGTON, NY 10533
PO BOX 742 ARDSLEY, NY 10502
38 BIRCHWOOD LANE HARTSDALE, NY 10530
16 GRANADA CRESC WHITE PLAINS, NY 10607
14 BONNIE BRIAR ROAD WHITE PLAINS,, NY 10607
55 GRASSLANDS ROAD VALHALLA, NY 10595
200 HIGH POINT DRIVE HARTSDALE, NY 10530
29 DUNHAM ROAD HARTSDALE, NY 10530
191 BROADWAY HASTINGS ON HUDSON, NY 10706
14 PHEASANT RUN SCARSDALE, NY 10583
48 STONEWALL CIRCLE WHITE PLAINS, NY 10607
116 HEATH PLACE HASTINGS ON HUDSON, NY 10706, NY 10706
144 SOUTHLAWN DOBBS FERRY,, NY 10522
2 CATSKILL PLACE ELMSFORD, NY 10523
29 RUMBROOK ROAD ELMSFORD, NY 10523
565 BROADWAY, APT. 4H HASTINGS-ON-HUDSON, NY 10706
39 VICTORIA ARDSLEY, NY 10502
112 LOCUST IRVINGTON, NY 10533
18 COUNTRY CLUB DR WHITE PLAINS, NY 10607
17 SOUTHWAY HARTSDALE, NY 10530
PO BOX 742 ARDSLEY, NY 10502
1 HILLSTOP LANE WHITE PLAINS, NY 10607
16 BIRCHWOOD HAR

5 MOHICAN LANE IRVINGTON, NY 10533
57 JUDSON AVENUE DOBBS FERRY, NY 10522
75 WEST HARTSDALE AVENUE HARTSDALE, NY 10530
9 DERHILL ROAD SCARSDALE, NY 10583
321 OLD CEDAR ROAD HARTSDALE, NY 10530
12 WESTWAY HARTSDALE, NY 10530
4 WHITEWOOD ROAD WHITE PLAINS, NY 10603
25 DUNHAM ROAD SCARSDALE, NY 10530
75 NORTH CENTRAL PARK AVENUE HARTSDALE, NY 10530
18 LINDEN PLACE VALHALLA, NY 10595
99 COURT STREET WHITE PLAINS, NY 10601
3 THOMAS STREET SCARSDALE, NY 10583
14 WESTWAY HARTSDALE, NY 10530
118 EDGARS LANE HASTINGS, NY 10706
98 WINDOM STREET WHITE PLAINS, NY 10607
31 FIELDSTONE DRIVE HARTSDALE, NY 10530
1 MIDWAY ROAD WHITE PLAINS, NY 10607
140 LINCOLN AVENUE PURCHASE, NY 10577
416 BENEDICT AVENUE TARRYTOWN, NY 10591
46 TOPLAND ROAD HARTSDALE, NY 10530
72 CLARENDON ROAD SCARSDALE, NY 10583
7 DOWS LANE IRVINGTON, NY 10533
330 WEST 28TH STREET NEW YORK, NY 10001
15 RIDGE ROAD HARTSDALE, NY 10530
258 EVANDALE ROAD SCARSDALE, NY 10583
3 HIGH POINT TERRACE SCARSDALE, NY 10583
5 GRACE TERRACE DOBBS 

75 NORTH CENTRAL PARK AVENUE HARTSDALE, NY 10530
84 PINEWOOD ROAD HARTSDALE, NY 10530
120 EAST HARTSDALE AVENUE HARTSDALE, NY 10530
31 FIELDSTONE DRIVE HARTSDALE, NY 10530
34 CENTURY RIDGE ROAD PURCHASE, NY 10577
9 ISLAND AVENUE MIAMI BEACH, FL 33139
31 HEMLOCK RIDGE ROAD NEW MILFORD, CT 06776
110 HOLMES AVENUE HARTSDALE, NY 10530
321 OLD CEDAR ROAD HARTSDALE, NY 10530
120 CARTHAGE ROAD SCARSDALE, NY 10583
15 FRANCINE COURT WHITE PLAINS, NY 10607
115 CENTRAL PARK WEST NEW YORK, NY 10023
18 LINDEN PLACE VALHALLA, NY 10593
10 RIDGE ROAD HARTSDALE, NY 10530
400 WEST END AVENUE NEW YORK, NY 10024
25 LAKEVIEW AVENUE HARTSDALE, NY 10530
169 WEST MT AIRY ROAD CROTON ON HUDSO, NY 10520
1 NORTH BROADWAY WHITE PLAINS, NY 10607
770 HARTSDALE ROAD WHITE PLAINS, NY 10607
171 SEARS AVENUE ELMSFORD, NY 10523
353 MOUNTAN ROAD IRVINGTON, NY 10533
14 VERNE PLACE HARTSDALE, NY 10530
60 SUTTON PLACE NEW YORK, NY 10022
46 TOPLAND ROAD HARTSDALE, NY 10530
31 HEMLOCK RIDGE ROAD NEW MILFORD, NY 06776
2 FIFTH 

33 CLARENDON ROAD SCARSDALE, NY 10583
45 MIDVALE ROAD HARTSDALE, NY 10530
298 EVANDALE ROAD SCARSDALE, NY 10583
70 OLD ARMY ROAD SCARSDALE, NY 10583
6 HASTINGS CLOSE HASTINGS, NY 10706
169 WEST MOUNT AIRY ROAD CROTON ON HUDSO, NY 10520
MULLIGAN LANE IRVINGTON, NY 10533
217 CHELSEA ROAD WHITE PLAINS, NY 10603
60 JUDSON AVENUE DOBBS FERRY, NY 10522
133 FIELDPOINT DRIVE IRVINGTON, NY 10533
23 CANTERBURY ROAD WHITE PLAINS, NY 10607
317 ARDSLEY ROAD SCARSDALE, NY 10583
3 HIGH POINT TERRACE SCARSDALE, NY 10583
29 OLD TARRYTOWN ROAD GREENBURGH, NY 10603
120 CARTHAGE ROAD SCARSDALE, NY 10583
126 WALLGROVE AVENUE DOBBS FERRY, NY 10522
15 CLARENDON PLACE SCARSDALE, NY 10583
4 WHITEWOOD ROAD WHITE PLAINS, NY 10603
100 EAST HARTSDALE AVENUE HARTSDALE, NY 10530
1070 DOBBS FERRY ROAD WHITE PLAINS, NY 10607
317 ARDSLEY ROAD SCARSDALE, NY 10583
35 SCENIC DRIVE HASTINGS, NY 10706
6 HASTINGS CLOSE SOUTH HASTINGS, NY 10706
770 HARTSDALE ROAD WHITE PLAINS, NY 10607
101 JOYCE ROAD HARTSDALE, NY 10530
95 ST

Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=3+LEISURE+FARM+ROAD+ARMONK%2C+NY+10504&format=jsonv2&addressdetails=1&limit=1


TypeError: 'NoneType' object is not subscriptable

In [37]:
final_output.head()

Unnamed: 0,contributor,amount,contribution_date,report,schedule,year,schedule_description,company,relevant_links,comments,address,latitude,longitude
2,"GRALLA, YVETTE \n1017 SAW MILL RIVER ROAD\nARD...",1000,2019-06-17,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,,,"1017 SAW MILL RIVER ROAD ARDSLEY, NY 10502",41.012746,-73.846732
3,", CONSTRUCTION INDUSTRY COUNCIL NYS PAC\n629 O...",300,2019-06-24,2019 10 Post Primary,C,2019,Monetary Contributions/All Other,,,,"629 OLD WHITE PLAINS ROAD TARRYTOWN, NY 10591",41.064303,-73.842956
6,"WASHINGTON, LEOLA \n95 STONE AVE.\nWHITE PLAIN...",250,2019-07-01,2019 10 Post Primary,A,2019,Monetary Contributions/Individual & Partnerships,,https://www.mylife.com/leola-washington/e47329...,,"95 STONE AVE. WHITE PLAINS, NY 10603",41.057311,-73.775875
7,"RAND, MARSHA \n268 SOUTH MAIN STREET\nNEW CITY...",250,2019-07-05,2019 July Periodic,A,2019,Monetary Contributions/Individual & Partnerships,,,,"268 SOUTH MAIN STREET NEW CITY, NY 10956",41.121452,-73.992796
9,"WOLTZ, MICHAEL \n317 ARDSLEY ROAD\nSCARSDALE, ...",250,2019-06-08,2019 11 Pre Primary,A,2019,Monetary Contributions/Individual & Partnerships,,https://www.mylife.com/michael-woltz/e53320812...,,"317 ARDSLEY ROAD SCARSDALE, NY 10583",40.988852,-73.809934
