In [1]:
import pandas as pd
import numpy as np
import pickle
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## Create address column 

In [2]:
#load address data 
address_data = pd.read_csv('../data/sc122a.txt', delimiter = '\t', header = 0, encoding='latin-1')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
street = address_data['LSTREE']
city = address_data['LCITY']
zipcode = list(map(str, address_data['LZIP'])) #convert all int in LZIP to strings so they can be appended 
state = address_data['LSTATE']

In [4]:
def make_address(street, city, state, zipcode): 
    '''Returns a new list of complete addresses with combined street, city, state, and zipcode 
    '''
    addresses = [] #empty list to fill with full addresses
    for i in np.arange(len(street)): 
        full_address = street[i].lower() + ' ' + city[i].lower() + ' ' + state[i].lower() + ' ' + zipcode[i].lower() #for every street in list of streets, append city, state, and zip to create full address
        addresses.append(full_address) #append full address into empty list of addresses
    return addresses 

In [5]:
full_address = make_address(street, city, state, zipcode) #list of full addresses 

In [6]:
address_data['ADDRESS13'] = full_address #add column of full addresses

In [7]:
#select only NCESSCH and ADRESS13 columns because we're joining on NCESSCH and we only want to add the ADDRESS13 values to charters_2015.pkl
address_NCESSCH_data = address_data[['NCESSCH', 'ADDRESS13']] 

## Match formatting of addresses and names in CER dataframe to formatting of new address data 

In [8]:
CER = pd.read_csv('CER_2012-13.csv', encoding = 'latin-1') #open CER file 

In [9]:
#loop through every address, removing commas and periods, and converting all letters to lowercase
for i in np.arange(len(CER['CER_ADDRESS'])):
    CER.loc[i,'CER_ADDRESS'] = CER.loc[i,'CER_ADDRESS'].replace(',', '').replace('.','').lower() 


In [10]:
#uniform formatting of name by making all characters lowercase and removing all punctuation
CER['CER_NAME'] = [str(name).lower() for name in CER['CER_NAME']]

## Add address column to nonduplicate charter dataframe

In [11]:
#load nonduplicate charter data 
charters_nonduplicate = pd.read_csv('../../nowdata/parsing/overlaps_removed_df.csv', sep = '\t', header = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
#merge addresses to charters_nonduplicate based on NCESSCH 
charters_nonduplicate_with_address = pd.merge(charters_nonduplicate, address_NCESSCH_data, how = 'left', on = 'NCESSCH') 

In [13]:
#match formatting for name and address 
charters_nonduplicate_with_address['SCHNAM12'] = [str(name).lower() for name in charters_nonduplicate_with_address['SCHNAM12']]

charters_nonduplicate_with_address['ADDRESS13'] = [str(address) for address in charters_nonduplicate_with_address['ADDRESS13']]

## Create new column of combined street+address to use as comparison with fuzzy ratio

In [14]:
#combine name and address for charters 
chartername = charters_nonduplicate_with_address['SCHNAM12']
charteraddress = charters_nonduplicate_with_address['ADDRESS13']
charters_nonduplicate_with_address['address_name_combined'] = [name+' '+address for name,address in zip(chartername, charteraddress)]
#charters_nonduplicate_with_address
                                                               
#combine name and address for CER 
CERname = CER['CER_NAME']
CERaddress = CER['CER_ADDRESS']
CER['address_name_combined'] = [name+' '+address for name,address in zip(CERname, CERaddress)]

## Using fuzzy ratio 

1. List fuzzy ratio between name+address columns in both data frames 
2. Keep the highest fuzzy ratio 
3. If fuzzy ratio is over 70, change the column name to its match 

In [15]:

index = 0
for combined_charters in charters_nonduplicate_with_address['address_name_combined']:
    print(index)
    ratio = [fuzz.ratio(combined_charters, combinedCER) for combinedCER in CER['address_name_combined']]
    greatest_match_index = np.asarray(ratio).argmax()
    greatest_match_name = ratio[greatest_match_index]
    if greatest_match_name >= 70:
        print('name match found')
        charters_nonduplicate_with_address.loc[index, 'address_name_combined'] = CER.loc[greatest_match_index, 'address_name_combined']
    index = index + 1
        

0
1
name match found
2
name match found
3
name match found
4
name match found
5
name match found
6
7
8
name match found
9
name match found
10
name match found
11
name match found
12
13
name match found
14
15
name match found
16
name match found
17
name match found
18
name match found
19
name match found
20
name match found
21
name match found
22
23
name match found
24
25
name match found
26
name match found
27
28
name match found
29
name match found
30
31
name match found
32
name match found
33
name match found
34
35
36
name match found
37
name match found
38
name match found
39
name match found
40
name match found
41
name match found
42
name match found
43
44
45
46
name match found
47
48
name match found
49
50
51
52
name match found
53
name match found
54
name match found
55
name match found
56
57
name match found
58
59
name match found
60
61
62
63
64
65
name match found
66
name match found
67
name match found
68
69
70
71
72
73
74
75
76
77
name match found
78
name match found
79
80
81

name match found
646
name match found
647
name match found
648
name match found
649
name match found
650
651
652
name match found
653
name match found
654
name match found
655
name match found
656
657
658
659
660
name match found
661
name match found
662
name match found
663
664
665
name match found
666
667
name match found
668
name match found
669
name match found
670
name match found
671
name match found
672
673
674
675
name match found
676
677
678
679
680
name match found
681
name match found
682
name match found
683
name match found
684
name match found
685
name match found
686
name match found
687
688
689
name match found
690
name match found
691
692
693
name match found
694
695
name match found
696
name match found
697
name match found
698
699
name match found
700
701
name match found
702
703
704
705
706
name match found
707
name match found
708
name match found
709
name match found
710
name match found
711
name match found
712
name match found
713
name match found
714
715
name m

name match found
1282
name match found
1283
name match found
1284
name match found
1285
name match found
1286
name match found
1287
name match found
1288
name match found
1289
name match found
1290
name match found
1291
name match found
1292
name match found
1293
name match found
1294
name match found
1295
name match found
1296
name match found
1297
name match found
1298
name match found
1299
name match found
1300
1301
name match found
1302
name match found
1303
name match found
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
name match found
1326
1327
1328
1329
name match found
1330
1331
name match found
1332
name match found
1333
name match found
1334
name match found
1335
1336
1337
name match found
1338
1339
1340
1341
name match found
1342
name match found
1343
1344
1345
name match found
1346
1347
1348
1349
name match found
1350
1351
1352
name match found
1353
1354
name match found
1355
name match found
1356
name match fo

name match found
1813
name match found
1814
name match found
1815
name match found
1816
name match found
1817
1818
1819
name match found
1820
1821
1822
1823
name match found
1824
name match found
1825
name match found
1826
name match found
1827
name match found
1828
name match found
1829
name match found
1830
name match found
1831
name match found
1832
1833
name match found
1834
name match found
1835
1836
name match found
1837
name match found
1838
name match found
1839
name match found
1840
name match found
1841
name match found
1842
name match found
1843
name match found
1844
name match found
1845
1846
1847
name match found
1848
name match found
1849
name match found
1850
name match found
1851
name match found
1852
name match found
1853
name match found
1854
name match found
1855
1856
name match found
1857
name match found
1858
name match found
1859
name match found
1860
1861
name match found
1862
name match found
1863
name match found
1864
name match found
1865
name match found
1866

name match found
2345
2346
2347
2348
name match found
2349
name match found
2350
name match found
2351
2352
name match found
2353
2354
name match found
2355
name match found
2356
name match found
2357
2358
name match found
2359
name match found
2360
name match found
2361
2362
name match found
2363
name match found
2364
name match found
2365
2366
name match found
2367
name match found
2368
name match found
2369
name match found
2370
name match found
2371
2372
name match found
2373
name match found
2374
name match found
2375
name match found
2376
2377
name match found
2378
name match found
2379
name match found
2380
name match found
2381
2382
name match found
2383
name match found
2384
name match found
2385
name match found
2386
2387
2388
name match found
2389
name match found
2390
name match found
2391
name match found
2392
name match found
2393
name match found
2394
name match found
2395
name match found
2396
name match found
2397
name match found
2398
name match found
2399
2400
name m

name match found
2892
name match found
2893
name match found
2894
2895
name match found
2896
name match found
2897
2898
name match found
2899
name match found
2900
name match found
2901
2902
name match found
2903
2904
name match found
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
name match found
2920
2921
2922
name match found
2923
name match found
2924
2925
2926
2927
name match found
2928
name match found
2929
name match found
2930
2931
name match found
2932
name match found
2933
name match found
2934
2935
2936
name match found
2937
name match found
2938
name match found
2939
2940
name match found
2941
name match found
2942
name match found
2943
2944
name match found
2945
name match found
2946
name match found
2947
name match found
2948
name match found
2949
2950
2951
2952
2953
2954
name match found
2955
name match found
2956
2957
name match found
2958
2959
2960
2961
name match found
2962
2963
name match found
2964
name match found
2965
name match found
2

name match found
3445
3446
name match found
3447
name match found
3448
name match found
3449
name match found
3450
name match found
3451
name match found
3452
name match found
3453
name match found
3454
3455
3456
name match found
3457
3458
3459
3460
name match found
3461
name match found
3462
3463
name match found
3464
3465
name match found
3466
3467
name match found
3468
name match found
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
name match found
3482
name match found
3483
name match found
3484
name match found
3485
3486
3487
name match found
3488
name match found
3489
3490
name match found
3491
name match found
3492
name match found
3493
name match found
3494
3495
name match found
3496
name match found
3497
name match found
3498
name match found
3499
3500
name match found
3501
3502
3503
3504
name match found
3505
3506
name match found
3507
3508
name match found
3509
name match found
3510
name match found
3511
name match found
3512
3513
3514
name match found
3515

name match found
4013
name match found
4014
name match found
4015
name match found
4016
name match found
4017
name match found
4018
name match found
4019
name match found
4020
name match found
4021
name match found
4022
name match found
4023
name match found
4024
name match found
4025
name match found
4026
name match found
4027
name match found
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
name match found
4050
name match found
4051
4052
4053
name match found
4054
name match found
4055
4056
4057
4058
name match found
4059
name match found
4060
name match found
4061
name match found
4062
name match found
4063
4064
4065
4066
name match found
4067
name match found
4068
4069
4070
4071
name match found
4072
name match found
4073
name match found
4074
name match found
4075
name match found
4076
name match found
4077
name match found
4078
name match found
4079
name match found
4080
name match found
4081
name match found
4082
name

name match found
4605
name match found
4606
4607
4608
4609
name match found
4610
4611
name match found
4612
4613
name match found
4614
name match found
4615
name match found
4616
name match found
4617
4618
name match found
4619
name match found
4620
name match found
4621
name match found
4622
name match found
4623
name match found
4624
name match found
4625
name match found
4626
name match found
4627
4628
name match found
4629
name match found
4630
name match found
4631
name match found
4632
name match found
4633
name match found
4634
name match found
4635
name match found
4636
name match found
4637
name match found
4638
name match found
4639
name match found
4640
name match found
4641
name match found
4642
name match found
4643
name match found
4644
name match found
4645
name match found
4646
name match found
4647
name match found
4648
name match found
4649
name match found
4650
4651
name match found
4652
4653
4654
4655
name match found
4656
name match found
4657
name match found
4658

5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
name match found
5244
name match found
5245
name match found
5246
name match found
5247
name match found
5248
name match found
5249
name match found
5250
name match found
5251
5252
name match found
5253
5254
5255
name match found
5256
name match found
5257
name match found
5258
5259
5260
name match found
5261
name match found
5262
name match found
5263
5264
5265
name match found
5266
name match found
5267
name match found
5268
name match found
5269
name match found
5270
name match found
5271
name match found
5272
name match found
5273
5274
5275
name match found
5276
name match found
5277
name match found
5278
name match found
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
name match found
5289
name match found
5290
name match found
5291
5292
5293
5294
5295
5296
5297
5298
5299
name match found
5300
name match found
5301
name match found
5302
name match found
5303
name match found


5738
name match found
5739
name match found
5740
5741
name match found
5742
name match found
5743
5744
name match found
5745
name match found
5746
name match found
5747
name match found
5748
name match found
5749
name match found
5750
name match found
5751
5752
name match found
5753
name match found
5754
name match found
5755
name match found
5756
name match found
5757
name match found
5758
name match found
5759
name match found
5760
name match found
5761
name match found
5762
name match found
5763
5764
name match found
5765
name match found
5766
name match found
5767
name match found
5768
5769
5770
5771
name match found
5772
name match found
5773
name match found
5774
5775
5776
name match found
5777
5778
name match found
5779
name match found
5780
name match found
5781
name match found
5782
name match found
5783
name match found
5784
name match found
5785
name match found
5786
name match found
5787
name match found
5788
name match found
5789
name match found
5790
name match found
5791

6309
6310
name match found
6311
6312
6313
6314
6315
6316
name match found
6317
name match found
6318
name match found
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
name match found
6366
name match found
6367
name match found
6368
6369
6370
6371
6372
name match found
6373
6374
6375
6376
name match found
6377
name match found
6378
name match found
6379
name match found
6380
name match found
6381
name match found
6382
name match found
6383
name match found
6384
6385
name match found
6386
name match found
6387
6388
name match found
6389
6390
name match found
6391
name match found
6392
6393
name match found
6394
name match found
6395
name match found
6396
name match found
6397
name match found
6398
name match found
6399
name match found
6400
6401
name match found
6402
name match found
6403
6404
name match

name match found
6906
name match found
6907
6908
name match found
6909
name match found
6910
name match found
6911
name match found
6912
name match found
6913
name match found
6914
6915
6916
6917
name match found
6918
name match found
6919
name match found
6920
6921
name match found
6922
6923
name match found
6924
name match found
6925
name match found
6926
6927
name match found
6928
name match found
6929
6930
name match found
6931
name match found
6932
name match found
6933
6934
name match found
6935
name match found
6936
name match found
6937
name match found
6938
6939
name match found
6940
name match found
6941
name match found
6942
6943
name match found
6944
name match found
6945
name match found
6946
name match found
6947
name match found
6948
name match found
6949
name match found
6950
name match found
6951
name match found
6952
name match found
6953
name match found
6954
name match found
6955
name match found
6956
6957
name match found
6958
name match found
6959
name match found

7464
name match found
7465
name match found
7466
7467
7468
7469
7470
7471
name match found
7472
7473
7474
name match found
7475
7476
7477
name match found
7478
name match found
7479
name match found
7480
7481
name match found
7482
7483
name match found
7484
7485
name match found
7486
name match found
7487
name match found
7488
7489
name match found
7490
name match found
7491
7492
name match found
7493
7494
7495
name match found
7496
7497
name match found
7498
name match found
7499
name match found
7500
name match found
7501
name match found
7502
name match found
7503
name match found
7504
name match found
7505
name match found
7506
name match found
7507
name match found
7508
name match found
7509
7510
name match found
7511
7512
7513
7514
name match found
7515
7516
7517
7518
name match found
7519
name match found
7520
name match found
7521
name match found
7522
name match found
7523
name match found
7524
name match found
7525
7526
name match found
7527
name match found
7528
name match f

name match found
8127
name match found
8128
name match found
8129
name match found
8130
name match found
8131
name match found
8132
name match found
8133
8134
8135
name match found
8136
name match found
8137
name match found
8138
8139
name match found
8140
name match found
8141
8142
name match found
8143
name match found
8144
name match found
8145
name match found
8146
name match found
8147
name match found
8148
name match found
8149
name match found
8150
8151
8152
name match found
8153
name match found
8154
name match found
8155
name match found
8156
name match found
8157
name match found
8158
name match found
8159
name match found
8160
name match found
8161
name match found
8162
8163
name match found
8164
name match found
8165
name match found
8166
name match found
8167
name match found
8168
name match found
8169
name match found
8170
name match found
8171
name match found
8172
8173
name match found
8174
name match found
8175
name match found
8176
name match found
8177
8178
name matc

name match found
8684
name match found
8685
name match found
8686
name match found
8687
name match found
8688
name match found
8689
name match found
8690
name match found
8691
name match found
8692
name match found
8693
name match found
8694
8695
name match found
8696
name match found
8697
name match found
8698
name match found
8699
name match found
8700
name match found
8701
name match found
8702
name match found
8703
name match found
8704
name match found
8705
name match found
8706
name match found
8707
name match found
8708
name match found
8709
8710
name match found
8711
name match found
8712
name match found
8713
name match found
8714
name match found
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
name match found
8739
name match found
8740
8741
8742
name match found
8743
8744
8745
name match found
8746
8747
8748
8749
name match found
8750
name match found
8751
name match found
8752
8753
8754
8755
8756
8757
8

9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
name match found
9388
9389
9390
9391
9392
9393
name match found
9394
9395
9396
9397
name match found
9398
9399
name match found
9400
name match found
9401
name match found
9402
9403
name match found
9404
9405
name match found
9406
9407
name match found
9408
name match found
9409
name match found
9410
9411
name match found
9412
9413
name match found
9414
name match found
9415
9416
9417
name match found
9418
name match found
9419
name match found
9420
name match found
9421
name match found
9422
9423
name match found
9424
9425
name match found
9426
9427
9428
9429
name match found
9430
name match found
9431
9432
9433
name match found
9434
name match found
9435
9436
9437
name match found
9438
9439
9440
9441
9442
9443
name match found
9444
9445
9446
name match found
9447
9448
9449
name match found
9450
9451
name match found
9452
9453
9454
9455
9456
9457
9458
9459
9460
name match found
9461
9462
name match found
9463
name match found
9464
name

10042
name match found
10043
name match found
10044
name match found
10045
name match found
10046
name match found
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
name match found
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
name match found
10123
10124
name match found
10125
10126
name match found
10127
10128
10129
10130
10131
10132
name match found
10133
name match found
10134
name match found
10135
name match found
10136
name match found
10137
name match found
10138
name match found
10139
name match found
10140
10141
name match found
10142
name match found
10143
name match found
10144
name match found
10145
10146
10147
name ma

name match found
10658
name match found
10659
name match found
10660
name match found
10661
name match found
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
name match found
10673
10674
10675
10676
10677
name match found
10678
name match found
10679
name match found
10680
name match found
10681
10682
10683
10684
name match found
10685
name match found
10686
name match found
10687
name match found
10688
10689
10690
10691
name match found
10692
name match found
10693
name match found
10694
10695
10696
name match found
10697
name match found
10698
10699
name match found
10700
10701
name match found
10702
name match found
10703
10704
10705
name match found
10706
10707
name match found
10708
10709
name match found
10710
10711
10712
name match found
10713
10714
name match found
10715
10716
10717
name match found
10718
name match found
10719
10720
10721
name match found
10722
10723
10724
name match found
10725
10726
10727
name match found
10728
name match found
10729
name ma

## Merge URL with charter data 

In [16]:
#merge based on combined name and address

charters_merge_CER = pd.merge(charters_nonduplicate_with_address, CER, how = 'left', left_on = ['address_name_combined'], right_on = ['address_name_combined']) 



In [17]:
CER_columns = list(CER.columns) #list of all columns labels in CER
CER_columns.remove('CER_URL') #remove CER_URL from the list 

In [18]:
charters_merge_CER = charters_merge_CER.drop(labels = CER_columns, axis = 1) #drop all columns in CER except CER_URL 

## Check URL matches

In [20]:
len(charters_nonduplicate_with_address)

10965

In [21]:
len(charters_merge_CER['NCESSCH']) #resulting merge has 6 more rows than original charters_nonduplicate

10971

In [22]:
#number of non-duplicate, non-NAN URLs in CER
non_null_or_duplicate_CER_URL = set(CER['CER_URL'].dropna())
len(non_null_or_duplicate_CER_URL)

4810

In [23]:
#number of non-duplicate, non-NAN URL matches
non_null_or_duplicate_vals = set(charters_merge_CER['CER_URL'].dropna())
len(non_null_or_duplicate_vals)

4189

In [32]:
charters_merge_CER[['SCHNAM12', 'CER_URL']].dropna()

Unnamed: 0,SCHNAM12,CER_URL
1,ayaprun elitnaurvik,http://www.yupik.org/#
2,ketchikan charter school,http://kcs.kgbsd.org/#
3,tongass school of arts and sciences charter sc...,http://www.tongassschool.org/#
4,aquarian charter school,http://www.aquariancharterschool.com/#
5,family partnership charter school,http://www.fpcs.net/#
8,winterberry school,http://www.winterberryschool.org/#
9,eagle academy charter school,http://www.eagleacademycharterschool.com/#
10,frontier charter school,http://www.frontierk12.org/#
11,highland tech high charter school,http://www.highlandtech.org/#
13,alaska native cultural charter school,http://www.asdk12.org/schools/anccs/pages/#


In [None]:
#charters_merge_CER.to_csv('../../nowdata/backups/charters_full_250_CER_URLs.pkl')