Skip to content

Commit

Permalink
Merge pull request #15 from zluo16/scrape-dec-website
Browse files Browse the repository at this point in the history
parse static file from dec website
  • Loading branch information
moajzashahab committed Mar 11, 2018
2 parents b067183 + d30c95f commit 824bf52
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 0 deletions.
Binary file added data/dec_data.p
Binary file not shown.
1 change: 1 addition & 0 deletions data/dec_data.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/gmnysmeddropbox.csv

Large diffs are not rendered by default.

Binary file added data/gmnysmeddropbox.kmz
Binary file not shown.
Binary file added data/gmnysmeddropbox.xlsx
Binary file not shown.
60 changes: 60 additions & 0 deletions scripts/parse_dec_ksv_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import csv
import os.path
from bs4 import BeautifulSoup
import pickle
import json
from collections import OrderedDict


# Directory containing this script; the data files are resolved relative to
# it so the script can be run from any working directory.
my_path = os.path.abspath(os.path.dirname(__file__))
# Input: CSV export whose 'description' column holds the HTML to parse.
in_fname = os.path.join(my_path, '../data/gmnysmeddropbox.csv')
# Output: pickled list of parsed records.
p_fname = os.path.join(my_path, '../data/dec_data.p')
# Output: the same records as JSON. This must be its own assignment: a
# chained `j_fname = p_fname = ...` would redirect the pickle output to the
# JSON path as well.
j_fname = os.path.join(my_path, '../data/dec_data.txt')

# Ordered mapping from a tuple of keywords to the location type assigned when
# any of those keywords occurs in a location name. Order matters: entries are
# checked first to last and the first match wins.
# Single-keyword entries need the trailing comma; without it ('College') is
# just the string 'College', which would be iterated character by character.
location_to_location_type = OrderedDict([
    (('College',), 'School'),
    (('Hospital', 'VA'), 'Hospital'),
    (('Duane Reade', 'Walgreens', 'Pharmacy', 'CVS', 'Rx', 'Drug', 'Medicine Shoppe'), 'Pharmacy'),
    (('Police', 'Sheriff', 'Public Safety'), 'Police'),
    (('County', 'Town Hall', 'Municipal', 'Village'), 'Government'),
    (('Army', 'National Guard', 'Naval', 'Ft.'), 'Military'),
    (('Family Medicine',), "Doctor's Office"),
    (('ecopark',), 'Recycling Center'),
])

def determine_location_type(location, mapping=None):
    """Return the location type for the first keyword found in *location*.

    Args:
        location: Location name to classify (substring match, case-sensitive).
        mapping: Optional mapping of keyword tuples (or a single keyword
            string) to a location type. Defaults to the module-level
            ``location_to_location_type``.

    Returns:
        The type of the first entry with a matching keyword, or ``None`` when
        no keyword matches.
    """
    if mapping is None:
        mapping = location_to_location_type
    for keywords, location_type in mapping.items():
        # A bare string key is one keyword; iterating it directly would test
        # each character and match almost any location.
        if isinstance(keywords, str):
            keywords = (keywords,)
        if any(keyword in location for keyword in keywords):
            return location_type
    return None

def parse_description(html_desc):
    """Turn one HTML description into a dict of its 'Key: value' lines.

    The HTML is reduced to plain text, everything up to and including the
    first 'NYS Medication Drop Box' marker is discarded, and each remaining
    line containing ': ' is split at its first occurrence into key and value.
    Lines without that separator are ignored. A 'Type' entry derived from the
    'Location' value is added before returning.

    Raises:
        IndexError: if the marker text is absent from the description.
        KeyError: if no 'Location' line was found.
    """
    text = BeautifulSoup(html_desc, 'html.parser').get_text()
    body = text.split('NYS Medication Drop Box')[1]
    location_data = {}
    for line in body.split('\n'):
        # Split on the first ': ' only, so values may themselves contain ': '.
        key, separator, value = line.partition(': ')
        if separator:
            location_data[key] = value
    location_data['Type'] = determine_location_type(location_data['Location'])
    return location_data

# Parse the 'description' column of every CSV row into a dict.
# newline='' is what the csv module expects of its input file: it lets the
# reader itself handle newlines embedded inside quoted fields.
with open(in_fname, 'r', newline='') as f:
    record_out = []
    for record in csv.DictReader(f):
        description = parse_description(record['description'])
        print(description)
        record_out.append(description)

# Write the parsed records out twice: once pickled, once as JSON.
with open(p_fname, 'wb') as g:
    pickle.dump(record_out, g)
with open(j_fname, 'w') as h:
    json.dump(record_out, h)








0 comments on commit 824bf52

Please sign in to comment.