crawler.py
"""
crawler code
"""
import csv
import json
import re

import bs4
import requests

START_URL_FP = "https://www.foodpantries.org/ci/il-chicago"
START_URL_S = ("https://www.homelessshelterdirectory.org/cgi-bin"
               "/id/city.cgi?city=Chicago&state=IL")

STR_TO_REPLACE = ['\r', '\n', '\t', '<b>', '<br>', '</b>', '@', '\\',
                  'â', '\x80', '\x99s', '\xa0', '[read more]']


def generate_soup(url):
    '''
    Fetch the page at the given url and return the corresponding
    BeautifulSoup object.

    Input:
        url: (str) url of a page
    Return:
        soup object
    '''
    response = requests.get(url)
    html = response.text
    return bs4.BeautifulSoup(html, "html.parser")
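
# Note: a more defensive fetch (not part of the original script) would pass
# a timeout and fail fast on HTTP errors before parsing, e.g.:
#     response = requests.get(url, timeout=30)
#     response.raise_for_status()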


def crawl_food_pantry(output):
    '''
    Crawl the websites of food pantries and extract relevant information.

    Input:
        output: (str) filename of the file storing data extracted
            from websites
    '''
    start_soup = generate_soup(START_URL_FP)
    pantries = generate_dict_per_pantry(start_soup)
    with open(output, 'w', newline='') as file:
        colnames = ['facility_name', 'address', 'phone_number',
                    'zipcode', 'service_type', 'notes']
        writer = csv.writer(file)
        writer.writerow(colnames)
        for p in pantries:
            p_address = p['address']
            row = [p['name'], p_address['streetAddress'], p['telephone'],
                   p_address['postalCode'], 'food pantry', p['description']]
            writer.writerow(row)


def generate_dict_per_pantry(soup):
    '''
    Given the soup object of the main page, generate a list of
    dictionaries, one per food pantry.

    Input:
        soup: (object) soup of main page
    Return:
        list of dictionaries of food pantries
    '''
    pantries = []
    pantries_html = soup.find_all('script', type='application/ld+json')
    for ph in pantries_html:
        pantry_text = ph.text
        for s in STR_TO_REPLACE:
            pantry_text = pantry_text.replace(s, '')
        pantry = json.loads(pantry_text)
        if pantry.get('type') == 'LocalBusiness':
            pantries.append(pantry)
    return pantries
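
# For reference, a pantry record parsed above is assumed to look roughly
# like this (keys inferred from how crawl_food_pantry indexes it):
#     {'type': 'LocalBusiness', 'name': ..., 'telephone': ...,
#      'description': ...,
#      'address': {'streetAddress': ..., 'postalCode': ...}}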


def crawl_shelter(output):
    '''
    Crawl the websites of shelters and extract relevant information.

    Input:
        output: (str) filename of the file storing data extracted
            from websites
    '''
    start_soup = generate_soup(START_URL_S)
    shelters = generate_dict_per_shelter(start_soup)
    with open(output, 'w', newline='') as file:
        colnames = ['facility_name', 'address', 'phone_number',
                    'zipcode', 'service_type', 'notes']
        writer = csv.writer(file)
        writer.writerow(colnames)
        for s in shelters:
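            # Note: the last column is labelled 'notes' but is populated
            # with the shelter's website URL.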
            row = [s['name'], s['address'], s['phone_number'],
                   s['zipcode'], 'shelter', s['website']]
            writer.writerow(row)


def generate_dict_per_shelter(soup):
    '''
    Given the soup object of the main page, generate a list of
    dictionaries, one per shelter.

    Input:
        soup: (object) soup of main page
    Return:
        list of dictionaries of shelters
    '''
    shelters = []
    shelter_html = soup.find_all('div', class_='item_content')
    for sh in shelter_html:
        shelter = {'website': ''}
        if sh.find('a', href=True):
            url = sh.find('a', href=True)['href']
            if 'shelter=' in url:
                url_soup = generate_soup(url)
                shelter['name'] = re.findall(
                    r'.*(?=\s-)',
                    url_soup.find('h3', class_='entry_title').text)[0]
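                # The text after the first <h4> is assumed to flatten to
                # "<street address> <zipcode>: <phone>: ..." once markup and
                # whitespace escapes are stripped, so the fields below are
                # recovered by splitting on ':' and regex-matching chunks.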
                contact = url_soup.find('h4').next_sibling.next_sibling.text
                for s in STR_TO_REPLACE:
                    contact = contact.replace(s, '')
                contact = contact.strip().split(":")
                shelter['address'] = re.findall(r'.*(?=\s{5})',
                                                contact[0])[0].strip()
                shelter['zipcode'] = re.findall(r'\d{5}',
                                                contact[0])[0].strip()
                shelter['phone_number'] = contact[1].strip()
                for i in contact:
                    if "www" in i or ".com" in i or "org" in i:
                        shelter['website'] = i.strip('/')
                shelters.append(shelter)
    return shelters


if __name__ == '__main__':
    crawl_food_pantry('food_pantry.csv')
    crawl_shelter('shelter.csv')
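
# Running the module directly writes food_pantry.csv and shelter.csv to the
# current working directory.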