# 시카고 샌드위치 맛집 분석

### 시카고 샌드위치 맛집 사이트에 접근하기

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import pandas as pd

In [2]:
# Build the URL of the "Best Sandwiches in Chicago" ranking page and parse it.
url_base = 'http://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

# The context manager closes the HTTP response as soon as the markup has been
# read, instead of leaving the connection open until garbage collection.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Every ranked sandwich sits in a <div class="sammy">; look at the first one
# to learn how rank, link and the "menu / cafe" text are laid out.
rest_list = soup.find_all('div', 'sammy')
first_entry = rest_list[0]

rank = first_entry.find('div', 'sammyRank').string
sammy_list = first_entry.select_one('.sammyListing')
link = sammy_list.a['href']
tmp = first_entry.find('a').get_text()
tmp.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [4]:
# The anchor text is "<menu>\r\n<cafe>\nRead more": split once and pick parts.
lines = tmp.split('\n')
menu = lines[0].replace('\r', '')
cafe = lines[1]
(menu, cafe)

('BLT', 'Old Oak Tap')

In [5]:
# Walk over all ranking entries and collect rank, detail-page link,
# sandwich name and cafe name into parallel lists.
rank_list, link_list, menu_list, cafe_list = [], [], [], []

for rest in rest_list:
    rank = int(rest.find('div', 'sammyRank').string)

    # urljoin yields a full URL even when the href already contains url_base.
    href = rest.select_one('.sammyListing').a['href']
    link = urljoin(url_base, href)

    tmp = rest.find('a').get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    rank_list.append(rank)
    link_list.append(link)
    menu_list.append(menu)
    cafe_list.append(cafe)

In [6]:
# Assemble the scraped columns into a table indexed by rank.
columns = {
    'Rank': rank_list,
    'Cafe': cafe_list,
    'Menu': menu_list,
    'Link': link_list,
}
df = pd.DataFrame(columns).set_index('Rank')
df

Unnamed: 0_level_0,Cafe,Menu,Link
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
7,Acadia,Lobster Roll,http://www.chicagomag.com/Chicago-Magazine/Nov...
8,Birchwood Kitchen,Smoked Salmon Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
9,Cemitas Puebla,Atomica Cemitas,http://www.chicagomag.com/Chicago-Magazine/Nov...
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [7]:
# Persist the ranking table, then show the link stored under index label 49
# (the index is 'Rank', so this is a label lookup, not a position).
df.to_csv(
    'Chicago Sandwich.csv',
    sep=',',
    encoding='utf8',
)
df.loc[49, 'Link']

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Toni-Patisserie-Le-Vegetarien/'

### 다수의 웹페이지에 자동으로 접근해서 원하는 정보 가져오기

In [8]:
# Fetch and parse the detail page stored under index label 1 of df.
# The context manager closes the HTTP response once parsing has consumed it.
with urlopen(df['Link'][1]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [9]:
# Raw text of the element with class "addy" on the detail page.
addy_tag = soup_tmp.select_one('.addy')
addy_tag.get_text()

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [10]:
# Break the "addy" text into its comma-separated pieces.
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')

In [11]:
# Pieces between the first one and the last two, glued with spaces.
middle_pieces = tmp[1:-2]
' '.join(middle_pieces).replace(',', '')

''

In [12]:
# The first whitespace-separated token of the first piece is the price
# followed by one trailing character, which is dropped.
price_token = tmp[0].split()[0]
price = price_token[:-1]
price

'$10'

In [13]:
# Everything after the price token in the first piece is the street address.
first_piece_tokens = tmp[0].split()
addr = ' '.join(first_piece_tokens[1:])
addr

'2109 W. Chicago Ave.'

In [14]:
# Second comma-separated piece with surrounding whitespace removed.
second_piece = tmp[1]
second_piece.strip()

'773-772-0406'

In [15]:
from tqdm import tqdm_notebook

In [16]:
# Scrape every ranked entry: rank / link / menu / cafe come from the ranking
# page, price / address / phone / homepage from each entry's detail page.
# All eight lists receive exactly one value per entry so they stay aligned.
rank_list = []
link_list = []
menu_list = []
cafe_list = []
price_list = []
addr_list = []
tel_list = []
hp_list = []

for rest in tqdm_notebook(rest_list):
    # --- fields present on the ranking page ---
    rank = int(rest.find('div', 'sammyRank').string)
    link = urljoin(url_base, rest.select_one('.sammyListing').a['href'])
    tmp = rest.find('a').get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    # --- fields from the detail page ---
    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(link) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')
    tmp = soup_tmp.select_one('.addy').get_text().split(',')

    # First piece looks like "$10. 2109 W. Chicago Ave.": price token (minus
    # its last character) followed by the street address.
    price = tmp[0].split()[0][:-1]
    addr = ' '.join(tmp[0].split()[1:])
    tel = ''
    hp = ''

    if len(tmp) == 2:
        if addr.find('Multiple') >= 0:
            # No single address: store the ' ' placeholder that the later
            # geocoding / mapping cells test for, and keep the homepage.
            addr = ' '
            hp = tmp[1]
        else:
            tel = tmp[1].strip()
    elif len(tmp) == 3:
        tel = tmp[1].strip()
        hp = tmp[2]
    elif len(tmp) >= 4:
        # The last two pieces are phone and homepage; everything in between
        # belongs to the address. For exactly four pieces this equals
        # addr + ', ' + tmp[1]; longer addresses no longer skip the appends.
        addr = ', '.join([addr] + tmp[1:-2])
        tel = tmp[-2].strip()
        hp = tmp[-1]
    # len(tmp) == 1: only price and address are available.

    rank_list.append(rank)
    link_list.append(link)
    menu_list.append(menu)
    cafe_list.append(cafe)
    price_list.append(price)
    addr_list.append(addr)
    tel_list.append(tel)
    hp_list.append(hp)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [17]:
# Sanity check: all scraped columns should have the same number of entries.
scraped_columns = (
    rank_list, cafe_list, menu_list, price_list,
    addr_list, tel_list, hp_list,
)
tuple(len(column) for column in scraped_columns)

(50, 50, 50, 50, 50, 50, 50)

In [18]:
# Combine ranking-page and detail-page fields into one table.
detail_columns = {
    'Rank': rank_list,
    'Cafe': cafe_list,
    'Menu': menu_list,
    'Price': price_list,
    'Address': addr_list,
    'Telephone': tel_list,
    'Home Page': hp_list,
}
df2 = pd.DataFrame(detail_columns)
df2

Unnamed: 0,Rank,Cafe,Menu,Price,Address,Telephone,Home Page
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,773-772-0406,theoldoaktap.com
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,312-929-4580,aucheval.tumblr.com
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,312-334-3688,rickbayless.com
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",847-475-9400,alsdeli.net
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,312-445-8977,publicanqualitymeats.com
5,6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walton St.,312-649-6717,
6,7,Acadia,Lobster Roll,$16,1639 S. Wabash Ave.,312-360-9500,acadiachicago.com
7,8,Birchwood Kitchen,Smoked Salmon Salad,$10,2211 W. North Ave.,773-276-2100,birchwoodkitchen.com
8,9,Cemitas Puebla,Atomica Cemitas,$9,3619 W. North Ave.,773-772-8435,cemitaspuebla.com
9,10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17,3267 S. Halsted St.,312-929-2486,nanaorganic.com


In [19]:
# Save the detailed table (including its index) to df2.csv.
df2.to_csv('df2.csv')

### 맛집 위치를 지도에 표기하기

In [20]:
import numpy as np
import folium
import googlemaps

In [21]:
# Read the Google Maps API key from a local file.
# `with` guarantees the file is closed even if reading fails, and strip()
# removes the trailing newline most editors leave at the end of the file.
with open('googlemapskey.txt', mode='r') as key_fd:
    gmaps_key = key_fd.read(100).strip()

In [22]:
# Create the Google Maps client used for geocoding below.
gmaps = googlemaps.Client(key=gmaps_key)

In [23]:
# Geocode every address; rows that cannot be geocoded get NaN coordinates so
# that lat / lng always end up with one value per row of df2.
lat = []
lng = []

for n in tqdm_notebook(df2.index):
    address = df2.loc[n, 'Address']

    # A blank address (the ' ' placeholder or an empty string) is not
    # geocodable, so skip the API call for it.
    if address.strip():
        target_name = address + ', ' + 'Chicago'
        gmaps_output = gmaps.geocode(target_name)
    else:
        gmaps_output = []

    if gmaps_output:
        location = gmaps_output[0]['geometry']['location']
        lat.append(location['lat'])
        lng.append(location['lng'])
    else:
        # No address, or the geocoder returned no match.
        lat.append(np.nan)
        lng.append(np.nan)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [24]:
# Show the collected latitudes (NaN marks rows that were not geocoded).
print(lat)

[41.8956049, 41.8846582, 41.8905226, 42.0583217, 41.8866036, 41.9002501, 41.8590541, 41.9102031, 41.9097558, 41.8345302, 41.9276207, nan, 41.9384419, 41.9451044, 41.930109, 41.89129000000001, 41.8678529, 41.8852691, 41.9080539, 41.91369539999999, 41.9537106, nan, 41.9794496, 41.9541563, nan, 42.156691, nan, 41.9652987, 41.90272179999999, 41.8893683, nan, 41.9105258, 41.8896188, 41.91504990000001, 41.9218521, 41.9797099, 41.9617122, 41.89296119999999, nan, 41.9047551, 41.7913185, 42.2518352, 41.9152875, 41.8863622, 41.8758102, 41.8960738, 41.89897850000001, 41.9105832, 41.8831061, 41.9431632]


In [25]:
# Attach the geocoded coordinates to the table as two new columns.
df2['lat'], df2['lng'] = lat, lng

In [26]:
# Draw a map centred on the mean coordinate and mark that centre point.
center_lat = df2['lat'].mean()
center_lng = df2['lng'].mean()

mapping = folium.Map(location=[center_lat, center_lng], zoom_start=11)
center_marker = folium.Marker([center_lat, center_lng], popup='center')
center_marker.add_to(mapping)
mapping

In [27]:
# Draw a map centred on the mean coordinate and add one marker per cafe.
mapping = folium.Map(location=[df2['lat'].mean(), df2['lng'].mean()], zoom_start=11)

# Decide by the coordinates themselves rather than by the address text:
# a marker needs a non-NaN lat/lng pair, so rows without one are skipped.
for row in df2.dropna(subset=['lat', 'lng']).itertuples():
    folium.Marker([row.lat, row.lng], popup=row.Cafe).add_to(mapping)
mapping