# 시카고 샌드위치 맛집 분석

### 1. 시카고 샌드위치 맛집 사이트에 접근하기

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [2]:
# Assemble the address of the Chicago Magazine "Best Sandwiches" ranking page.
url_base = 'http://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

# Download and parse the page. The with-block closes the HTTP response as
# soon as BeautifulSoup has consumed it instead of leaving the socket open.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Each ranked sandwich sits in its own element carrying the "sammy" class;
# collect them all, then display the first one to inspect its structure.
rest_list = soup.select('.sammy')
rest_list[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br>
Old Oak Tap<br>
<em>Read more</em> </br></br></a></div>
</div>

In [4]:
# Count the matched listings (one per ranked sandwich).
len(rest_list)

50

### 2. 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [5]:
# Read the rank label of the first listing as plain text.
first_rank_tag = rest_list[0].select_one('.sammyRank')
rank = first_rank_tag.get_text()
rank

'1'

In [6]:
from urllib.parse import urljoin
# The listing's anchor holds a site-relative href; resolve it against the
# site root to get the absolute article URL.
first_href = rest_list[0].find('a')['href']
link = urljoin(url_base, first_href)
link

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [7]:
# Grab the visible text of the first listing's anchor: sandwich name,
# restaurant name and a trailing "Read more", separated by line breaks.
first_anchor = rest_list[0].find('a')
tmp = first_anchor.get_text()
tmp

'BLT\r\nOld Oak Tap\nRead more '

In [8]:
# Split the anchor text on newlines; the first piece still ends in '\r'.
tmp.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [9]:
# Split once and pick the pieces by position: sandwich name first (with the
# stray carriage return removed), restaurant name second.
text_lines = tmp.split('\n')
menu = text_lines[0].replace('\r', '')
cafe = text_lines[1]
menu, cafe

('BLT', 'Old Oak Tap')

In [10]:
# One output list per column of the summary table.
rank_list, link_list, menu_list, cafe_list = [], [], [], []

for rest in rest_list:
    # Parse every field first, then append, so the four lists always grow
    # together.
    rank = int(rest.select_one('.sammyRank').get_text())
    anchor = rest.find('a')
    link = urljoin(url_base, anchor['href'])
    tmp = anchor.get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    row = (rank, link, menu, cafe)
    for column, value in zip((rank_list, link_list, menu_list, cafe_list), row):
        column.append(value)

In [11]:
import pandas as pd
# Assemble the scraped columns into one table; the dict's insertion order
# (Rank, Cafe, Menu, Link) becomes the column order.
summary_columns = {
    'Rank': rank_list,
    'Cafe': cafe_list,
    'Menu': menu_list,
    'Link': link_list,
}
df = pd.DataFrame(summary_columns)
df.head()

Unnamed: 0,Rank,Cafe,Menu,Link
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [12]:
# Save the rank/cafe/menu/link table to Chicago.csv as comma-separated UTF-8.
df.to_csv('Chicago.csv',sep=',',encoding='utf8')

### 3. 다수의 웹페이지에 자동으로 접근해서 원하는 정보 가져오기

In [13]:
# Fetch and parse the article page of the first-ranked sandwich. The
# with-block closes the HTTP response once it has been parsed.
with urlopen(df['Link'][0]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [14]:
# Look for a homepage link inside the ".addy" element of the article page.
soup_tmp.select_one('.addy').find('a')

<a href="http://www.theoldoaktap.com/">theoldoaktap.com</a>

In [15]:
# Break the ".addy" line of the article page into comma-separated pieces.
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')
# Show all pieces, then the first, second-to-last and last one, and the count.
(tmp, tmp[0], tmp[-2].replace(',', ''), tmp[-1], len(tmp))

(['\n$10. 2109 W. Chicago Ave.', ' 773-772-0406', ' theoldoaktap.com'],
 '\n$10. 2109 W. Chicago Ave.',
 ' 773-772-0406',
 ' theoldoaktap.com',
 3)

In [16]:
# The second-to-last piece is the phone number preceded by one space;
# [1:] drops that leading character.
tmp[-2].replace(',','')[1:]

'773-772-0406'

In [17]:
# The first whitespace-separated token is the price followed by a period
# (e.g. "$10."); slice the period off.
price_token = tmp[0].split()[0]
price = price_token[:-1]
price

'$10'

In [18]:
# Everything after the price token, re-joined with single spaces, is the
# street address.
' '.join(tmp[0].split()[1:])

'2109 W. Chicago Ave.'

In [19]:
# Fetch and parse the article page of the sixth entry (index 5). The
# with-block closes the HTTP response once it has been parsed.
with urlopen(df['Link'][5]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [20]:
# Same homepage-link lookup as for the first entry; this cell displays
# nothing, i.e. the ".addy" element of this page contains no link.
soup_tmp.select_one('.addy').find('a')

In [21]:
# Split this page's ".addy" line on commas; with no homepage present the
# last piece is the phone number (leading space removed by [1:]).
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')
(tmp, tmp[-1][1:], len(tmp))

(['\n$7.25. 100 E. Walton St.', ' 312-649-6717'], '312-649-6717', 2)

In [22]:
# Price is the first token of the first piece, minus its trailing period.
leading_token = tmp[0].split()[0]
price = leading_token[:-1]
price

'$7.25'

In [23]:
# The remaining tokens of the first piece form the street address.
' '.join(tmp[0].split()[1:])

'100 E. Walton St.'

In [24]:
# Fetch and parse the article page of the twelfth entry (index 11). The
# with-block closes the HTTP response once it has been parsed.
with urlopen(df['Link'][11]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [25]:
# Split this page's ".addy" line on commas. Only two pieces come back here,
# so tmp[-2] and tmp[0] are the same element.
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')
(tmp, tmp[0], tmp[-2].replace(',', ''), tmp[-1], len(tmp))

(['\n$5.49. Multiple locations', ' ricobenespizza.com'],
 '\n$5.49. Multiple locations',
 '\n$5.49. Multiple locations',
 ' ricobenespizza.com',
 2)

In [26]:
# Same price extraction as before: first token, trailing period removed.
first_token = tmp[0].split()[0]
price = first_token[:-1]
price

'$5.49'

In [27]:
# For this entry the "address" part is only the phrase "Multiple locations",
# not a street address.
' '.join(tmp[0].split()[1:])

'Multiple locations'

In [28]:
# Experiment: slicing the whole first piece (without splitting it into
# tokens) only removes its final character, so the result still contains
# the newline, the price and the location text, not the price alone.
price = tmp[0][:-1]
price

'\n$5.49. Multiple location'

In [29]:
from tqdm import tqdm_notebook

In [33]:
# Result columns: the first four come from the ranking page, the last four
# from each sandwich's own article page.
rank_list, link_list, menu_list, cafe_list = [], [], [], []
price_list, addr_list, tel_list, hp_list = [], [], [], []

for rest in tqdm_notebook(rest_list):
    # --- Ranking page: rank, article link, sandwich name, restaurant name ---
    rank = int(rest.select_one('.sammyRank').get_text())
    anchor = rest.find('a')
    link = urljoin(url_base, anchor['href'])
    # Anchor text is "<menu>\r\n<cafe>\nRead more ".
    tmp = anchor.get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]
    rank_list.append(rank)
    link_list.append(link)
    menu_list.append(menu)
    cafe_list.append(cafe)

    # --- Article page: price, address, telephone, homepage ---
    # One request per restaurant; the with-block closes each response so
    # connections are not left open for the rest of the session.
    with urlopen(link) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')
    addy = soup_tmp.select_one('.addy')  # looked up once, reused below
    tmp = addy.get_text().split(',')

    # tmp[0] looks like "\n$10. 2109 W. Chicago Ave.": the price token
    # (with a trailing period) followed by the street address.
    head = tmp[0].split()
    price_list.append(head[0][:-1])
    addr = ' '.join(head[1:])

    if len(tmp) in (3, 4):
        # Layout: street[, suburb], telephone, homepage.
        if len(tmp) == 4:
            # strip() avoids a doubled space: the piece itself starts with one.
            addr += ', ' + tmp[1].strip()
        addr_list.append(addr)
        tel_list.append(tmp[-2].strip())
        hp_list.append(tmp[-1].strip())
    elif addy.find('a') is not None:
        # Short line that ends in a homepage link (e.g. "Multiple locations,
        # site"): there is no single street address or phone to record.
        addr_list.append('')
        tel_list.append('')
        hp_list.append(tmp[-1].strip())
    else:
        # Short line without a link: street address followed by telephone.
        addr_list.append(addr)
        tel_list.append(tmp[-1].strip())
        hp_list.append('')


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [39]:
# Combine the ranking-page and article-page columns into the detailed table;
# the dict's insertion order fixes the column order.
detail_columns = {
    'Rank': rank_list,
    'Cafe': cafe_list,
    'Menu': menu_list,
    'Price': price_list,
    'Address': addr_list,
    'Telephone': tel_list,
    'Homepage': hp_list,
}
df2 = pd.DataFrame(detail_columns)
df2.head()

Unnamed: 0,Rank,Cafe,Menu,Price,Address,Telephone,Homepage
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,773-772-0406,theoldoaktap.com
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,312-929-4580,aucheval.tumblr.com
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,312-334-3688,rickbayless.com
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",847-475-9400,alsdeli.net
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,312-445-8977,publicanqualitymeats.com


In [53]:
# Save the detailed table (price, address, telephone, homepage) to
# Chicago2.csv as comma-separated UTF-8.
df2.to_csv('Chicago2.csv', sep=',',encoding='utf8')

### 4. 맛집 위치를 지도 위에 표시하기

In [40]:
import numpy as np
import folium
import googlemaps

In [41]:
# Load the Google Maps API key from a local text file so the key itself does
# not appear in the notebook. The with-block closes the file even if the
# read fails.
with open('googleMapsKey.txt', mode='r') as key_fd:
    # Read at most 100 characters and trim surrounding whitespace; a trailing
    # newline in the file would otherwise become part of the key.
    gmaps_key = key_fd.read(100).strip()

In [42]:
# Create the Google Maps client used for geocoding, authenticated with
# gmaps_key.
gmaps = googlemaps.Client(key=gmaps_key)

In [43]:
# Latitude/longitude for every row of df2, in row order; NaN marks rows for
# which no coordinates are available.
lat = []
lng = []

for n in tqdm_notebook(df2.index):
    address = df2['Address'][n]
    # NOTE(review): this test only diverts addresses that begin with a space.
    # An empty address still passes (''.find(' ') is -1) and is geocoded as
    # ', Chicago'. Turning those rows into NaN would also require the
    # marker-drawing code to tolerate NaN coordinates.
    if address.find(' ') != 0:
        target_name = address + ', ' + 'Chicago'
        gmaps_output = gmaps.geocode(target_name)
    else:
        gmaps_output = []

    if gmaps_output:
        # Use the first (best) match returned by the geocoder.
        location_output = gmaps_output[0].get('geometry')
        lat.append(location_output['location']['lat'])
        lng.append(location_output['location']['lng'])
    else:
        # Skipped row, or the geocoder found nothing: record NaN instead of
        # failing with IndexError on an empty result list.
        lat.append(np.nan)
        lng.append(np.nan)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [44]:
# Attach the geocoded coordinates as two new columns and preview the result.
# NOTE(review): 'Lattitude' is misspelled, but the map cells read the column
# under this exact name, so it must be renamed everywhere at once or not at all.
df2['Lattitude'] = lat
df2['Longitude'] = lng
df2.head()

Unnamed: 0,Rank,Cafe,Menu,Price,Address,Telephone,Homepage,Lattitude,Longitude
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,773-772-0406,theoldoaktap.com,41.895605,-87.679961
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,312-929-4580,aucheval.tumblr.com,41.884658,-87.647667
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,312-334-3688,rickbayless.com,41.890523,-87.630783
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",847-475-9400,alsdeli.net,42.058322,-87.683748
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,312-445-8977,publicanqualitymeats.com,41.886604,-87.648536


In [45]:
# Centre the map on the mean coordinate of all restaurants and drop a single
# marker there to check the view.
center = [df2['Lattitude'].mean(), df2['Longitude'].mean()]
mapping = folium.Map(location=center, zoom_start=11)
center_marker = folium.Marker(center, popup='center')
center_marker.add_to(mapping)
mapping

In [46]:
# Draw one marker per restaurant on a map centred on the mean coordinate.
mapping = folium.Map(location=[df2['Lattitude'].mean(),
                        df2['Longitude'].mean()], zoom_start=11)

for n in df2.index:
    address = df2['Address'][n]
    latitude = df2['Lattitude'][n]
    longitude = df2['Longitude'][n]
    # Skip rows that cannot be placed: a blank address means there is no
    # single location (its coordinates, if any, are just the city itself),
    # and NaN coordinates cannot be drawn as a marker.
    if not address.strip() or pd.isna(latitude) or pd.isna(longitude):
        continue
    folium.Marker([latitude, longitude],
                  popup=df2.Cafe[n]).add_to(mapping)
mapping

In [74]:
# Save the table, now including the coordinate columns, to Chicago3.csv as
# comma-separated UTF-8.
df2.to_csv('Chicago3.csv', sep=',',encoding='utf8')