# 웹 크롤링 실습 2/2

#### 과제2 : 서브페이지 정보 크롤링 - 메뉴가격, 카페주소, 전화번호, 홈페이지

In [1]:
import pandas as pd
import time
import re

from bs4 import BeautifulSoup 
from urllib.request import urlopen

#### STEP5 : 메인페이지 크롤링한 데이터 파일 읽어오기
> * 불필요한 칼럼 삭제
* 랭킹정보 인덱스 설정

In [2]:
df = pd.read_csv('data/chicagomag_info.csv', encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,랭킹,메뉴,카페,링크
0,0,1,BLT,Old Oak Tap,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,1,2,Fried Bologna,Au Cheval,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,2,3,Woodland Mushroom,Xoco,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,3,4,Roast Beef,Al’s Deli,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,4,5,PB&L,Publican Quality Meats,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [3]:
# index_col='Unnamed: 0'') 삭제
del df['Unnamed: 0']
df.head()

Unnamed: 0,랭킹,메뉴,카페,링크
0,1,BLT,Old Oak Tap,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Fried Bologna,Au Cheval,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Woodland Mushroom,Xoco,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Roast Beef,Al’s Deli,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,PB&L,Publican Quality Meats,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [4]:
df.set_index('랭킹', inplace=True)
df.head()

Unnamed: 0_level_0,메뉴,카페,링크
랭킹,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,BLT,Old Oak Tap,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,Fried Bologna,Au Cheval,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,Woodland Mushroom,Xoco,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,Roast Beef,Al’s Deli,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,PB&L,Publican Quality Meats,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 50
Data columns (total 3 columns):
메뉴    50 non-null object
카페    50 non-null object
링크    50 non-null object
dtypes: object(3)
memory usage: 1.6+ KB


In [7]:
df['메뉴'][1]

'BLT'

#### STEP6 : 서브페이지의 수집할 데이터 Tag & Class 확인
> 서브페이지 크롤링대상 타겟데이터
* 메뉴가격
* 카페주소 
* 전화번호 
* 홈페이지

In [9]:
site = df['링크'][1]
html = urlopen(site)
soup = BeautifulSoup(html, 'lxml')

In [10]:
# 서브페이지 로딩 확인
soup.title

<title>
  1. Old Oak Tap BLT |
  Chicago magazine
      |  November 2012
    </title>

In [11]:
p_tag = soup.find('p', 'addy')
p_tag

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [12]:
print(p_tag.prettify())

<p class="addy">
 <em>
  $10. 2109 W. Chicago Ave., 773-772-0406,
  <a href="http://www.theoldoaktap.com/">
   theoldoaktap.com
  </a>
 </em>
</p>



In [13]:
taginfo = p_tag.get_text()
taginfo

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [15]:
# taginfo = taginfo[taginfo.find('$'):]
taginfo = taginfo.strip()
taginfo

'$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

#### STEP7 : RegEx를 활용하여 분류
> 태그정보로는 분류 안됨
* 메뉴가격 : $로 시작
* 카페주소 : 패턴 안보이므로 일단보류. 가격과 전화번호 사이
* 전화번호 : 이거슨 RegEx, 숫자3-숫자3-숫자4
* 홈페이지 : 요것도 RegEx, .com 혹은 .net 으로 끝남

#### 참조. Regular Expression 관련 사이트
> * http://regexr.com/ : text 정보를 re로 테스트
* https://regexper.com/ : 작성된 re를 다이어그램으로 표현

In [20]:
# 메뉴 가격
menu_price = re.search('\$\d+\.(\d+)?', taginfo).group()
menu_price

'$10.'

In [21]:
# 카페주소
cafe_addr = '일단보류' 

In [22]:
# 전화번호
cafe_tel = re.search('\d{3}[-]\d{3}[-]\d{4}', taginfo)
if cafe_tel is not None:
    cafe_tel = cafe_tel.group()
else:
    cafe_tel = 'No contact'
    
cafe_tel

'773-772-0406'

In [23]:
taginfo

'$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [24]:
info_list = taginfo.split(',')
info_list

['$10. 2109 W. Chicago Ave.', ' 773-772-0406', ' theoldoaktap.com']

# !!

In [25]:
# 홈페이지
if info_list[-1].endswith('.com') or info_list[-1].endswith('.net'):
    home_page = info_list[-1]
else:
    home_page = 'No Homepage'

home_page

' theoldoaktap.com'

In [26]:
info_list[0]

'$10. 2109 W. Chicago Ave.'

In [27]:
menu_price

'$10.'

In [28]:
# 카페주소
idx = len(menu_price)
cafe_addr = info_list[0][idx:].strip()
cafe_addr

'2109 W. Chicago Ave.'

#### STEP8 : 서브페이지 정보 크롤링 모듈
> * 파일의 링크정보에 순차적으로 접속하여 데이터 크롤링
* 메뉴가격, 카페주소, 전화번호, 홈페이지
* 우선 TOP5 페이지정도만 수집후 데이터 검토

In [34]:
for ranking in df.index[:5]:
    sub_link = df['링크'][ranking]
    print('Top{} : {}'.format(ranking, sub_link))

Top1 : http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/
Top2 : http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/
Top3 : http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/
Top4 : http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Als-Deli-Roast-Beef/
Top5 : http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Publican-Quality-Meats-PB-L/


In [39]:
price_list = list()
cfadd_list = list()
cftel_list = list()
homep_list = list()

for ranking in df.index[:5]:
    sub_link = df['링크'][ranking]
    html = urlopen(sub_link)
    soup = BeautifulSoup(html, "lxml")
    
    p_tag = soup.find('p', 'addy')
    taginfo = p_tag.get_text().strip()
    
    # 메뉴가격
    menu_price = re.search('\$\d+\.(\d+)?', taginfo).group()
    
    # 카페주소
    cafe_addr = '일단보류' 

    # 전화번호
    cafe_tel = re.search('\d{3}[-]\d{3}[-]\d{4}', taginfo)
    if cafe_tel is not None:
        cafe_tel = cafe_tel.group()
    else:
        cafe_tel = 'No Contact'

    info_list = taginfo.split(', ')

    # 홈페이지
    if info_list[-1].endswith('.com') or info_list[-1].endswith('.net'):
        home_page = info_list[-1]
    else:
        home_page = 'No Homepage'

    # 카페주소
    idx = len(menu_price)
    cafe_addr = info_list[0][idx:].strip()    
    
    price_list.append(menu_price)
    cfadd_list.append(cafe_addr )
    cftel_list.append(cafe_tel  )
    homep_list.append(home_page )
    
    print('Top{} Page : Crawling is completed.'.format(ranking))
else:
    print('-'*50)
    print('크롤링 완료!!!')

Top1 Page : Crawling is completed.
Top2 Page : Crawling is completed.
Top3 Page : Crawling is completed.
Top4 Page : Crawling is completed.
Top5 Page : Crawling is completed.
--------------------------------------------------
크롤링 완료!!!


In [40]:
price_list

['$10.', '$9.', '$9.50', '$9.40', '$10.']

#### # 메뉴가격과 카페주소 정보를 수정보완 

In [41]:
menu_price = price_list[0]
if menu_price.endswith('.'):
    menu_price = menu_price[:-1]
menu_price

'$10'