In [1]:
# requests: http를 이용해서 정보를 주고 받고 싶을 때 사용하는 라이브러리
# $ pip install requests or $ poetry add requests 후 사용
# requests 라이브러리로 웹페이지 접근하기
import requests


url = 'http://www.google.com/'
response = requests.get(url)
response

<Response [200]>

In [2]:
response.url

'http://www.google.com/'

In [3]:
response.text[:200]

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/goo'

In [4]:
response.status_code # 200은 성공이라는 의미, 400(클라 오류), 500(서버 오류)

200

In [6]:
# requests 를 활용한 api 데이터 수집
url = 'https://api.kurly.com/v2/categories?ver=1'
response = requests.get(url)
response

<Response [200]>

In [7]:
type(response.json())

dict

In [9]:
response_dict = response.json()
response_dict['data']['categories'][0]

{'no': '907',
 'name': '채소',
 'show_all_flag': True,
 'pc_icon_url': 'https://img-cf.kurly.com/shop/data/category/icon_veggies_inactive_pc@2x.1586324570.png',
 'icon_url': 'https://img-cf.kurly.com/shop/data/category/icon_veggies_inactive@3x.1586324413.png',
 'pc_icon_active_url': 'https://img-cf.kurly.com/shop/data/category/icon_veggies_active_pc@2x.1586324570.png',
 'icon_active_url': 'https://img-cf.kurly.com/shop/data/category/icon_veggies_active@3x.1586324413.png',
 'categories': [{'no': '907001', 'name': '고구마·감자·당근'},
  {'no': '907002', 'name': '시금치·쌈채소·나물'},
  {'no': '907003', 'name': '브로콜리·파프리카·양배추'},
  {'no': '907005', 'name': '양파·대파·마늘·배추'},
  {'no': '907004', 'name': '오이·호박·고추'},
  {'no': '907007', 'name': '냉동·이색·간편채소'},
  {'no': '907006', 'name': '콩나물·버섯'}]}

In [11]:
# 키 값이 'data'에 'categories' 의 값들 중에서 'name'만 뽑아보기
for item in response_dict['data']['categories']:
    print(item['name'])

채소
과일·견과·쌀
수산·해산·건어물
정육·계란
국·반찬·메인요리
샐러드·간편식
면·양념·오일
생수·음료·우유·커피
간식·과자·떡
베이커리·치즈·델리
건강식품
생활용품·리빙
뷰티·바디케어
주방용품
가전제품
베이비·키즈
반려동물


In [14]:
# api 접근할 때 token이 필요할 경우(ex.상품정보 페이지)
url = 'https://api.kurly.com/v3/home/products/66515?&ver=1613544143256'
response = requests.get(url)
response # 401 == 'unauthorized' 권한이 없음, 서버는 동작하는데 정보 전달 안됨

<Response [401]>

In [15]:
# hearders 부분 활용
from time import sleep


result_dict = {}
for item_no in range(66500, 66520):
    url = 'https://api.kurly.com/v3/home/products/{}?&ver=1613544143256'.format(item_no)
    headers = {'authorization': 'token'}
    response = requests.get(url, headers=headers)
    result_dict[item_no] = response.json()
    sleep(0.5)

In [17]:
for item in result_dict.keys():
    print(result_dict[item]['data']['name'], result_dict[item]['data']['discounted_price'])

KeyError: 'data'

In [18]:
result_dict[66520]

KeyError: 66520

In [19]:
# beautifulsoup 을 활용한 페이지 데이터 수집
# api가 존재하지 않는 사이트의 경우

In [24]:
from bs4 import BeautifulSoup
import lxml  # 속도가 빨라서 파싱용으로 많이 씀


url = 'https://en.wikipedia.org/wiki/Coronavirus_disease_2019'
response =requests.get(url)
response.status_code


200

In [25]:
response.text[:200]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Coronavirus disease 2019 - Wikipedia</title>\n<script>document.documentElement.className="client-js"'

In [27]:
# html문서 중 첫번째 div 붙은거 파싱하기 
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml') # html.parser  
                   # 파싱할 대상, 파싱 도구
toc_div = soup.find('div', attrs = {'id':'toc'})
            # 첫번째거 찾기, 'div' 중에서, 'id' = 'toc' 인 부분 
toc_div
# soup.find() soup에서 파싱할 대상 중 첫번째거 찾기
# soup.find_all() soup에서 파싱할 대상 모두 찾기
# soup.select()
# soup.select_all()


<div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation"><input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/><div class="toctitle" dir="ltr" lang="en"><h2 id="mw-toc-heading">Contents</h2><span class="toctogglespan"><label class="toctogglelabel" for="toctogglecheckbox"></label></span></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="#Signs_and_symptoms"><span class="tocnumber">1</span> <span class="toctext">Signs and symptoms</span></a></li>
<li class="toclevel-1 tocsection-2"><a href="#Cause"><span class="tocnumber">2</span> <span class="toctext">Cause</span></a>
<ul>
<li class="toclevel-2 tocsection-3"><a href="#Transmission"><span class="tocnumber">2.1</span> <span class="toctext">Transmission</span></a></li>
<li class="toclevel-2 tocsection-4"><a href="#Virology"><span class="tocnumber">2.2</span> <span class="toctext">Virology</span></a></li>
<li class="toclevel-2 tocsection-5"><a href="#SARS-CoV-2_

In [30]:
# 파싱한 것 중에서 li부분 텍스트로 가져오기 
contents_li = toc_div.find_all('li') 
for li in contents_li:
    print(li.text)

1 Signs and symptoms
2 Cause

2.1 Transmission
2.2 Virology
2.3 SARS-CoV-2 variants


2.1 Transmission
2.2 Virology
2.3 SARS-CoV-2 variants
3 Pathophysiology

3.1 Immunopathology
3.2 Viral and host factors

3.2.1 Virus proteins
3.2.2 Host factors


3.3 Host cytokine response


3.1 Immunopathology
3.2 Viral and host factors

3.2.1 Virus proteins
3.2.2 Host factors


3.2.1 Virus proteins
3.2.2 Host factors
3.3 Host cytokine response
4 Diagnosis

4.1 Viral testing
4.2 Imaging
4.3 Coding
4.4 Pathology


4.1 Viral testing
4.2 Imaging
4.3 Coding
4.4 Pathology
5 Prevention

5.1 Vaccine
5.2 Social distancing
5.3 Self-isolation
5.4 Face masks and respiratory hygiene
5.5 Hand-washing and hygiene
5.6 Surface cleaning
5.7 Ventilation and air filtration
5.8 Healthy diet and lifestyle


5.1 Vaccine
5.2 Social distancing
5.3 Self-isolation
5.4 Face masks and respiratory hygiene
5.5 Hand-washing and hygiene
5.6 Surface cleaning
5.7 Ventilation and air filtration
5.8 Healthy diet and lifestyle
6 Treatm