> # CINE21 배우 정보 크롤링

## 라이브러리

In [1]:
import pymongo
from pprint import pprint

from bs4 import BeautifulSoup
import requests
import re

## MongoDB 연결

In [2]:
# Create a MongoDB client with the driver's default connection settings.
conn = pymongo.MongoClient()
# Handle to the "cine21" database.
db = conn["cine21"]
# Handle to the "actor" collection inside that database.
collection = db["actor"]

- URL: http://www.cine21.com/rank/person/content
- Method: POST
- Form Data
  - section: actor
  - period_start: 2019-11
  - gender: all
  - page: 1

## 크롤링

### 참고: 특수한 정규 표현식

- Greedy(`.*`) vs Non-Greedy(`.*?`)
- 연습: https://regexr.com
- 예시 HTML:

```html
<li><span class="tit">직업</span>배우</li>
```

In [3]:
# Accumulates one dict per actor; a later cell loads this list into MongoDB.
actors_info_list = []

# Ranking endpoint and the form fields POSTed to it.
cine21_url = 'http://www.cine21.com/rank/person/content'
post_data = {
    'section': 'actor',
    'period_start': '2019-11',
    'gender': 'all',
}

# Site root; each actor's detail-page href is appended to it.
url = 'http://www.cine21.com'

# Per-request timeout in seconds, so a stalled connection cannot hang the crawl.
REQUEST_TIMEOUT = 30

# Patterns are raw strings (avoids invalid-escape warnings) and are compiled
# once, outside the loops.
PAREN_SUFFIX_RE = re.compile(r'\(\w*\)')              # "(word)" part of the name text
SPAN_ELEMENT_RE = re.compile(r'<span.*?>.*?</span>')  # a whole <span>...</span> element
ANY_TAG_RE = re.compile(r'<.*?>')                     # any remaining markup tag

# Visit ranking pages 1 through 20.
for i in range(1, 21):

    print(f">> Page: {i} Start")
    post_data['page'] = i

    res = requests.post(cine21_url, data=post_data, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.content, 'html.parser')
    # Four parallel selections; entries are matched up by position below.
    actors = soup.select('li.people_li div.name')
    hits = soup.select('ul.num_info li strong')
    movies = soup.select('ul.mov_list')
    rankings = soup.select('li.people_li span.grade')

    for index, actor in enumerate(actors):

        # Actor name with the parenthesised part removed.
        actor_nm = PAREN_SUFFIX_RE.sub('', actor.text)
        # Box-office index: drop thousands separators before converting.
        actor_hit = int(hits[index].text.replace(',', ''))
        # Ranking text, stored as scraped.
        actor_ranking = rankings[index].text
        # Titles of the movies listed for this actor.
        movie_title_list = [
            movie_title.text
            for movie_title in movies[index].select('li a span')
        ]

        # Fetch and parse the actor's detail page.
        actor_link = url + actor.select_one("a").attrs['href']
        response_actor = requests.get(actor_link, timeout=REQUEST_TIMEOUT)
        response_actor.raise_for_status()
        soup_actor = BeautifulSoup(response_actor.content, 'html.parser')
        default_info = soup_actor.select_one('ul.default_info')
        # A detail page without the info list still yields the basic record
        # rather than raising AttributeError on None.
        actor_details = default_info.select('li') if default_info is not None else []

        # Record for this actor; the detail fields are added below.
        actor_info_dict = {
            '이름': actor_nm,
            '흥행지수': actor_hit,
            '랭킹': actor_ranking,
            '출연영화': movie_title_list,
        }
        for actor_item in actor_details:

            # Field name comes from the item's <span class="tit"> label.
            field_tag = actor_item.select_one('span.tit')
            if field_tag is None:
                # No label means there is no key to store the value under.
                continue
            actor_item_field = field_tag.text

            # Value: strip the span element(s) first, then any remaining tags,
            # leaving only the item's text.
            actor_item_value = SPAN_ELEMENT_RE.sub('', str(actor_item))
            actor_item_value = ANY_TAG_RE.sub('', actor_item_value)

            actor_info_dict[actor_item_field] = actor_item_value

        actors_info_list.append(actor_info_dict)

>> Page: 1 Start
>> Page: 2 Start
>> Page: 3 Start
>> Page: 4 Start
>> Page: 5 Start
>> Page: 6 Start
>> Page: 7 Start
>> Page: 8 Start
>> Page: 9 Start
>> Page: 10 Start
>> Page: 11 Start
>> Page: 12 Start
>> Page: 13 Start
>> Page: 14 Start
>> Page: 15 Start
>> Page: 16 Start
>> Page: 17 Start
>> Page: 18 Start
>> Page: 19 Start
>> Page: 20 Start


## MongoDB에 데이터 적재

In [4]:
# Bulk-insert every crawled actor record into the collection in one call.
collection.insert_many(documents=actors_info_list)

<pymongo.results.InsertManyResult at 0x20713edf740>

## 적재된 Documents 수 확인

In [5]:
# An empty filter matches every document, so this is the collection's total count.
collection.count_documents(filter={})

140