In [71]:
import math
import re
import time
from tqdm import trange, tqdm
import json
import re
import requests
import sys, os
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from pprint import pprint

from selenium import webdriver



normalize_pattern = re.compile('[\r\n\t]')
doublespcae_pattern = re.compile('[\s]+')

## 영화 idx 크롤링

In [11]:
current_dir = os.getcwd()
df = pd.read_csv(current_dir+"/data/original_title.csv")

In [35]:
df.head()

Unnamed: 0,id,title,type,original_title
0,0,모가디슈,Movie,
1,1,베놈,Movie,Venom
2,2,스파이더맨,Movie,Spider-Man
3,3,스파이더맨: 뉴 유니버스,Movie,Spider-Man: Into the Spider-Verse
4,4,스파이더맨: 홈커밍,Movie,Spider-Man: Homecoming


In [None]:
titles = list(df['title'].values)

In [54]:
idx = []
driver = webdriver.Chrome(current_dir+"/driver/chromedriver")

for title in titles:
    url = "https://movie.naver.com/movie/search/result.naver?section=movie&query={}".format(title)
    driver.get(url)
    driver.implicitly_wait(3)
    try:
        a = driver.find_element_by_css_selector(".search_list_1 li dl dt a")

        # url 에서 idx 뽑기
        url = a.get_attribute('href')
        idx.append(url.split('=')[-1])
        
    except:
        idx.append(np.nan)
driver.close()

In [56]:
len(idx)

199

In [57]:
df['idx'] = idx

In [64]:
na_idx = df[df['original_title'] == ' '].index
df.loc[na_idx, 'original_title'] = np.nan

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              199 non-null    int64 
 1   title           199 non-null    object
 2   type            199 non-null    object
 3   original_title  134 non-null    object
 4   idx             179 non-null    object
dtypes: int64(1), object(4)
memory usage: 7.9+ KB


In [67]:
df.to_csv(current_dir+'/data/original_title.csv')

### 영화 idx -> 영어 제목 크롤링

In [5]:
def text_normalize(s):
    """
    Arguments
    ---------
    s : str
        Text to normalize
    Returns
    -------
    normalized text. Remove \\n, \\r, \\t, double space
    """

    s = s.replace('&nbsp;', ' ')
    s = s.replace('\xa0', ' ')
    s = normalize_pattern.sub(' ', s)
    s = doublespcae_pattern.sub(' ', s)
    return s.strip()

In [6]:

basic_url_form = 'http://movie.naver.com/movie/bi/mi/basic.nhn?code={}' # idx

def scrap_basic(idx):
    url = basic_url_form.format(idx)
    soup = get_soup(url)
    infomation = {
        'movie_idx': idx,
        'title': title(soup),
        'e_title': e_title(soup)
    }
    return infomation

def title(soup):
    a = soup.select('div[class=mv_info] h3[class=h_movie] a')
    if not a:
        return ''
    return text_normalize(a[0].text)

def e_title(soup):
    strong = soup.select('div[class=mv_info] strong[class=h_movie2]')
    if not strong:
        return ''
    return text_normalize(strong[0].text)

## 평점 크롤러

In [100]:
# idx, type, page
# 공감순
comments_url_form = 'https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code={}&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N&order=sympathyScore&page={}'

def get_soup(url, headers=None, allow_redirects=True):
    """
    Argument
    --------
    url : str
        Web page url
    headers : dict or None
        Headers
    allow_redirects : Boolean
    Returns
    -------
    bs4.Beautifulsoup format HTML page
    """

    try:
        r = requests.get(url, headers=headers, allow_redirects=allow_redirects).text
        return BeautifulSoup(r, 'lxml')
    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno'  : exc_traceback.tb_lineno,
            'name'    : exc_traceback.tb_frame.f_code.co_name,
            'type'    : exc_type.__name__,
            'message' : str(e)
        }
        pprint(traceback_details)
        return ''

def scrap_comments(idx, limit=-1, sleep=0.05, last_time=None, i_movie=-1, n_total_movies=-1):
    max_page = num_of_comment_pages(idx)
    if limit > 0:
        max_page = min(limit, max_page)
    if max_page <= 0:
        return []

    if n_total_movies < 0 or i_movie < 0:
        desc = f'Scrap comments {idx}'
    else:
        desc = f'Scrap comments {idx} ({i_movie}/{n_total_movies})'

    comments = []
    for p in trange(1, max_page + 1, desc=desc):
        url = comments_url_form.format(idx, p)
        comments_p, stop = parse_a_page(get_soup(url), last_time)
        comments += comments_p
        if stop:
            print(f'\r  movie {idx}. stop scrap comments. found existing comments {p} / {max_page}')
            break
    return comments[::-1]

def parse_a_page(soup, last_time=None):
    comments = []
    stop = False
    for row in soup.select('div[class=score_result] li'):
        try:
            score = int(row.select('div[class=star_score] em')[0].text.strip())
            text = row.select('div[class=score_reple] p')[0].text.strip()
            # detach '관람객' icon
            if text[:4] == '관람객\n':
                text = text[4:].strip()
            # detach '스포일러' icon
            if text[:25] == '스포일러가 포함된 감상평입니다. 감상평 보기\n':
                text = text[25:].strip()

            comments.append(
                {'score': score,
                 'text': text
                })
        except Exception as e:
            continue
    return comments, stop

def num_of_comment_pages(idx):
    url = comments_url_form.format(idx, 1)
    soup = get_soup(url)

    try:
        num_comments = int(soup.select('div[class="score_total"] em')[-1].text.replace(',',''))
        return math.ceil(num_comments / 5)
    except Exception as e:
        return -1

In [70]:
df['idx'][0]

'192150'

In [77]:
df.head()

Unnamed: 0,id,title,type,original_title,idx
0,0,모가디슈,Movie,,192150
1,1,베놈,Movie,Venom,187323
2,2,스파이더맨,Movie,Spider-Man,208077
3,3,스파이더맨: 뉴 유니버스,Movie,Spider-Man: Into the Spider-Verse,171725
4,4,스파이더맨: 홈커밍,Movie,Spider-Man: Homecoming,135874


In [75]:
scrap_reviews = scrap_comments(192150, limit=10)

Scrap comments 192150: 100%|██████████| 10/10 [00:00<00:00, 12.72it/s]


In [101]:
merge_df = pd.DataFrame(columns=['movie_idx', 'score', 'text'])

In [102]:
for id in tqdm(df['idx']):
    scrap_reviews = pd.DataFrame(scrap_comments(id, limit=100))
    scrap_reviews['movie_idx'] = id
    merge_df = pd.concat([merge_df, scrap_reviews], axis=0)

Scrap comments 192150: 100%|██████████| 100/100 [00:08<00:00, 12.03it/s]
Scrap comments 187323: 100%|██████████| 100/100 [00:07<00:00, 13.00it/s]
Scrap comments 208077: 100%|██████████| 100/100 [00:08<00:00, 12.15it/s]
Scrap comments 171725: 100%|██████████| 100/100 [00:07<00:00, 12.95it/s]
Scrap comments 135874: 100%|██████████| 100/100 [00:08<00:00, 12.16it/s]
Scrap comments 193857: 100%|██████████| 45/45 [00:03<00:00, 14.55it/s]
Scrap comments 192614: 100%|██████████| 100/100 [00:07<00:00, 12.74it/s]
Scrap comments 173123: 100%|██████████| 100/100 [00:07<00:00, 12.51it/s]
Scrap comments 66823: 100%|██████████| 100/100 [00:07<00:00, 12.82it/s]
Scrap comments 188790: 100%|██████████| 100/100 [00:07<00:00, 13.47it/s]
Scrap comments 185614: 100%|██████████| 100/100 [00:07<00:00, 13.17it/s]
Scrap comments 35071: 100%|██████████| 100/100 [00:07<00:00, 13.02it/s]
Scrap comments 191917: 100%|██████████| 45/45 [00:03<00:00, 14.29it/s]
Scrap comments 94187: 100%|██████████| 100/100 [00:07<00:

In [104]:
merge_df.reset_index(drop=True, inplace=True)

In [105]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106943 entries, 0 to 106942
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_idx  106943 non-null  object
 1   score      106943 non-null  object
 2   text       106943 non-null  object
dtypes: object(3)
memory usage: 2.4+ MB


In [106]:
len(merge_df['movie_idx'].unique())

127

In [122]:
merge_df[merge_df['movie_idx']=='188343']

Unnamed: 0,movie_idx,score,text


In [114]:
merge_df.shape

(106943, 3)

In [115]:
112841-106943

5898

In [116]:
result_df = pd.merge(df, merge_df, left_on='idx', right_on='movie_idx', how='outer')

In [124]:
len(result_df[result_df['movie_idx'].isna()].title.unique())

68

In [121]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112841 entries, 0 to 112840
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              112841 non-null  int64 
 1   title           112841 non-null  object
 2   type            112841 non-null  object
 3   original_title  83489 non-null   object
 4   idx             112821 non-null  object
 5   movie_idx       112773 non-null  object
 6   score           112773 non-null  object
 7   text            112773 non-null  object
dtypes: int64(1), object(7)
memory usage: 7.7+ MB


In [120]:
df[df['idx'].isna()].shape

(20, 5)

In [125]:
result_df.to_csv(current_dir+'/data/naver_movie_reviews.csv', index=False)