### 빅뱅이론 데이터 수집 및 분석
* 빅뱅이론(The Big Bang Theory)을 검색하여\
    각 회차의 평점, 해당 시즌, 상영 일자를 조회하여 데이터를 분석하자.
* URL을 사용하지 말고, Selenium을 사용한다.


In [None]:
import pandas as pd
import numpy as np
import time
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import koreanize_matplotlib

from datetime import datetime
from tqdm.notebook import tqdm
import xlsxwriter


In [None]:
# Reformat an air-date string such as "Mon, Sep 24, 2007"
# into the "YYYY.MM.DD(Day)" layout, e.g. "2007.09.24(Mon)".
def convert_us_date_to_korean(date_str):
    """Return *date_str* re-rendered with the ``'%Y.%m.%d(%a)'`` format.

    The input must match ``'%a, %b %d, %Y'`` (abbreviated weekday,
    abbreviated month name, day of month, four-digit year);
    ``datetime.strptime`` raises ``ValueError`` for anything else.
    Day and month abbreviations follow the active locale.
    """
    return datetime.strptime(date_str, '%a, %b %d, %Y').strftime('%Y.%m.%d(%a)')

In [None]:
# Launch Chrome through the chromedriver binary at ../driver and open IMDb.
imdb = webdriver.Chrome(service=Service("../driver/chromedriver"))
imdb.maximize_window()
imdb.get("https://www.imdb.com/")

# `actions` is reused by the scraping cell to move the pointer onto the
# next-season button before clicking it.
actions = ActionChains(imdb)
# Implicit wait: element lookups are retried for up to 3 seconds.
imdb.implicitly_wait(3)

# Type the show name into the search box and submit it with Enter.
search_box = imdb.find_element(By.ID, "suggestion-search")
search_box.send_keys("The Big Bang Theory")
search_box.send_keys(Keys.ENTER)

# Click the first element carrying the result-title class
# (find_element returns the first match).
imdb.find_element(By.CLASS_NAME, "ipc-metadata-list-summary-item__t").click()

# Click the element tagged "episode-guide-text" (presumably the link to the
# episode guide - confirm against the live page; the "sc-..." hash classes
# look auto-generated and may change).
# WebElement.click() returns None, so its result is deliberately not stored.
imdb.find_element(By.CLASS_NAME, "sc-1371769f-1.cTKSOF.episode-guide-text").click()

In [None]:
# Scrape every episode listed under each season tab of the episode guide.
#
# Outputs:
#   season_info_list  - one list of episode rows per season tab visited
#   episode_info_list - every episode row in a single flat list
#                       (consumed by the DataFrame cell)
# Each row is [season, episode, title, rating, vote, date, content].
# The browser is always closed at the end, even if a lookup fails.
season_info_list = []
episode_info_list = []
season_class = ["ipc-tabs.ipc-tabs--base.ipc-tabs--align-left.ipc-tabs--display-chip.ipc-tabs--inherit",  # season tab box
                "ipc-tab.ipc-tab-link.ipc-tab--on-base",    # seasons
                "sc-f2169d65-4.kDAvos"                      # episode info
                ]
info_class = ["ipc-title__text",                            # title
              "ipc-html-content-inner-div",                 # content
              "sc-f2169d65-10.iZXnmI",                      # release date
              "sc-e2dbc1a3-0.ajrIH.sc-282bae8e-3.bXuGWE"]   # rating
next_page_btn = "ipc-icon.ipc-icon--chevron-right.ipc-btn__icon.ipc-btn__icon--post"  # right page btn

# "S<season>.E<episode>" prefix at the start of an episode heading.
head_pattern = r"S(\d+)\.E(\d+)"
# Text enclosed in parentheses (used to pull the vote count out of the review text).
rate_pattern = r"\((.*?)\)"

try:
    season_box = imdb.find_element(By.CLASS_NAME, season_class[0])       # season tab box
    seasons = season_box.find_elements(By.CLASS_NAME, season_class[1])   # seasons
    last_season_index = len(seasons) - 1

    for season_index, _season in enumerate(tqdm(seasons)):
        # Fixed pause before reading the page, giving the episode list time to load.
        time.sleep(3)
        episode_list = imdb.find_elements(By.CLASS_NAME, season_class[2])    # episodes

        # Rows of the season currently displayed; kept separate so that
        # season_info_list really holds one list per season.
        season_rows = []
        for episode in episode_list:
            title = episode.find_element(By.CLASS_NAME, info_class[0]).text      # head: season, episode, title
            content = episode.find_element(By.CLASS_NAME, info_class[1]).text    # content
            date = episode.find_element(By.CLASS_NAME, info_class[2]).text       # aired date
            # Review text split by line: line 0 is the rating,
            # line 2 carries the vote count in parentheses.
            review = episode.find_element(By.CLASS_NAME, info_class[3]).text.split(sep="\n")

            head = re.match(head_pattern, title)
            if head:
                head_season = head.group(1)
                head_episode = head.group(2)
                # Drop the first two space-separated tokens (the S/E prefix and
                # the separator that follows it), keeping the bare title.
                title = title.split(sep=" ", maxsplit=2)[2]
            else:
                # NOTE(review): headings without an S/E prefix are hard-coded to
                # season 12, episode 25 - presumably one known listing; confirm.
                head_season = 12
                head_episode = 25

            date = convert_us_date_to_korean(date)
            rating = review[0]
            vote = re.search(rate_pattern, review[2]).group(1)

            season_rows.append([head_season, head_episode, title, rating, vote, date, content])

        season_info_list.append(season_rows)
        episode_info_list.extend(season_rows)

        # Nothing to advance to after the last season tab.
        if season_index == last_season_index:
            break

        next_page = imdb.find_element(By.CLASS_NAME, next_page_btn)
        actions.move_to_element(next_page).perform()
        next_page.click()
finally:
    # Close the browser whether scraping finished or raised.
    imdb.quit()

In [None]:
# Build the episode table from the scraped rows and convert column types.
def _votes_in_thousands(text):
    """Convert a vote label to a float expressed in thousands of votes.

    "12K" -> 12.0, "1.2M" -> 1200.0, "987" -> 0.987. Keeping one unit for
    every row avoids mixing "K"-suffixed and plain counts in the same column.
    """
    label = text.strip().replace(",", "")
    suffix = label[-1:].upper()
    if suffix == "K":
        return float(label[:-1])
    if suffix == "M":
        return float(label[:-1]) * 1000
    return float(label) / 1000


# Passing `columns=` up front also yields a well-formed (empty) frame when
# nothing was scraped, instead of failing on a column-count mismatch.
episode_data = pd.DataFrame(
    episode_info_list,
    columns=["Sea", "Epi", "Title", "Rating", "Vote", "Date", "Content"],
)
episode_data["Sea"] = episode_data["Sea"].astype(int)
episode_data["Epi"] = episode_data["Epi"].astype(int)
episode_data["Rating"] = episode_data["Rating"].astype(float)
# Vote is stored in thousands of votes.
episode_data["Vote"] = episode_data["Vote"].map(_votes_in_thousands).astype(float)
# The season number becomes the index; later cells pivot on it by name.
episode_data.set_index("Sea", inplace=True)
episode_data_sorted = episode_data.sort_values(by=["Sea", "Epi"], ascending=True)
episode_data_sorted

In [None]:
# Export the episode table to ../data/episode_data.xlsx with basic column
# formatting. The context manager saves and closes the workbook even if
# writing or formatting raises.
with pd.ExcelWriter("../data/episode_data.xlsx", engine='xlsxwriter') as writer:
    episode_data.to_excel(writer, sheet_name='Sheet1')
    worksheet = writer.sheets['Sheet1']

    centered_format = writer.book.add_format({'align': 'center', 'valign': 'vcenter'})

    # Column A is the "Sea" index, followed by Epi, Title, Rating, Vote, Date.
    worksheet.set_column('A:A', 4, centered_format)
    worksheet.set_column('B:B', 4, centered_format)
    worksheet.set_column('C:C', 25,)
    worksheet.set_column('D:D', 4, centered_format)
    worksheet.set_column('E:E', 4, centered_format)
    worksheet.set_column('F:F', 12,)

In [None]:
# Heatmap of episode ratings: one row per season, one column per episode number.
plt.figure(figsize=(15, 7))

# Season x episode grid of the "Rating" values.
pivot = episode_data.pivot_table(index=["Sea"], columns=["Epi"], values=["Rating"])

# Tick labels: episode numbers along x, season numbers along y.
yticklabels = pivot.index.unique().tolist()
xticklabels = pivot.columns.get_level_values('Epi').unique().tolist()

heatmap_options = dict(
    annot=True,
    lw=.5,
    cmap='RdPu',
    cbar_kws={"label": "Rating"},
    xticklabels=xticklabels,
    yticklabels=yticklabels,
)
sns.heatmap(pivot, **heatmap_options)

plt.title('시즌별 에피소드 평점')
plt.xlabel('Episode')
plt.ylabel('Season')
plt.autoscale()
plt.tight_layout(pad=5)
plt.show()

In [None]:
# Heatmap of episode vote counts: one row per season, one column per episode number.
plt.figure(figsize=(15,7))

# Season x episode grid of the "Vote" values.
pivot = pd.pivot_table(data=episode_data, index=["Sea"], columns=["Epi"], values=["Vote"])

# Tick labels: episode numbers along x, season numbers along y.
xticklabels = pivot.columns.get_level_values('Epi').unique().tolist()
yticklabels = pivot.index.unique().tolist()

sns.heatmap(pivot, annot=True, lw=.5, cmap='RdPu', cbar_kws={"label": "Vote"},
            xticklabels=xticklabels, yticklabels=yticklabels)

plt.xlabel('Episode')
plt.ylabel('Season')
# The title names votes: this chart plots the "Vote" column, not ratings.
plt.title('시즌별 에피소드 투표수')
plt.autoscale()
plt.tight_layout(pad=5)
plt.show()