In [49]:
import requests
import urllib
import copy
import re
import json
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

In [50]:
API_KEY = 'AIzaSyB2OJynYIkD7nW7ymSGtmkSHp9iMVN1K-M' # API 요청을 위한 키
BASE_URL = 'https://www.googleapis.com/youtube/v3/' # url prefix

In [51]:
# api 엔드포인트에 요청을 보내는 헬퍼 함수
def retrieve_api(url, params={}, data={}, headers={}, method='GET'):
    params = copy.deepcopy(params)
    params.update({
        'key': API_KEY
    })
    
    return requests.request(method, urllib.parse.urljoin(BASE_URL, url), params=params, data=data, headers=headers)


In [52]:
extract_commentThread_text = lambda item: item['snippet']['topLevelComment']['snippet']['textDisplay']

def get_comments(videoId):
    pageToken = None
    result = []

    while True:
        resp = retrieve_api('commentThreads', params={
            'videoId': videoId,
            'part': 'snippet',
            'order': 'relevance',
            'pageToken': pageToken,
            'textFormat': 'plainText'
        })
        assert(resp.ok)
        
        commentData = json.loads(resp.text)

        if not "nextPageToken" in commentData:
            break
        pageToken = commentData["nextPageToken"]
        
        result += [*map(extract_commentThread_text, commentData['items'])]

        print(len(result), end=' ')
    
    return result

In [53]:
extract_caption_text = lambda item: re.sub('<(.|\n)*?>', '', item.text)

def get_captions(videoId):
    resp = requests.get(f'https://video.google.com/timedtext?lang=en&v={VIDEO_ID}')
    assert(resp.ok)

    captionData = ET.fromstring(resp.text)
    captions = []
    for text in map(extract_caption_text, [*captionData]):
        if len(captions) == 0 or captions[-1] != text:
            captions.append(text)

    return captions

In [54]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/hamerin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hamerin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hamerin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [55]:
def get_words(st: str):
    st = re.sub('[^a-zA-Z\ ]', '', st) # 공백, a-z, A-Z만 남딤
    result = word_tokenize(st.lower()) # 소문자로 바꾸고, 토큰화
    result = [*filter(lambda x: x not in stop_words, result)] # stop words 제거
    result = [*map(lambda x: lemmatizer.lemmatize(x), result)] # 표제어 추출

    return result

# 간단한 multiset 구현
def counter(dt, vl):
    if vl in dt:
        dt[vl] += 1
    else:
        dt[vl] = 1

In [56]:
def get_freq(data):
    # 모든 단어에 빈도수 저장
    dt = dict()
    for tokenized in map(get_words, data):
        for word in tokenized:
            counter(dt, word)

    # pandas.DataFrame으로 변환
    word = []
    freq = []
    for key in dt:
        word.append(key)
        freq.append(dt[key])

    df = pd.DataFrame.from_dict({
        'word': word,
        'freq': freq
    })

    # 빈도수 내림차순으로 정렬
    return df.sort_values(by='freq', ascending=False)

In [57]:
comments = get_comments('y-7UG0jORoA')

20 40 60 79 99 119 139 159 179 199 219 239 259 279 298 317 337 357 377 397 417 437 

In [58]:
with pd.option_context('display.max_rows', None):
    display(get_freq(comments))

Unnamed: 0,word,freq
92,child,75
218,biden,60
11,country,55
43,trump,48
29,u,46
102,america,43
59,people,33
359,one,32
145,come,32
204,like,30
