<a href="https://colab.research.google.com/github/RNEequalRNA/youtube-data-acquisition/blob/main/sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample

In [1]:
import requests
import urllib
import copy
import re
import json
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
API_KEY = 'AIzaSyB2OJynYIkD7nW7ymSGtmkSHp9iMVN1K-M' # API 요청을 위한 키
BASE_URL = 'https://www.googleapis.com/youtube/v3/' # url prefix

In [3]:
# api 엔드포인트에 요청을 보내는 헬퍼 함수
def retrieve_api(url, params={}, data={}, headers={}, method='GET'):
    params = copy.deepcopy(params)
    params.update({
        'key': API_KEY
    })
    
    return requests.request(method, urllib.parse.urljoin(BASE_URL, url), params=params, data=data, headers=headers)

# API에서 가져온 데이터에서 텍스트만 추출
extract_commentThread_text = lambda item: item['snippet']['topLevelComment']['snippet']['textDisplay']
extract_caption_text = lambda item: re.sub('<(.|\n)*?>', '', item.text)

In [4]:
VIDEO_ID = 'y-7UG0jORoA' # 비디오 ID

In [5]:
# 댓글 가져오기
pageToken = None
comments = []

while True:
  resp = retrieve_api('commentThreads', params={
      'videoId': VIDEO_ID,
      'part': 'snippet',
      'order': 'relevance',
      'pageToken': pageToken,
      'textFormat': 'plainText'
  })

  if not resp.ok:
    break

  commentData = json.loads(resp.text)
  
  if not "nextPageToken" in commentData:
    break

  pageToken = commentData["nextPageToken"]
  comments += [*map(extract_commentThread_text, commentData['items'])]

  print(len(comments), end=' ')

20 39 59 79 99 119 139 159 179 199 219 239 259 277 297 317 337 357 377 397 417 437 

In [7]:
commentSeries = pd.Series(comments)

In [8]:
# 자막 가져오기
resp = requests.get(f'https://video.google.com/timedtext?lang=en&v={VIDEO_ID}')
resp

<Response [200]>

In [9]:
# XML 파싱하여 트리 만들기
captionData = ET.fromstring(resp.text)

ParseError: no element found: line 1, column 0 (<string>)

In [None]:
# 트리에서 실제 텍스트만 추출
captions = []
for text in map(extract_caption_text, [*captionData]):
    if len(captions) == 0 or captions[-1] != text:
        captions.append(text)

captionSeries = pd.Series(captions)
captionSeries

0      Vaccines are celebrated\nfor their part in fig...
1                         But, a growing group of people
2      seem to believe that they endanger\nour health...
3      The Internet is full of stories\nabout allergi...
4                             the onset of disabilities,
                             ...                        
336    And, the biggest side effect\nof vaccines is f...
337    Vaccines are one of\nthe most powerful tools w...
338    to eradicate the monsters that\nmost of us hav...
339                 Let&#39;s not bring the beasts back!
340                                                    ​
Length: 341, dtype: object

In [10]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/hamerin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hamerin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/hamerin/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [11]:
def get_words(st: str):
    st = re.sub('[^a-zA-Z\ ]', '', st) # 공백, a-z, A-Z만 남딤
    result = word_tokenize(st.lower()) # 소문자로 바꾸고, 토큰화
    result = [*filter(lambda x: x not in stop_words, result)] # stop words 제거
    result = [*map(lambda x: lemmatizer.lemmatize(x), result)] # 표제어 추출

    return result

# 간단한 multiset 구현
def counter(dt, vl):
    if vl in dt:
        dt[vl] += 1
    else:
        dt[vl] = 1

In [15]:
# 모든 단어에 빈도수 저장
dt = dict()
for tokenized in map(get_words, commentSeries):
   for word in tokenized:
       counter(dt, word)

# pandas.DataFrame으로 변환
words = []
freq = []
for key in dt:
    words.append(key)
    freq.append(dt[key])

df = pd.DataFrame.from_dict({
    'word': words,
    'freq': freq
})

# 빈도수 내림차순으로 정렬
with pd.option_context('display.max_rows', None):
  display(df.sort_values(by='freq', ascending=False))

Unnamed: 0,word,freq
56,child,75
11,country,55
126,biden,55
381,trump,46
30,u,46
112,america,44
240,people,33
302,one,33
185,come,33
70,bbc,30
