<a href="https://colab.research.google.com/github/RNEequalRNA/youtube-data-acquisition/blob/main/sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample

In [1]:
import requests
import urllib
import copy
import re
import json
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
API_KEY = 'AIzaSyB2OJynYIkD7nW7ymSGtmkSHp9iMVN1K-M' # API 요청을 위한 키
BASE_URL = 'https://www.googleapis.com/youtube/v3/' # url prefix

In [3]:
# api 엔드포인트에 요청을 보내는 헬퍼 함수
def retrieve_api(url, params={}, data={}, headers={}, method='GET'):
    params = copy.deepcopy(params)
    params.update({
        'key': API_KEY
    })
    
    return requests.request(method, urllib.parse.urljoin(BASE_URL, url), params=params, data=data, headers=headers)

# API에서 가져온 데이터에서 텍스트만 추출
extract_commentThread_text = lambda item: item['snippet']['topLevelComment']['snippet']['textDisplay']
extract_caption_text = lambda item: re.sub('<(.|\n)*?>', '', item.text)

In [4]:
VIDEO_ID = 'zBkVCpbNnkU' # 비디오 ID

In [5]:
# 댓글 가져오기
resp = retrieve_api('commentThreads', params={
    'videoId': VIDEO_ID,
    'part': 'snippet'
})
resp

<Response [200]>

In [6]:
commentData = json.loads(resp.text)

In [14]:
# 텍스트만 추출
commentSeries = pd.Series(map(extract_commentThread_text, commentData['items']))
commentSeries

0     You want to learn more about science? Check ou...
1     The fact that they are paying ppl incentives t...
2     The US is the greatest, most superior country ...
3     <a href="https://www.youtube.com/watch?v=zBkVC...
4     There aren’t enough strawmans or ad hominem at...
5     Guys, you&#39;ve made it! Day 3 of no meat is ...
6     The biggest problem with vaccines is that the ...
7     Ignore rudy jean so her stupidity doesn&#39;t ...
8     Wake me up inside (save me)<br>Call my name an...
9                                   Antivaxers: eureka!
10    Being put on a ventilator is like being put in...
11    There is no argument, there is no debate. I si...
12    1iq anti vaccxers &#39;&#39;fake fake!!&#39;&#...
13    fauci<br> today, a pandemic is too important t...
14    alternate title: Kurzgesagt roasts anti vaxxer...
15    I declare end of war on rudy karen &quot;the s...
16                      Mista- PTSD makes you stronger!
17                              Still relevant i

In [8]:
# 자막 가져오기
resp = requests.get(f'https://video.google.com/timedtext?lang=en&v={VIDEO_ID}')
resp

<Response [200]>

In [9]:
# XML 파싱하여 트리 만들기
captionData = ET.fromstring(resp.text)

In [10]:
# 트리에서 실제 텍스트만 추출
captionSeries = pd.Series(map(extract_caption_text, [*captionData]))
captionSeries

0      Vaccines are celebrated\nfor their part in fig...
1                         But, a growing group of people
2      seem to believe that they endanger\nour health...
3      The Internet is full of stories\nabout allergi...
4                             the onset of disabilities,
                             ...                        
676    to eradicate the monsters that\nmost of us hav...
677                 Let&#39;s not bring the beasts back!
678                 Let&#39;s not bring the beasts back!
679                 Let&#39;s not bring the beasts back!
680                                                    ​
Length: 681, dtype: object

In [11]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [12]:
def get_words(st: str):
    st = re.sub('[^a-zA-Z\ ]', ' ', st) # 공백, a-z, A-Z만 남딤
    result = word_tokenize(st.lower()) # 소문자로 바꾸고, 토큰화
    result = [*filter(lambda x: x not in stop_words, result)] # stop words 제거
    result = [*map(lambda x: lemmatizer.lemmatize(x), result)] # 표제어 추출

    return result

# 간단한 multiset 구현
def counter(dt, vl):
    if vl in dt:
        dt[vl] += 1
    else:
        dt[vl] = 1

In [15]:
# 모든 단어에 빈도수 저장
dt = dict()
for tokenized in map(get_words, captionSeries):
   for word in tokenized:
       counter(dt, word)

# pandas.DataFrame으로 변환
words = []
freq = []
for key in dt:
    words.append(key)
    freq.append(dt[key])

df = pd.DataFrame.from_dict({
    'word': words,
    'freq': freq
})

# 빈도수 내림차순으로 정렬
df.sort_values(by='freq', ascending=False)

Unnamed: 0,word,freq
0,vaccine,108
202,measles,60
193,child,60
28,effect,56
34,immune,52
...,...,...
173,create,4
172,annoy,4
170,lab,4
169,cousin,4
