## Collect the data from YouTube

### Scraping music video list & number of views

With Selenium and Firefox,
 * Scrape K-pop music videos on YouTube
 * Scrape the number of views for a given video

#### Import prerequisites

In [1]:
import os
import random
import time
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import pandas as pd
from selenium.webdriver import Firefox

In [2]:
# Initialize browser: create the Selenium-driven Firefox instance.
# This single `browser` object is passed to every scraping function below.
browser = Firefox()

#### Collect video list & detail

In [3]:
def collect_videos(browser, scrolls=7, pause=2):
    """Return the videos listed on the YouTube K-pop music-video search page.

    Opens the search-results page, scrolls down ``scrolls`` times so that
    more results get loaded, then reads every element whose id is
    ``video-title``.

    Args:
        browser: Selenium WebDriver instance used to drive the page.
        scrolls: How many times to scroll down to load additional results.
        pause: Seconds to wait after opening the page and after each scroll.

    Returns:
        A list of dicts with the keys ``video_id``, ``title`` and ``url``,
        one per result element that carries a link.
    """
    print(f'collecting start - {datetime.now()}')
    url = 'https://www.youtube.com/results?search_query=kpop+music+video&sp=CAMSBggEEAEYAQ%253D%253D'
    browser.get(url)
    time.sleep(pause)

    # scroll down several times, to get more videos
    for _ in range(scrolls):
        browser.execute_script("window.scrollTo(0, window.scrollY + 200000)")
        time.sleep(pause)

    # The generic find_elements(strategy, value) call is used because the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    videolinks = browser.find_elements('id', 'video-title')
    videos = [(link.text, link.get_attribute('href')) for link in videolinks]

    result = []
    for title, href in videos:
        if not href:
            # An element without a link has no video id to extract.
            continue
        # Prefer the `v` query parameter so that trailing parameters such as
        # `&t=10s` do not end up in the id; otherwise keep the old behaviour.
        video_id = parse_qs(urlparse(href).query).get(
            'v', [href.split('=')[-1]])[0]
        result.append({
            'video_id': video_id,
            'title': title,
            'url': href,
        })

    print(f'{len(result)} videos were collected. - {datetime.now()}')
    return result


In [4]:
# Scrape the search results; each entry is a dict with the keys
# 'video_id', 'title' and 'url'.
videos = collect_videos(browser)

collecting start - 2019-07-09 12:15:52.415103
156 videos were collected. - 2019-07-09 12:16:13.039244


In [5]:
videos[:3]

[{'video_id': 'ZdKYi5ekshM',
  'title': 'TWICE「Breakthrough」Music Video',
  'url': 'https://www.youtube.com/watch?v=ZdKYi5ekshM'},
 {'video_id': '3n9rDwpa6QA',
  'title': 'TWICE「HAPPY HAPPY」Music Video',
  'url': 'https://www.youtube.com/watch?v=3n9rDwpa6QA'},
 {'video_id': 'uAjn3-c9boc',
  'title': 'BANNED KPOP Music Videos for Stupid Reasons',
  'url': 'https://www.youtube.com/watch?v=uAjn3-c9boc'}]

In [6]:
# Keep only the URLs of the collected videos; the detail scraping below
# works on a plain list of URLs.
urls = [video['url'] for video in videos]
# Preview the first three URLs.
urls[:3]

['https://www.youtube.com/watch?v=ZdKYi5ekshM',
 'https://www.youtube.com/watch?v=3n9rDwpa6QA',
 'https://www.youtube.com/watch?v=uAjn3-c9boc']

In [7]:
# Alternatively, load the URLs from a text file instead of scraping them.
# header=None: no row of the file is treated as a header, so the column is
# named explicitly afterwards (presumably one URL per line - verify the file).
df = pd.read_csv('./data/new_0709.txt', header=None)
df.columns=['URL']

In [8]:
# Drop duplicate URLs. Note that this rebinds `urls`, replacing the list
# built from the search results above.
urls = df['URL'].unique()
len(urls)

4

In [9]:
def collect_video_detail(browser, urls, start_point=0, pause=3):
    """Return title and publish date scraped from each video page.

    Args:
        browser: Selenium WebDriver instance used to open the pages.
        urls: Iterable of video page URLs.
        start_point: Number of leading URLs to skip, e.g. to resume an
            interrupted run. It refers to the position inside ``urls``.
        pause: Seconds to wait after opening a page and after scrolling.

    Returns:
        A list of dicts with the keys ``video_id``, ``published``, ``title``
        and ``url``, one per URL that was processed successfully. A URL that
        fails is reported on stdout and skipped.
    """
    result = []
    print(f'collecting start - {datetime.now()}')

    # enumerate() keeps the printed index equal to the position in `urls`
    # even when a page fails, so it can be reused as `start_point` later.
    for i, url in enumerate(urls):
        if i < start_point:
            continue

        try:
            # open the web page
            browser.get(url)
            time.sleep(pause)

            browser.execute_script("window.scrollTo(0, window.scrollY + 720)")
            time.sleep(pause)

            # parsing the values
            # (generic find_element(strategy, value): the find_element_by_*
            # helpers were removed in Selenium 4.3)
            title = browser.find_element('css selector', 'h1.title').text
            published = browser.find_element('css selector', 'span.date').text
            # Prefer the `v` query parameter so extra parameters such as
            # `&t=10s` do not end up in the id.
            video_id = parse_qs(urlparse(url).query).get(
                'v', [url.split('=')[-1]])[0]
        except Exception as exc:
            # Best effort: report the failing page and go on with the next
            # one. KeyboardInterrupt is deliberately not caught, so a long
            # run can still be stopped by the user.
            print(f'some errors happens for {url}: {exc!r}')
            continue

        result.append({
            'video_id': video_id,
            'published': published,
            'title': title,
            'url': url,
        })
        print(f'{datetime.now()} {i} - {title} has been processed.')

    return result


In [10]:
# Visit every URL and collect its title and publish date.
video_detail = collect_video_detail(browser, urls)

collecting start - 2019-07-09 12:16:13.132644
2019-07-09 12:16:21.195089 0 - [MV] HA SUNG WOON(하성운) _ BLUE has been processed.
2019-07-09 12:16:29.534383 1 - [MV] MeloMance(멜로망스) _ You&I(인사) has been processed.
2019-07-09 12:16:37.823031 2 - [MV] WH3N(웬) _ I've Been Thinking About It A Lot(무엇이 그토록 그대를) (Short Film) has been processed.
2019-07-09 12:16:45.862612 3 - [MV] Stella Jang(스텔라장) _ YOLO has been processed.


In [11]:
video_detail[:3]

[{'video_id': 'Wiq0J7_jHI8',
  'published': 'Published on Jul 8, 2019',
  'title': '[MV] HA SUNG WOON(하성운) _ BLUE',
  'url': 'https://www.youtube.com/watch?v=Wiq0J7_jHI8'},
 {'video_id': '-zVL1JRDYxg',
  'published': 'Published on Jul 9, 2019',
  'title': '[MV] MeloMance(멜로망스) _ You&I(인사)',
  'url': 'https://www.youtube.com/watch?v=-zVL1JRDYxg'},
 {'video_id': 'dr-L5QLewIk',
  'published': 'Published on Jul 7, 2019',
  'title': "[MV] WH3N(웬) _ I've Been Thinking About It A Lot(무엇이 그토록 그대를) (Short Film)",
  'url': 'https://www.youtube.com/watch?v=dr-L5QLewIk'}]

#### Get detailed information for a video

In [12]:
# Select the video to inspect in the cells below: the first URL of the list.
url = urls[0]

# Alternative: uncomment to inspect one specific, hard-coded video instead.
#url = 'https://www.youtube.com/watch?v=kZOZ5nJ5lI0'

In [13]:
# open YouTube page
browser.get(url)
time.sleep(1)
# Scroll down a little and wait again; presumably this lets the section
# below the player (date, comment count) load - TODO confirm.
browser.execute_script("window.scrollTo(0, window.scrollY + 320)")
time.sleep(2)

In [14]:
# published date
# (generic find_element(strategy, value): the find_element_by_* helpers
# were removed in Selenium 4.3)
browser.find_element('css selector', 'span.date').text

'Published on Jul 8, 2019'

In [15]:
# number of comments
# (generic find_element(strategy, value): the find_element_by_* helpers
# were removed in Selenium 4.3)
browser.find_element('class name', 'count-text').text

'3,189 Comments'

In [16]:
# number of likes
# The CSS selector targets the text node inside the icon toggle button.
selector = 'ytd-toggle-button-renderer.style-text[is-icon-button]'
selector += ' #text.ytd-toggle-button-renderer'

# Generic find_element(strategy, value): the find_element_by_* helpers
# were removed in Selenium 4.3.
ele = browser.find_element('css selector', selector)

In [17]:
# The like count is read from the element's aria-label attribute.
ele.get_attribute('aria-label')

'71,121 likes'

#### Get comments for video

In [18]:
def get_comments(browser, url, scrolls=20, load_wait=2, scroll_wait=1):
    """Return the comments found on the page of one video.

    Opens ``url``, scrolls down ``scrolls`` times, then reads the text of
    every element whose id is ``content-text``.

    Args:
        browser: Selenium WebDriver instance used to open the page.
        url: URL of the video page.
        scrolls: How many times to scroll down before reading the comments.
        load_wait: Seconds to wait after opening the page.
        scroll_wait: Seconds to wait after each scroll.

    Returns:
        A list of dicts with the keys ``video_id`` and ``comment``.
    """
    print(f'collecting start - {datetime.now()}')

    browser.get(url)
    time.sleep(load_wait)

    for _ in range(scrolls):
        browser.execute_script("window.scrollTo(0, window.scrollY + 720)")
        time.sleep(scroll_wait)

    # Generic find_elements(strategy, value): the find_elements_by_* helpers
    # were removed in Selenium 4.3.
    comments = browser.find_elements('id', 'content-text')
    # Prefer the `v` query parameter so extra parameters such as `&t=10s`
    # do not end up in the id; otherwise keep the old behaviour.
    video_id = parse_qs(urlparse(url).query).get(
        'v', [url.split('=')[-1]])[0]

    results = [
        {'video_id': video_id, 'comment': comment.text}
        for comment in comments
    ]

    print(f'{datetime.now()} - {video_id} : {len(comments)} collected')
    return results

In [19]:
# Collect the comments of the video selected above; each record is a dict
# with the keys 'video_id' and 'comment'.
comments = get_comments(browser, url)

collecting start - 2019-07-09 12:16:51.256697
2019-07-09 12:17:17.541787 - Wiq0J7_jHI8 : 120 collected


In [20]:
comments[:3]

[{'video_id': 'Wiq0J7_jHI8',
  'comment': "🔈1theK가 제작한 '1theK Originals-원더케이 오리지널' 채널이 오픈되었습니다:) 많은 관심과 구독 부탁드려요😉\n\n🔈NEW YouTube Channel '1theK Originals' has been launched! Please take a lot of interest and SUBSCRIBE!\n\n\nSubscribe👉 https://www.youtube.com/channel/UCqq-ovGE01ErlXakPihhKDA?sub_confirmation=1"},
 {'video_id': 'Wiq0J7_jHI8',
  'comment': 'I love that Sungwoon can make music THAT HE LOVES while earning money that he DESERVES!'},
 {'video_id': 'Wiq0J7_jHI8',
  'comment': 'HOW MANY INTERNATIONAL FANS ARE HERE TO ALWAYS SUPPORT HA SUNGWWON???\n\n |\n |\n\\/'}]

#### Get number of views

In [21]:
def get_view_count(browser, url, wait=4):
    """Return the current view count of a video together with a timestamp.

    Args:
        browser: Selenium WebDriver instance used to open the page.
        url: URL of the video page.
        wait: Seconds to wait after opening the page before reading it.

    Returns:
        A dict with the keys ``view_count`` (int) and ``timestamp`` (the
        ``datetime`` taken right before the counter is read). The count is
        0 when the counter element cannot be read or contains no digits.
    """
    browser.get(url)
    time.sleep(wait)
    now = datetime.now()
    try:
        # Generic find_element(strategy, value): the find_element_by_*
        # helpers were removed in Selenium 4.3.
        text = browser.find_element('css selector', 'span.view-count').text
    except Exception as exc:
        # Best effort: a missing counter yields 0 instead of aborting the
        # whole collection run. KeyboardInterrupt is not swallowed.
        print(f'error in get view_count: {exc!r}')
        text = ''

    # isdecimal() matches exactly the characters int() accepts; the guard
    # avoids int('') for texts without any digit (e.g. "No views").
    digits = ''.join(ch for ch in text if ch.isdecimal())
    return {
        'view_count': int(digits) if digits else 0,
        'timestamp': now,
    }

In [22]:
# Shuffle the order of the videos. random.shuffle reorders `urls` in place
# and returns None, so there is nothing to assign.
random.shuffle(urls) 

In [23]:
urls[0]

'https://www.youtube.com/watch?v=dr-L5QLewIk'

In [24]:
# Take one view-count snapshot per video and accumulate them in `result`.
# The outer loop only exists so the cell can be switched to endless polling
# by toggling the two marked lines at its end.
result = []
while True:
    for url in urls:
        print(f'{datetime.now()} - {url}')
        vid = url.split('=')[-1]
        snapshot = get_view_count(browser, url)
        result.append({
            'video_id': vid,
            'view_count': snapshot['view_count'],
            'timestamp': snapshot['timestamp'],
        })
#    time.sleep(600)   # uncomment if you want to run forever
    break              # comment if you want to run forever
print(f'{datetime.now()} - finished.')

2019-07-09 12:17:17.614647 - https://www.youtube.com/watch?v=dr-L5QLewIk
2019-07-09 12:17:23.333358 - https://www.youtube.com/watch?v=I02VtSDmF18
2019-07-09 12:17:29.285318 - https://www.youtube.com/watch?v=-zVL1JRDYxg
2019-07-09 12:17:34.984842 - https://www.youtube.com/watch?v=Wiq0J7_jHI8
2019-07-09 12:17:40.527328 - finished.


In [25]:
result[:3]

[{'video_id': 'dr-L5QLewIk',
  'view_count': 14860,
  'timestamp': datetime.datetime(2019, 7, 9, 12, 17, 23, 314596)},
 {'video_id': 'I02VtSDmF18',
  'view_count': 36842,
  'timestamp': datetime.datetime(2019, 7, 9, 12, 17, 29, 236563)},
 {'video_id': '-zVL1JRDYxg',
  'view_count': 47336,
  'timestamp': datetime.datetime(2019, 7, 9, 12, 17, 34, 820920)}]

In [26]:
# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
browser.quit()

### Done !!!