## Collect the data from YouTube

### Scrape the music video list & number of views

With Selenium and Firefox:
 * Scrape K-pop music videos on YouTube
 * Scrape the number of views for a given video

#### Import prerequisites

In [1]:
from selenium.webdriver import Firefox
from pymongo import MongoClient 
from datetime import datetime

import pandas as pd
import random
import time
import os

#### Connect to MongoDB hosted in the cloud

The connection goes through an SSH tunnel on local port 47017.

In [2]:
# Connection settings kept in one place: the client is given only a port,
# and the working database is selected by name.
MONGO_PORT = 47017
DB_NAME = 'youtube_scrap'

connection = MongoClient(port=MONGO_PORT)
db = connection[DB_NAME]

In [3]:
# Initialize browser: start the Firefox WebDriver instance that is passed to
# every scraping function and exploration cell in this notebook
browser = Firefox()

#### Collect video list

In [13]:
def collect_videos(browser, db):
    """Collect the video list from a YouTube search and save it to the ``videos`` collection.

    Opens a fixed search-results URL, scrolls down ten times so that more
    results get loaded, then stores one document (``video_id``, ``title``,
    ``url``) per result link.

    Args:
        browser: Selenium WebDriver; ``get``, ``execute_script`` and
            ``find_elements_by_id`` are used.
        db: database object; documents are inserted into ``db['videos']``.

    Returns:
        None. The number of stored videos is printed.
    """
    # Function-scope import so this cell works without touching the import cell.
    from urllib.parse import parse_qs, urlparse

    search_url = 'https://www.youtube.com/results?search_query=kpop+music+video&sp=CAMSBggEEAEYAQ%253D%253D'
    browser.get(search_url)
    time.sleep(2)

    # Each scroll to the bottom is followed by a pause before the next one.
    for _ in range(10):
        browser.execute_script("window.scrollTo(0, window.scrollY + 200000)")
        time.sleep(3)

    # Get video title & URL
    links = browser.find_elements_by_id('video-title')
    videos = [(link.text, link.get_attribute('href')) for link in links]

    # Store to 'videos' collection
    coll = db['videos']
    stored = 0
    for title, video_url in videos:
        if not video_url:
            # Elements without an href cannot be turned into a video document.
            continue

        # Prefer the explicit ``v`` query parameter so trailing parameters
        # (e.g. ``&t=10s``) do not end up as the video id; fall back to the
        # previous "text after the last '='" rule when there is no ``v``.
        ids = parse_qs(urlparse(video_url).query).get('v')
        video_id = ids[0] if ids else video_url.split('=')[-1]

        coll.insert_one({
            'video_id': video_id,
            'title': title,
            'url': video_url
        })
        stored += 1

    print(f'{stored} videos were collected.')
    

In [4]:
# Load the saved list of video URLs. header=None: the file has no header row,
# so its single column is named 'URL' explicitly afterwards.
df = pd.read_csv('./video_list_0627.txt', header=None)
df.columns=['URL']

In [5]:
# De-duplicate the URLs; the cell's last expression shows how many distinct
# videos will be scraped
urls = df['URL'].unique()
len(urls)

90

In [6]:
def collect_video_detail(browser, db, urls, start_point=0):
    """Collect title and publish date for each video and save them to ``video_detail``.

    Args:
        browser: Selenium WebDriver used to load every watch page.
        db: database object; one document per processed URL is inserted
            into ``db['video_detail']``.
        urls: iterable of video page URLs.
        start_point: number of leading URLs to skip. The index printed in the
            progress log can be passed here to resume an interrupted run.

    Returns:
        None. One progress line is printed per processed video.
    """
    # Function-scope import so this cell works without touching the import cell.
    from urllib.parse import parse_qs, urlparse

    coll = db['video_detail']

    for i, url in enumerate(urls):
        if i < start_point:
            continue

        browser.get(url)
        time.sleep(2)

        # Scroll a bit and wait again before reading the page elements.
        browser.execute_script("window.scrollTo(0, window.scrollY + 720)")
        time.sleep(2)

        title = browser.find_element_by_css_selector('h1.title').text
        published = browser.find_element_by_css_selector('span.date').text

        # Prefer the explicit ``v`` query parameter so extra parameters
        # (e.g. ``&list=...``) are not mistaken for the video id; fall back to
        # the previous "text after the last '='" rule when there is no ``v``.
        ids = parse_qs(urlparse(url).query).get('v')
        video_id = ids[0] if ids else url.split('=')[-1]

        coll.insert_one({
            'video_id': video_id,
            'published': published,
            'title': title,
            'url': url
        })

        print(f'{datetime.now()} {i} - {title} has been processed.')


In [7]:
# Scrape details for every URL, starting at the first one (start_point
# defaults to 0); one progress line is printed per video
collect_video_detail(browser, db, urls)

2019-06-27 11:14:53.588305 0 - IZ*ONE (아이즈원) - 라비앙로즈 (La Vie en Rose) MV has been processed.
2019-06-27 11:14:59.833460 1 - 헤이즈 (Heize) - SHE'S FINE MV has been processed.
2019-06-27 11:15:05.852989 2 - VAV - 'THRILLA KILLA' Music Video has been processed.
2019-06-27 11:15:11.929031 3 - [MV] 이기광(LEE GIKWANG) - Don't Close Your Eyes (D.C.Y.E) (Feat. Kid Milli) has been processed.
2019-06-27 11:15:17.802685 4 - [MV] Gummy(거미) _ Alone(혼자) has been processed.
2019-06-27 11:15:23.579124 5 - Yerin Baek "Maybe It's Not Our Fault(그건 아마 우리의 잘못은 아닐 거야)" M/V has been processed.
2019-06-27 11:15:29.472796 6 - EVERGLOW (에버글로우) - 봉봉쇼콜라 (Bon Bon Chocolat) MV has been processed.
2019-06-27 11:15:35.319619 7 - [MV] 415(사이로) _ Take me there has been processed.
2019-06-27 11:15:41.123431 8 - 로꼬 (Loco) - 'NOTHING' Official Music Video (ENG/CHN) has been processed.
2019-06-27 11:15:47.004082 9 - [MV] 100%(백퍼센트) _ Still Loving You has been processed.
2019-06-27 11:15:53.193569 10 - [MV] MAMAMOO(마마무) _ gogob

2019-06-27 11:25:29.140290 89 - 청하 (CHUNG HA) - "벌써 12시 (Gotta Go)" Music Video has been processed.


In [43]:
# NOTE(review): title, published and view_count are not assigned at top level
# in any cell visible here; presumably left over from an interactive session,
# in which case this cell fails on a fresh kernel - TODO confirm / remove.
print(title, published, view_count)

TWICE "FANCY" M/V Published on Apr 22, 2019 136641354


In [45]:
# Text of an element with id 'content-text' on the page currently loaded in
# the browser (the same id get_comments() uses to locate comment bodies)
browser.find_element_by_id('content-text').text

'Real ONCE won’t stop streaming after 100M 😎'

In [14]:
# Run the search-result scrape; the function prints how many videos it stored
collect_videos(browser, db)

217 videos were collected.


#### Get more information for a video

In [55]:
# Sample video page for the exploration cells below.
# NOTE(review): the following cell assigns `url` again, so on a top-to-bottom
# run this value is overwritten.
url = 'https://www.youtube.com/watch?v=FIInyEWWW-s'

In [46]:
# NOTE(review): overrides the `url` assigned in the previous cell; the
# execution counts (In [46] here, In [55] above) indicate the two cells were
# run out of order - TODO keep only one sample URL.
url = 'https://www.youtube.com/watch?v=kZOZ5nJ5lI0'

In [65]:
# Open the sample page, then scroll down 320px and wait; presumably so that
# the parts of the page queried below are rendered first - TODO confirm
browser.get(url)
time.sleep(1)
browser.execute_script("window.scrollTo(0, window.scrollY + 320)")
time.sleep(2)

In [66]:
# Published date, as the raw text displayed on the page
# (e.g. 'Published on Jun 10, 2019'); not parsed into a date here
browser.find_element_by_css_selector('span.date').text

'Published on Jun 10, 2019'

In [67]:
# Number of comments, as displayed text (e.g. '21,467 Comments');
# not converted to an integer here
browser.find_element_by_class_name('count-text').text

'21,467 Comments'

In [68]:
# Number of likes: locate the element that carries the like count; the count
# itself is read from its aria-label attribute in the next cell
ele = browser.find_element_by_css_selector('ytd-toggle-button-renderer.style-text[is-icon-button] #text.ytd-toggle-button-renderer')

In [69]:
# The aria-label holds the like count as text (e.g. '290,421 likes')
ele.get_attribute('aria-label')

'290,421 likes'

#### Get comments for video

In [81]:
def get_comments(browser, db, url, scrolls=20):
    """Get the comments shown for a video and store them in the ``comments`` collection.

    Args:
        browser: Selenium WebDriver; ``get``, ``execute_script`` and
            ``find_elements_by_id`` are used.
        db: database object; documents are written to ``db['comments']``.
        url: watch-page URL of the video.
        scrolls: how many times to scroll down (one second apart) before the
            comment elements are read. Defaults to 20, the previously
            hard-coded value.

    Returns:
        None. A line with the video id and the number of stored comments is
        printed.
    """
    # Function-scope import so this cell works without touching the import cell.
    from urllib.parse import parse_qs, urlparse

    browser.get(url)
    time.sleep(2)

    for _ in range(scrolls):
        browser.execute_script("window.scrollTo(0, window.scrollY + 720)")
        time.sleep(1)

    # Prefer the explicit ``v`` query parameter so extra parameters
    # (e.g. ``&t=5s``) are not mistaken for the video id; fall back to the
    # previous "text after the last '='" rule when there is no ``v``.
    ids = parse_qs(urlparse(url).query).get('v')
    video_id = ids[0] if ids else url.split('=')[-1]

    docs = [
        {'video_id': video_id, 'comment': element.text}
        for element in browser.find_elements_by_id('content-text')
    ]

    # One batched write instead of one round trip per comment. The guard is
    # required because insert_many rejects an empty document list.
    if docs:
        db['comments'].insert_many(docs)

    print(f'{datetime.now()} - {video_id} : {len(docs)} stored.')

In [82]:
# Save comments to the database: one get_comments() call per video URL, run
# sequentially. Note that the loop rebinds the notebook-level name `url`.
for url in urls:
    get_comments(browser, db, url)

2019-06-26 11:38:42.617385 - YBnGBb1wg98 : 100 stored.
2019-06-26 11:39:12.316967 - 2S24-y0Ij3Y : 100 stored.
2019-06-26 11:39:42.390209 - XsX3ATc3FbA : 100 stored.
2019-06-26 11:40:11.913858 - 5rPluw_-Eb4 : 104 stored.
2019-06-26 11:40:40.368089 - oDJ4ct59NC4 : 100 stored.
2019-06-26 11:41:10.322508 - kOHB85vDuow : 100 stored.
2019-06-26 11:41:40.044269 - K1scjjbfNsk : 100 stored.
2019-06-26 11:42:08.817342 - b73BI9eUkjM : 100 stored.
2019-06-26 11:42:38.690764 - pSudEWBAYRE : 100 stored.
2019-06-26 11:43:08.195759 - HwT9oKqfUxY : 100 stored.
2019-06-26 11:43:37.477980 - GQqyCeKf8rw : 100 stored.
2019-06-26 11:44:06.861535 - M46FRJsB0Qw : 100 stored.
2019-06-26 11:44:35.454319 - pNfTK39k55U : 100 stored.
2019-06-26 11:45:05.516296 - mAKsZ26SabQ : 100 stored.
2019-06-26 11:45:34.441049 - ESVsbCkFvG4 : 100 stored.
2019-06-26 11:46:05.734792 - Dab4EENTW5I : 120 stored.
2019-06-26 11:46:34.632684 - RyVS7R9PN6U : 100 stored.
2019-06-26 11:47:03.408192 - Fm5iP0S1z9w : 100 stored.
2019-06-26

2019-06-26 12:51:34.142215 - JrOrlhjIYVk : 120 stored.
2019-06-26 12:52:03.689354 - mRwxB2AW5HA : 120 stored.
2019-06-26 12:52:30.016911 - QAXa3zpmoLY : 65 stored.
2019-06-26 12:52:58.628354 - Z7bsJf5lPT0 : 100 stored.
2019-06-26 12:53:29.098516 - K6gEaJTHN4M : 140 stored.
2019-06-26 12:53:59.594770 - Z3W0jKcv1SU : 120 stored.
2019-06-26 12:54:29.663619 - l5i8YoFBvO4 : 140 stored.
2019-06-26 12:54:58.730775 - 2EVu6tVZUgc : 100 stored.
2019-06-26 12:55:29.026677 - EzlGdWpVoRM : 120 stored.
2019-06-26 12:55:59.747659 - T7IXKQzmpK4 : 120 stored.
2019-06-26 12:56:30.717990 - m7Ct6cR_NS4 : 120 stored.
2019-06-26 12:57:01.144823 - ALz4b6x0A70 : 120 stored.
2019-06-26 12:57:30.107370 - 4VRrQf_BSxQ : 100 stored.
2019-06-26 12:57:55.063981 - HAdw1e728Js : 26 stored.
2019-06-26 12:58:24.808799 - GrBu7a4gfX8 : 120 stored.
2019-06-26 12:58:52.945141 - 7wANyBLd5VM : 120 stored.
2019-06-26 12:59:23.295161 - wYm3hZC1xB0 : 140 stored.
2019-06-26 12:59:55.493320 - qTV56N-EaqM : 140 stored.
2019-06-26 1

#### Get number of views

In [4]:
def get_view_count(browser, url):
    """Load a video page and return its displayed view count with a timestamp.

    Args:
        browser: Selenium WebDriver; ``get`` and
            ``find_element_by_css_selector`` are used.
        url: watch-page URL of the video.

    Returns:
        dict with two keys:
            ``view_count``: the decimal digits of the ``span.view-count``
                text as an int, or ``None`` when the text contains no digits
                (previously this case raised ``ValueError`` from ``int('')``).
            ``timestamp``: ``datetime.now()`` taken after the page-load wait,
                just before the element is read.
    """
    browser.get(url)
    time.sleep(3)
    now = datetime.now()
    text = browser.find_element_by_css_selector('span.view-count').text

    # Keep ASCII digits only: str.isdigit() also accepts characters such as
    # superscripts, which int() cannot parse.
    digits = ''.join(ch for ch in text if ch in '0123456789')

    return {
        'view_count': int(digits) if digits else None,
        'timestamp': now,
    }

In [13]:
# Get the video list from the 'video_detail' collection
# (empty filter = every document)
coll = db['video_detail']
cur = coll.find({})

In [14]:
# Drain the cursor into an in-memory list; later cells shuffle it and iterate
# over it repeatedly
videos = list(cur)

In [15]:
# Number of video documents loaded from the database
len(videos)

264

In [78]:
# Shuffle, in place, the order in which the videos will be visited.
# NOTE(review): no random seed is set in the visible cells, so the order
# differs between runs.
random.shuffle(videos) 

In [79]:
# Inspect one document to check which fields are available
videos[0]

{'_id': ObjectId('5d1285238771f921313fee5d'),
 'video_id': 'itBh9J6RlXM',
 'title': 'Musik-A Kpop Contest 2019 | Pink Bullets | Whistle - Blackpink',
 'url': 'https://www.youtube.com/watch?v=itBh9J6RlXM'}

In [6]:
# Collect number of views: visit every video, store one sample per video, then
# wait 10 minutes and start the next round. Runs until the kernel is
# interrupted (KeyboardInterrupt is not caught below, so it still stops the loop).
coll = db['view_count']
while True:
    for video in videos:
        print(video['title'])
        try:
            count = get_view_count(browser, video['url'])
        except Exception as exc:
            # A single page that fails to load or parse must not end the
            # long-running collection; report it and move on to the next video.
            print(f"{datetime.now()} - skipped {video['url']}: {exc!r}")
            continue
        coll.insert_one({
            'title': video['title'],
            'view_count': count['view_count'],
            'timestamp': count['timestamp']
        })
    time.sleep(600)


Audiofreq - Rewind (Official Video) 7664
Ully Moch & Ifan Seventeen - HUN (Official Music Video NAGASWARA) #music 327731
OLD TOWN ROAD Kids Dance Music Video 42069
Ayu Ting Ting x Keremcem - Apalah Cinta (Official Music Video) 7368888
Squeeze by Fille ft Voltage Music (Official Video) 11617
DAVIS - Alright (Official Music Video) 775
MC Klinton - SHQIPE TRAPKING prod.OGRruga (Official Video HD) 6563
Hez Hazmi - Biar Aku Pergi (Official Music Video) 92296
VRKINGS - SNAKEBITE FT. HONEY CRUX | BHAGUMIA | OFFICIAL MUSIC VIDEO | LATEST HINDI RAP | 2019 942
Hard Driver - Young Birds (Official Video) 52255
Julz West - On My Own (Dance Music Video) 253
Sule & Baby Shima - Terpisah Jarak Dan Waktu (Official Music Video NAGASWARA) #music 1613066
ALBUM SONIYA VE Official || ADITYA SINGH || Music Video Song || 2019 album 2219
Agung Pradanta - Kuat Ati ( Official Music Video ) 3752
BANNED KPOP Music Videos for Stupid Reasons 503555
MOUSTO CAMARA | Syli De Guinée | 🇬🇳Official Video 2019 | By Dj.IKK 3

In [None]:
# End the whole WebDriver session rather than only closing the current window,
# so that no browser/driver process is left behind after the notebook finishes.
browser.quit()