# Scrape YouTube Comments


#### This project demonstrates how to scrape YouTube comments for free, without using the Google API.

### GitHub Repo : [https://github.com/ahmedshahriar/youtube-comment-scraper](https://github.com/ahmedshahriar/youtube-comment-scraper) 

# Libraries & Configuration

In [1]:
# Mount Google Drive into the Colab runtime under /content/drive so that the
# notebook can work with files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Switch to the project folder on the mounted drive; relative paths used later
# in the notebook are resolved against this directory.
os.chdir('/content/drive/MyDrive/Colab Notebooks/yt_comment_stats')

In [3]:
"""
Configuration for the YouTube comment scraper.

The scraper downloads the comments of a YouTube video, up to COMMENT_LIMIT
comments per video. Change the defaults in the "set parameters" section below.

Variables :
COMMENT_LIMIT : How many comments you want to download per video
SORT_BY_POPULAR : sort selector for the most popular comments
SORT_BY_RECENT : sort selector for the most recently posted comments

download_comments() uses its sort_by value as an index into the sort menu
returned by YouTube and compares it with SORT_BY_POPULAR to decide whether a
sort request is needed, so the two selectors are not plain True/False flags.
"""

import pandas as pd
import json
import os
import sys
import re
import time

import requests

# pandas dataframe display configuration
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE(review): not referenced by any function visible in this notebook;
# ajax_request() builds its URL from the continuation endpoint instead.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_service_ajax'

# User-Agent header sent with every request of the scraping session
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
# csv file name (relative, so it is created in the current working directory)
FILE_NAME = 'ytb_comments.csv'

# set parameters
# Sort selectors: download_comments() uses sort_by as a position in the sort
# menu's subMenuItems and skips the sort request when it equals SORT_BY_POPULAR.
# NOTE(review): the upstream youtube-comment-downloader project appears to use
# 0 = popular and 1 = recent - confirm that the values below are intended.
SORT_BY_POPULAR = 1
# default selector passed as sort_by by download_comments()
SORT_BY_RECENT = 0
# set comment limit (maximum number of comments saved per video by main())
COMMENT_LIMIT = 500

# Captures the JSON object passed to ytcfg.set(...) in the page source
YT_CFG_RE = r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;'
# Captures the JSON object assigned to ytInitialData (window["ytInitialData"] or a bare variable)
YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'

## Utility Function

In [4]:
def regex_search(text, pattern, group=1, default=None):
    """Return one capture group of the first match of *pattern* in *text*.

    Args:
        text: String to search.
        pattern: Regular expression handed to ``re.search``.
        group: Group number (or name) to extract from the match.
        default: Value returned when the pattern does not match.

    Returns:
        The requested group of the first match, or *default* without a match.
    """
    found = re.search(pattern, text)
    if found is None:
        return default
    return found.group(group)


def ajax_request(session, endpoint, ytcfg, retries=5, sleep=20):
    """POST a continuation request to YouTube and return the decoded JSON.

    Args:
        session: Object with a requests-style ``post(url, params=, json=)``
            method (a ``requests.Session`` in this notebook).
        endpoint: Continuation endpoint dict; it must provide
            ``commandMetadata.webCommandMetadata.apiUrl`` and
            ``continuationCommand.token``.
        ytcfg: Page configuration dict providing ``INNERTUBE_CONTEXT`` and
            ``INNERTUBE_API_KEY``.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two failed attempts.

    Returns:
        The decoded JSON body of the first HTTP 200 response. An empty dict is
        returned when the server answers 403 or 413, or when every attempt
        failed, so callers can always test the result for truthiness.
    """
    url = 'https://www.youtube.com' + endpoint['commandMetadata']['webCommandMetadata']['apiUrl']
    params = {'key': ytcfg['INNERTUBE_API_KEY']}
    payload = {'context': ytcfg['INNERTUBE_CONTEXT'],
               'continuation': endpoint['continuationCommand']['token']}

    for attempt in range(retries):
        response = session.post(url, params=params, json=payload)
        if response.status_code == 200:
            return response.json()
        if response.status_code in (403, 413):
            # These statuses are not retried; report "no data" to the caller.
            return {}
        # Waiting only makes sense when another attempt follows.
        if attempt < retries - 1:
            time.sleep(sleep)

    # All attempts failed: return the same empty result as the 403/413 case
    # instead of falling off the end with an implicit None.
    return {}

def download_comments(YOUTUBE_VIDEO_URL, sort_by=SORT_BY_RECENT, language=None, sleep=0.1):
    """Yield the comments of a YouTube video as dicts, one per comment.

    The video page is fetched to extract the ``ytcfg`` configuration and the
    ``ytInitialData`` object. The continuation endpoint found there is then
    followed through ``ajax_request``; further continuations discovered in the
    responses are queued until none are left or a request returns no data.

    Args:
        YOUTUBE_VIDEO_URL: URL of the video page.
        sort_by: Position in the ``subMenuItems`` of the sort menu returned by
            the server. When it differs from ``SORT_BY_POPULAR`` the first
            response is only used to switch to that sort order.
        language: Optional value stored in the ``hl`` field of the client
            context sent with every request.
        sleep: Seconds to pause after each processed response.

    Yields:
        dict with the keys ``cid``, ``text``, ``time``, ``author``,
        ``channel``, ``votes``, ``photo`` and ``heart``.

    Raises:
        RuntimeError: If a response contains an ``externalErrorMessage`` or
            the sort menu has no entry at position ``sort_by``.

    The generator ends without yielding anything when the configuration cannot
    be extracted from the page or no comment continuation is found.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    response = session.get(YOUTUBE_VIDEO_URL)

    if 'uxe=' in response.request.url:
        # NOTE(review): presumably the request was redirected to a consent
        # page; set the consent cookie and fetch the video page again.
        session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
        response = session.get(YOUTUBE_VIDEO_URL)

    html = response.text

    # Fall back to an empty JSON object when a pattern does not match:
    # json.loads('') raises, which would bypass the guards below.
    ytcfg = json.loads(regex_search(html, YT_CFG_RE, default='{}'))
    if not ytcfg:
        return  # Unable to extract configuration
    if language:
        ytcfg['INNERTUBE_CONTEXT']['client']['hl'] = language

    data = json.loads(regex_search(html, YT_INITIAL_DATA_RE, default='{}'))

    section = next(search_dict(data, 'itemSectionRenderer'), None)
    renderer = next(search_dict(section, 'continuationItemRenderer'), None) if section else None
    if not renderer:
        # Comments disabled?
        return

    needs_sorting = sort_by != SORT_BY_POPULAR
    continuations = [renderer['continuationEndpoint']]
    while continuations:
        continuation = continuations.pop()
        response = ajax_request(session, continuation, ytcfg)

        if not response:
            break

        # Look the error up once instead of walking the response twice.
        error_message = next(search_dict(response, 'externalErrorMessage'), None)
        if error_message is not None:
            raise RuntimeError('Error returned from server: ' + error_message)

        if needs_sorting:
            # The first response is only used to find the requested sort
            # order; its endpoint replaces the pending continuations.
            sort_menu = next(search_dict(response, 'sortFilterSubMenuRenderer'), {}).get('subMenuItems', [])
            if sort_by < len(sort_menu):
                continuations = [sort_menu[sort_by]['serviceEndpoint']]
                needs_sorting = False
                continue
            raise RuntimeError('Failed to set sorting')

        actions = list(search_dict(response, 'reloadContinuationItemsCommand')) + \
                  list(search_dict(response, 'appendContinuationItemsAction'))
        for action in actions:
            for item in action.get('continuationItems', []):
                if action['targetId'] == 'comments-section':
                    # Process continuations for comments and replies.
                    continuations[:0] = list(search_dict(item, 'continuationEndpoint'))
                if action['targetId'].startswith('comment-replies-item') and 'continuationItemRenderer' in item:
                    # Process the 'Show more replies' button
                    continuations.append(next(search_dict(item, 'buttonRenderer'))['command'])

        for comment in reversed(list(search_dict(response, 'commentRenderer'))):
            yield {'cid': comment['commentId'],
                   'text': ''.join([c['text'] for c in comment['contentText'].get('runs', [])]),
                   'time': comment['publishedTimeText']['runs'][0]['text'],
                   'author': comment.get('authorText', {}).get('simpleText', ''),
                   'channel': comment['authorEndpoint']['browseEndpoint'].get('browseId', ''),
                   'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                   'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                   'heart': next(search_dict(comment, 'isHearted'), False)}

        time.sleep(sleep)

def search_dict(partial, search_key):
    """Yield every value stored under *search_key* in a nested structure.

    Dicts and lists are walked iteratively with an explicit stack. A value
    found under *search_key* is yielded as-is and not searched any further;
    all other dict values and list elements are queued for inspection.
    Anything that is neither a dict nor a list is ignored.

    Args:
        partial: Nested combination of dicts and lists (for example decoded
            JSON) to search.
        search_key: Dict key whose values are wanted.

    Yields:
        The values stored under *search_key*, in the order the stack-based
        traversal reaches them.
    """
    pending = [partial]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for name, child in node.items():
            if name != search_key:
                pending.append(child)
                continue
            yield child

In [5]:
def main(url):
    """
    Download the comments of one video, show them and save them to a csv file.

    Up to COMMENT_LIMIT comments are collected from download_comments(),
    displayed as a dataframe and written to FILE_NAME. When the file already
    exists the comments are appended without repeating the header; otherwise a
    new file with a header row is created. Nothing is written when no comment
    was downloaded, so the csv never starts without a header.

    To preview each comment as json while downloading, uncomment the two
    "comments overview in json" lines inside the loop.

    On any error the message is printed and sys.exit(1) is called.

    Args:
        url: URL of the YouTube video.
    """
    try:
        youtube_url = url
        limit = COMMENT_LIMIT

        print('Downloading Youtube comments for video:', youtube_url)

        start_time = time.time()

        # Collect plain dicts and build the dataframe once afterwards;
        # DataFrame.append copied the whole frame for every comment and is no
        # longer available in pandas 2.x.
        rows = []
        for comment in download_comments(youtube_url):
            rows.append(comment)

            # comments overview in json
            # comment_json = json.dumps(comment, ensure_ascii=False)
            # print(comment_json)

            if limit and len(rows) >= limit:
                break

        df_comment = pd.DataFrame(rows)

        print("DataFrame Shape: ",df_comment.shape,"\nComment DataFrame: ")
        display(df_comment)

        if df_comment.empty:
            # Creating the file from an empty frame would leave it without a
            # header, and every later append skips the header as well.
            print('No comments downloaded, nothing written to', FILE_NAME)
        elif not os.path.isfile(FILE_NAME):
            df_comment.to_csv(FILE_NAME, encoding='utf-8', index=False)
        else:  # else it exists so append without writing the header
            df_comment.to_csv(FILE_NAME, mode='a', encoding='utf-8', index=False, header=False)

        print('\n[{:.2f} seconds] Done!'.format(time.time() - start_time))

    except Exception as e:
        print('Error:', str(e))
        # NOTE(review): SystemExit also stops a surrounding loop over several
        # videos after the first failure - confirm this is intended.
        sys.exit(1)

# Save Comments

## Single Video Link

In [7]:
# dumping youtube comments

"""
Dump the comments of a single video to a csv file
"""
youtube_URL = 'https://www.youtube.com/watch?v=gu3FfmgkwUc'
main(youtube_URL)

Downloading Youtube comments for video: https://www.youtube.com/watch?v=gu3FfmgkwUc
Error: Failed to set sorting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Load the saved comments and show the (rows, columns) shape.
# NOTE(review): this reads './data/ytb_comments.csv', while main() writes to
# FILE_NAME in the current working directory - confirm the intended path.
df_comment = pd.read_csv('./data/ytb_comments.csv')
df_comment.shape

(60, 8)

## List of Links

In [6]:
# From Noha Kegan "How To make Money" List
# https://www.youtube.com/playlist?list=PLLb8lCtSZDWdM9djLiSW0SS3tB-gi4FKS

ytb_video_list = ['https://www.youtube.com/watch?v=_HaUBjkMf4A',
                  'https://www.youtube.com/watch?v=kDZw6ir22Wc',
                  'https://www.youtube.com/watch?v=gMGr6OALkug',
                  'https://www.youtube.com/watch?v=07_DgJYlljA',
                  'https://www.youtube.com/watch?v=9S1wgUlfABE',
                  'https://www.youtube.com/watch?v=4I1ymhJxGkg',
                  'https://www.youtube.com/watch?v=8shoPKlB3zI',
                  'https://www.youtube.com/watch?v=t0-4v3YWAmQ',
                  'https://www.youtube.com/watch?v=X4qPpJOIJMg',
                  'https://www.youtube.com/watch?v=qa9ki_Lz8r0',
                  'https://www.youtube.com/watch?v=2qosDS83uXE',
                  'https://www.youtube.com/watch?v=-SK6gbhuEXQ',
                  'https://www.youtube.com/watch?v=zI6i7iKrQ3U',
                  'https://www.youtube.com/watch?v=2yftgFzNhg0',
                  'https://www.youtube.com/watch?v=2N8-qDMpGAA',
                  'https://www.youtube.com/watch?v=rPB8tBib-iA']

# Scrape the videos one after another; main() saves every video's comments to
# the same csv file (FILE_NAME).
for video_link in ytb_video_list:
    main(video_link)

Downloading Youtube comments for video: https://www.youtube.com/watch?v=_HaUBjkMf4A
Error: Failed to set sorting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
"""
Dump comments to a csv from a list of video links
"""
# Short youtu.be links and links carrying a playlist parameter are passed to
# main() unchanged.
ytb_video_list = ['https://youtu.be/0sOvCWFmrtA',
                  'https://www.youtube.com/watch?v=TuIgtitqJho',
                  'https://www.youtube.com/watch?v=hinZO--TEk4',
                  'https://youtu.be/q6EoRBvdVPQ?list=PLFsQleAWXsj_4yDeebiIADdH5FMayBiJo']

for video_link in ytb_video_list:
    main(video_link)

Downloading Youtube comments for video: https://youtu.be/0sOvCWFmrtA
Error: Failed to set sorting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Load the saved comments and show the (rows, columns) shape.
# NOTE(review): this reads './data/ytb_comments.csv', while main() writes to
# FILE_NAME in the current working directory - confirm the intended path.
df_comment = pd.read_csv('./data/ytb_comments.csv')
df_comment.shape

(140, 8)

## Links From DataFrame

In [None]:
"""
Dump comments to a csv by reading the video links from another csv file

Example -
Create a csv with one column titled 'link'
a sample is given below

'ytb_video_list.csv'

link
https://www.youtube.com/watch?v=-t_uhBBDbA4
https://www.youtube.com/watch?v=75vjjRza7IU
https://www.youtube.com/watch?v=j6dmaPzOBHY
https://www.youtube.com/watch?v=Yj2efyQV1RI
https://www.youtube.com/watch?v=HV652F7U6Qs
https://www.youtube.com/watch?v=47iXEucg3eo
https://www.youtube.com/watch?v=ofHXBLEE3TQ
https://www.youtube.com/watch?v=X6lGqSfVRT8
https://www.youtube.com/watch?v=a_-z9FhGBrE
https://www.youtube.com/watch?v=wTUM_4cVlE4


"""

youtube_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/PRO_commentNLP/ytb_video_list.csv")

# Only the first three links are processed. main() is called for its side
# effects, so a plain loop is used instead of Series.map, which would build
# and display a Series of None values.
for video_link in youtube_data['link'][:3]:
    main(video_link)

Downloading Youtube comments for video: https://www.youtube.com/watch?v=_HaUBjkMf4A
Error: Failed to set sorting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Preview Output CSV

In [None]:
# Read the csv back from the current working directory, report its size and
# show the dataframe as the cell output.
df_comment = pd.read_csv('./ytb_comments.csv')

print(f"{df_comment.shape[0]} rows, {df_comment.shape[1]} columns")
df_comment

In [None]:
# (rows, columns) of the dataframe loaded in the previous cell
df_comment.shape

(60, 8)

## Reference

1. [https://github.com/egbertbouman/youtube-comment-downloader](https://github.com/egbertbouman/youtube-comment-downloader)