My original project was to compare the comment sections of both CinemaSins and CinemaWins.  However, I needed to make sure that I only compared the comment sections of videos in which both channels covered the same movie. This notebook is a summary of the code I used to identify those videos and then obtain the comments on them.

# Imports and Initializations

In [26]:
import os
import time
import warnings
from random import sample

import numpy as np
import pandas as pd
import regex as re
import requests
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Functions
 - The following table keeps track of the API methods and, for each one, the filters, parts, and other parameters I needed to know about.

|Method         |Id/Filters Available        |Parts Available              |Other Parameters                     |
|---------------|----------------------------|-----------------------------|-------------------------------------|
|channels       |categoryId, id              |               statistics, id|maxResults                           |
|comments       |parentId, id                |                           id|maxResults,textFormat                |
|commentThreads |allThreadsRelatedToChannelId, videoId|                  id|n/a                                  |
|guideCategories|regionCode, id              |                          n/a|n/a                                  |
|search         |videoCategoryId,relatedToVideoId|                      n/a|type, order, topicId, videoCategoryId|
|subscriptions  |channelId, id       |contentDetails, subscriberSnippet, id|forChannelId, maxResults, order      |
|videoCategories|regionCode, id              |                          n/a|n/a                                  |
|videos         |mostPopular, id             |  statistics, suggestions, id|n/a                                  |


 - Extra: CHANNEL PARTS: contentDetails, contentOwnerDetails,status, topicDetails
 - Extra: VIDEO PARTS: contentDetails, fileDetails,id, liveStreamingDetails, player,snippet,statistics, status, suggestions, topicDetails
 - Extra: SEARCH PARAMS: type, maxResults, channelId, order, publishedAfter/Before, q (aka query term), topicId, videoCaption, videoCategoryId, videoDuration
 - Search order: date, rating, relevance, title, videoCount, viewCount

In [2]:
# Used primarily to discover the Youtube API and figure out what works and doesn't.
params = {}
def Youtube_API(method, params = None):
    """Send a GET request to one method of the YouTube Data API v3.

    Parameters
    ----------
    method : str
        Path appended to the API base URL, e.g. 'search' or 'videos'.
    params : dict, optional
        Query parameters for the call.  The dict is copied before the API
        key is added, so the caller's dict is never modified.  When omitted,
        only the key is sent.

    Returns
    -------
    requests.Response
        The raw response object; callers decode it with ``.json()``.
    """
    endpoint = 'https://www.googleapis.com/youtube/v3/' + str(method)
    # Work on a copy: the previous version used a shared module-level dict as
    # the default and wrote the key into whatever dict the caller passed in.
    query = dict(params or {})
    # NOTE(review): the fallback key is committed to the notebook; rotate it and
    # drop the fallback once YOUTUBE_API_KEY is configured in the environment.
    query['key'] = os.environ.get('YOUTUBE_API_KEY',
                                  'AIzaSyCruFbHTolS_lK_AHrakQrjSVLzdEO5MaI')
    res = requests.get(url = endpoint,params = query)
    return res

In [4]:
# Used as a jumping off point for later
def obtain_video_comments_listed(VideoId, max_comments = 10000):
    """Download the top-level comment threads of one video as flat dicts.

    Parameters
    ----------
    VideoId : str
        YouTube video ID whose comment threads are requested.
    max_comments : int, optional
        Comment budget.  It is converted into a page budget of
        ``int(max_comments / 20)`` requests (20 is the page size this code
        assumes the API returns when ``maxResults`` is not sent).

    Returns
    -------
    list of dict
        One dict per comment thread with the keys ``comment_id``,
        ``replies``, ``author_id``, ``author_name``, ``likes``,
        ``published_time``, ``text`` and ``video_id``.  ``author_id`` is
        ``None`` when the API response carries no ``authorChannelId``.
    """
    endpoint = 'https://www.googleapis.com/youtube/v3/commentThreads'
    first_params = {                                         # Parameters of the first request
        'part':'id,snippet',
        'videoId':str(VideoId),
        # NOTE(review): the fallback key is committed to the notebook; rotate it
        # and drop the fallback once YOUTUBE_API_KEY is set in the environment.
        'key':os.environ.get('YOUTUBE_API_KEY',
                             'AIzaSyCruFbHTolS_lK_AHrakQrjSVLzdEO5MaI')
    }
    params = first_params

    list_of_items = []
    for page in range(int(max_comments/20)):                 # For the number of pulls
        try:
            payload = requests.get(url=endpoint,params=params).json()  # Parse the page once
        except (requests.RequestException, ValueError) as err:
            # Network failure or a non-JSON body: keep what was collected so far.
            print('Request failed (' + str(err) + ').  Returning available posts.')
            break

        # Take however many items the page holds; a short page no longer ends
        # the download while a next-page token is still available.
        list_of_items.extend(payload.get('items', []))

        next_page = payload.get('nextPageToken')
        if not next_page:                                    # Last page, or an error payload
            print('Limit likely hit.  Returning available posts.')
            break
        params = dict(first_params, pageToken=next_page)     # Parameters to pull the next page
        print("Page#:", page+1, "comments:", len(list_of_items), "Next:", next_page[8:18]) # Sanity Check

    #Turning list into dictionaries:
    list_of_dicts = []
    for thread in list_of_items:
        top_comment = thread['snippet']['topLevelComment']['snippet']
        list_of_dicts.append({
            "comment_id": thread['id'],
            "replies": thread['snippet']['totalReplyCount'],
            # authorChannelId is not guaranteed to be present; one such comment
            # used to raise KeyError and discard every downloaded comment.
            'author_id': top_comment.get('authorChannelId', {}).get('value'),
            'author_name': top_comment['authorDisplayName'],
            'likes': top_comment['likeCount'],
            'published_time': top_comment['publishedAt'],
            'text': top_comment['textOriginal'],
            'video_id': top_comment['videoId']
            })
    return list_of_dicts

In [6]:
# Definition of function for pulling (hopefully) all the videos for a channel
def obtain_videos(channelId):
    """Collect the video entries the search endpoint returns for a channel.

    Pages through ``Youtube_API('search', ...)`` with 50 results per page,
    keeps only items whose ``id.kind`` is ``'youtube#video'``, and stops when
    a response carries no ``nextPageToken`` or after 100 pages.

    Parameters
    ----------
    channelId : str
        ID of the channel to search.

    Returns
    -------
    list of dict
        The raw search-result items that are videos, in the order received.
    """
    video_list = []
    base_params = {
        'part':'snippet',
        'channelId': str(channelId),
        'maxResults': '50'}
    page_token = None
    for page in range(100):                                  # Pointlessly High Number
        # Fresh dict per request so nothing the helper adds leaks between pages.
        params = dict(base_params)
        if page_token:
            params['pageToken'] = str(page_token)
        try:
            payload = Youtube_API('search', params).json()   # Parse the page once
        except (requests.RequestException, ValueError) as err:
            # Network failure or a non-JSON body: return what was collected.
            print('Request failed (' + str(err) + '). Returning videos')
            break

        # Iterate over the items actually present; a page with fewer than 50
        # entries no longer aborts the remaining pages.
        for item in payload.get('items', []):
            if item['id']['kind'] == 'youtube#video':        # Removing non-videos
                video_list.append(item)

        page_token = payload.get('nextPageToken')
        if not page_token:                                   # Last page, or an error payload
            print('Limit hit! Returning videos')
            break
        print("Page#:", page+1)                              # Sanity Check
    return video_list

# Obtaining Comments for the Movies Covered by Both Channels

(w/s)ins_video_list - List of all videos and their info.  Raw.


(w/s)ins_titles     - List of the cleaned titles of all Wins/Sins videos, in the same order as the video list

(w/s)ins_tuples     - List of (position in video list, cleaned title) pairs for all Wins/Sins videos, in the same order as the video list
(w/s)ins_cypher     - Only the video-list positions of the videos whose movie is covered by both channels


In [8]:
# Pull the channel's video entries with obtain_videos (defined above).
# NOTE(review): presumably this is the CinemaWins channel ID, judging by the variable name -- confirm.
wins_video_list = obtain_videos('UCL8h3ri2WN_-IbviBlWtUcQ')

Page#: 1
Page#: 2
Page#: 3
Limit hit! Returning videos


In [9]:
# Pull the channel's video entries with obtain_videos (defined above).
# NOTE(review): presumably this is the CinemaSins channel ID, judging by the variable name -- confirm.
sins_video_list = obtain_videos('UCYUQQgogVeQY8cMQamhHJcg')

Page#: 1
Page#: 2
Page#: 3
Page#: 4
Page#: 5
Page#: 6
Page#: 7
Limit hit! Returning videos


In [15]:
# Random permutation of 0..29.
# NOTE(review): hiding_code is re-sampled at the top of the next cell before it is read,
# so this draw appears to be unused -- confirm before removing.
hiding_code = sample(range(0,30),30)

In [16]:
# Defining Wins Titles Tuples List
# Each entry is (position in wins_video_list, cleaned title).  Videos whose
# title does not start with 'E' get a placeholder name with a numeric suffix
# so that they can never match a title in the Sins list.
wins_tuples = []
hiding_code = sample(range(100,999),30)                     # Pool of distinct placeholder suffixes
hide_num = 1                                                # Next pool slot; the Sins cell keeps counting from here
for count, video in enumerate(wins_video_list):             # Count is the video number
    if video['snippet']['title'][0] == 'E':                 # If it's viable video
        title = video['snippet']['title']                   # Defining the title
        title = title.replace("Everything GREAT About ","") # Removing start of title
        title = title.strip('!')                            # Removing Exclamation Points at the ends
        wins_tuples.append((count,title))
    else:
        # The pool only holds 30 codes; grow it with a guaranteed-new value
        # instead of raising IndexError once the placeholders outnumber it.
        if hide_num >= len(hiding_code):
            hiding_code.append(max(hiding_code) + 1)
        wins_tuples.append((count,
                'Other Video' + str(hiding_code[hide_num])))
        hide_num += 1

In [17]:
# Defining Sins Titles Tuples List
# Each entry is (position in sins_video_list, cleaned title).
# NOTE: hiding_code and hide_num are carried over from the Wins cell on
# purpose, so placeholder names never repeat between the two lists.
sins_tuples = []
for count, video in enumerate(sins_video_list):
    if video['snippet']['title'][0] == 'E':                 # If it's viable video
        title = video['snippet']['title']                   # Defining the title
        title = title.replace("Everything Wrong With ","")  # Removing start of title
        title = title.replace(" Minutes Or Less","")        # Removing most of end of title
        # Removing a trailing " In ##".  Splitting on the last " In " and
        # requiring digits after it handles any number of digits and no longer
        # truncates titles that merely end in a word starting with "In".
        head, sep, tail = title.rpartition(' In ')
        if sep and tail.isdigit():
            title = head
        sins_tuples.append((count,title))
    else:
        # The pool only holds 30 codes; grow it with a guaranteed-new value
        # instead of raising IndexError once the placeholders outnumber it.
        if hide_num >= len(hiding_code):
            hiding_code.append(max(hiding_code) + 1)
        sins_tuples.append((count,
            'Other Video' + str(hiding_code[hide_num])))
        hide_num += 1

The following was used to check by eye that things were lining up and working out properly.

In [18]:
# Show, for each channel, the sorted titles that also appear in the other
# channel's list.  The membership sets are built once up front.
sins_name_set = {entry[1] for entry in sins_tuples}
wins_name_set = {entry[1] for entry in wins_tuples}
print(sorted(name for idx, name in wins_tuples if name in sins_name_set))
print(sorted(name for idx, name in sins_tuples if name in wins_name_set))

['After Earth', 'Ant-Man', 'Avengers: Age of Ultron', 'Baby Driver', 'Batman v Superman: Dawn of Justice', 'Big Hero 6', 'Black Panther', 'Captain America: Civil War', 'Captain America: The First Avenger', 'Captain America: The Winter Soldier', 'Coco', 'Deadpool', 'Deadpool 2', "Ender's Game", 'Finding Dory', 'Finding Nemo', 'Home Alone', 'Inside Out', 'Jumanji: Welcome to the Jungle', 'Jurassic World', 'Kingsman: The Golden Circle', 'Kung Fu Panda', 'Logan', 'Megamind', 'Moana', 'Pacific Rim', 'Ready Player One', 'Rogue One: A Star Wars Story', 'Spider-Man', 'Spider-Man 2', 'Spider-Man 3', 'Spider-Man: Homecoming', 'Star Trek Beyond', 'Star Trek Into Darkness', 'Star Wars: Episode VII - The Force Awakens', 'Suicide Squad', 'The Avengers', 'The Bourne Identity', 'The Dark Knight', 'The Dark Knight Rises', 'The Equalizer', 'The Hunger Games', 'The Incredibles', 'The Jungle Book', 'The Last Airbender', 'The Lego Movie', 'The Maze Runner', 'The Wolverine', 'Trolls', 'Warcraft', 'Warm Bodi

In [19]:
# Positions (first tuple element) of the videos whose cleaned title appears
# in the other channel's tuple list.
sins_name_lookup = {name for idx, name in sins_tuples}
wins_name_lookup = {name for idx, name in wins_tuples}
wins_cypher = [idx for idx, name in wins_tuples if name in sins_name_lookup]
sins_cypher = [idx for idx, name in sins_tuples if name in wins_name_lookup]
print(sorted(wins_cypher))
print(sorted(sins_cypher))

[0, 1, 2, 6, 7, 9, 11, 18, 20, 23, 24, 25, 26, 27, 28, 31, 32, 33, 39, 40, 41, 42, 43, 48, 57, 58, 60, 61, 65, 67, 72, 74, 77, 80, 81, 82, 86, 89, 90, 92, 93, 97, 98, 100, 104, 118, 120, 121, 124, 125, 130, 134, 140, 141, 142]
[3, 9, 11, 13, 23, 27, 29, 30, 36, 47, 58, 85, 88, 90, 99, 104, 107, 117, 121, 128, 130, 131, 132, 135, 149, 154, 161, 164, 170, 190, 198, 200, 220, 225, 228, 235, 237, 251, 253, 277, 287, 290, 291, 299, 300, 308, 309, 311, 314, 315, 323, 331, 345, 353, 359]


In [20]:
# Defining Wins Video Titles List
# One cleaned title per video entry; titles that do not start with 'E' are
# recorded as 'Other Video'.
wins_titles = []
for entry in wins_video_list:
    if entry['id']['kind'] != 'youtube#video':              # Skip anything that is not a video
        continue
    raw_title = entry['snippet']['title']
    if raw_title[0] != 'E':                                 # Not a viable title
        wins_titles.append('Other Video')
        continue
    # Drop the series prefix, then the '!' characters at either end.
    wins_titles.append(raw_title.replace("Everything GREAT About ","").strip('!'))

In [21]:
# Defining Sins Video Titles List
# One cleaned title per video entry; titles that do not start with 'E' are
# recorded as 'Other Video'.
sins_titles = []
for video in sins_video_list:
    if video['id']['kind'] == 'youtube#video':                          # If it's a video...
        if video['snippet']['title'][0] == 'E':                         # If it's viable
            title = video['snippet']['title']                           # Defining the title
            title = title.replace("Everything Wrong With ","")          # Removing start of title
            title = title.replace(" Minutes Or Less","")                # Removing most of end of title
            # Removing a trailing " In ##".  Splitting on the last " In " and
            # requiring digits after it handles any number of digits and no
            # longer truncates titles that merely end in a word starting with "In".
            head, sep, tail = title.rpartition(' In ')
            if sep and tail.isdigit():
                title = head
            sins_titles.append(title)
        else:
            sins_titles.append('Other Video')

The following two cells were used to make sure that all of the objects in each list were aligned.

In [None]:
# Sanity Check that the wins cypher list works
# For every shared position, print the matching entry from each Wins list so
# they can be compared by eye.
for count, cypher in enumerate(sorted(wins_cypher), start=1):
    print('cypher: ', wins_cypher[count - 1])
    print('count: ', count)
    print('                      ', wins_titles[cypher])
    print('                ', wins_tuples[cypher])
    print(wins_video_list[cypher]['snippet']['title'])
    print("----------------------------------------")

In [None]:
# Sanity Check that the sins cypher list works
# For every shared position, print the matching entry from each Sins list so
# they can be compared by eye.
for count, cypher in enumerate(sorted(sins_cypher), start=1):
    print('cypher: ', sins_cypher[count - 1])
    print('count: ', count)
    print('                     ', sins_titles[cypher])
    print('              ', sins_tuples[cypher])
    print(sins_video_list[cypher]['snippet']['title'])
#    print(sins_video_list[cypher]['id']['videoId'])
    print("----------------------------------------")

The following two cells actually collected the comments from all videos that we were looking for and printed out which video it was on at each step.  The outputs have been cleared for ease of reading but, for example, the first few lines of the following cell were:

The Avengers !!!

CWVaNzGpQI8

Page#: 1 comments: 20 Next: MVVvYTdDNE

Page#: 2 comments: 40 Next: MklIZ3gwWH

Page#: 3 comments: 60 Next: MUhuSUM2UX

Page#: 4 comments: 80 Next: ME9EVEJfdF

Page#: 5 comments: 100 Next: MFU5c1EtME

Page#: 6 comments: 120 Next: MzRBRjdtbU

In [None]:
# WINS with Sins!
# Download the top-level comment threads of every Wins video whose movie is
# also covered by Sins, then flatten them into wins_df (one row per thread).
list_of_items = []                                           # Instantiate list_of_items
endpoint = 'https://www.googleapis.com/youtube/v3/commentThreads'
for cypher in sorted(wins_cypher):
    VideoId = wins_video_list[cypher]['id']['videoId']
    first_params = {                                         # Instantiate first Parameters
    'part':'id,snippet',
    'videoId':str(VideoId),
    # NOTE(review): the fallback key is committed to the notebook; rotate it and
    # drop the fallback once YOUTUBE_API_KEY is set in the environment.
    'key':os.environ.get('YOUTUBE_API_KEY',
                         'AIzaSyCruFbHTolS_lK_AHrakQrjSVLzdEO5MaI')
    }
    params = first_params
    print(wins_titles[cypher],'!!!')
    print(VideoId)
    video_comment_count = 0                                  # Threads collected for this video
    for page in range(int(50000/20)):                        # For the number of pulls
        try:
            payload = requests.get(url=endpoint,params=params).json()  # Parse the page once
        except (requests.RequestException, ValueError) as err:
            # Network failure or a non-JSON body: move on with what we have.
            print('Request failed (' + str(err) + '). Returning available posts.')
            break

        # Take however many items the page holds; a short page no longer ends
        # the download while a next-page token is still available.
        page_items = payload.get('items', [])
        list_of_items.extend(page_items)
        video_comment_count += len(page_items)

        next_page = payload.get('nextPageToken')
        if not next_page:                                    # Last page, or an error payload
            print('Limit likely hit. Returning available posts.')
            break
        params = dict(first_params, pageToken=next_page)     # Parameters to pull the next page
        print("Page#:", page+1, "comments:", video_comment_count) # Sanity Check

list_of_dicts = []
for thread in list_of_items:
    top_comment = thread['snippet']['topLevelComment']['snippet']
    list_of_dicts.append({
        "comment_id": thread['id'],
        "replies": thread['snippet']['totalReplyCount'],
        # authorChannelId is not guaranteed to be present; one such comment
        # used to raise KeyError and abort the whole conversion.
        'author_id': top_comment.get('authorChannelId', {}).get('value'),
        'author_name': top_comment['authorDisplayName'],
        'likes': top_comment['likeCount'],
        'published_time': top_comment['publishedAt'],
        'text': top_comment['textOriginal'],
        'video_id': top_comment['videoId']
        })
wins_df = pd.DataFrame(list_of_dicts, columns = ['text', 'likes', 'replies', 'published_time',
                                                 'comment_id', 'author_id', 'author_name', 'video_id'])

In [None]:
# SINS with Wins!
# Download the top-level comment threads of every Sins video whose movie is
# also covered by Wins, then flatten them into sins_df (one row per thread).
list_of_items = []                                           # Instantiate list_of_items
endpoint = 'https://www.googleapis.com/youtube/v3/commentThreads'
# (The former `params = first_params` here only worked when the Wins cell had
# already run; params is set per video inside the loop, so it was dropped.)
for cypher in sorted(sins_cypher):
    VideoId = sins_video_list[cypher]['id']['videoId']
    first_params = {                                         # Instantiate first Parameters
    'part':'id,snippet',
    'videoId':str(VideoId),
    # NOTE(review): the fallback key is committed to the notebook; rotate it and
    # drop the fallback once YOUTUBE_API_KEY is set in the environment.
    'key':os.environ.get('YOUTUBE_API_KEY',
                         'AIzaSyCruFbHTolS_lK_AHrakQrjSVLzdEO5MaI')
    }
    params = first_params
    print(sins_titles[cypher],'!!!')
    print(VideoId)
    video_comment_count = 0                                  # Threads collected for this video
    for page in range(int(50000/20)):                        # For the number of pulls
        try:
            payload = requests.get(url=endpoint,params=params).json()  # Parse the page once
        except (requests.RequestException, ValueError) as err:
            # Network failure or a non-JSON body: move on with what we have.
            print('Request failed (' + str(err) + ').  Returning available posts.')
            break

        # Take however many items the page holds; a short page no longer ends
        # the download while a next-page token is still available.
        page_items = payload.get('items', [])
        list_of_items.extend(page_items)
        video_comment_count += len(page_items)

        next_page = payload.get('nextPageToken')
        if not next_page:                                    # Last page, or an error payload
            print('Limit likely hit.  Returning available posts.')
            break
        params = dict(first_params, pageToken=next_page)     # Parameters to pull the next page
        print("Page#:", page+1, "comments:", video_comment_count) # Sanity Check

list_of_dicts = []
for thread in list_of_items:
    top_comment = thread['snippet']['topLevelComment']['snippet']
    list_of_dicts.append({
        "comment_id": thread['id'],
        "replies": thread['snippet']['totalReplyCount'],
        # authorChannelId is not guaranteed to be present; one such comment
        # used to raise KeyError and abort the whole conversion.
        'author_id': top_comment.get('authorChannelId', {}).get('value'),
        'author_name': top_comment['authorDisplayName'],
        'likes': top_comment['likeCount'],
        'published_time': top_comment['publishedAt'],
        'text': top_comment['textOriginal'],
        'video_id': top_comment['videoId']
        })
sins_df = pd.DataFrame(list_of_dicts, columns = ['text', 'likes', 'replies', 'published_time',
                                                 'comment_id', 'author_id', 'author_name', 'video_id'])