# Collect YouTube Dislikes Dataset

This notebook contains code to collect the YouTube dislikes dataset using the YouTube Data API v3.

First, the data is accumulated as a list of lists and saved to disk with pickle; then the pickle files are read and combined into a single pandas DataFrame.

In [1]:
import pickle
import pandas as pd

# API client library
import googleapiclient.discovery

# Collect dataset using youtube API

## Setting API

In [2]:
# API information
API_SERVICE_NAME = "youtube"
API_VERSION = "v3"
DEVELOPER_KEY_PATH = 'api_keys/API_key.txt'

# Read the API key from a local file so that it is not hardcoded in the notebook.
try:
    with open(DEVELOPER_KEY_PATH, 'r') as f:
        # strip surrounding whitespace (e.g. the trailing newline of the file),
        # which is not part of the key
        DEVELOPER_KEY = f.read().strip()
except OSError:
    print(f"Could not open/read file: {DEVELOPER_KEY_PATH}.")
    print("Specify the path to file, containing your API key.")
    print("To know more, visit:")
    print("https://developers.google.com/youtube/v3/quickstart/python#step_1_set_up_your_project_and_credentials")
    # stop here with the real cause instead of failing later on an undefined name
    raise

# OR specify the key directly
# DEVELOPER_KEY = "YOUR_KEY"  # something like 'aAAsaah3yf4fgsg5htfj3hb_fdudfd8ew0f-d0d'
    
# API client
youtube = googleapiclient.discovery.build(API_SERVICE_NAME,
                                          API_VERSION,
                                          developerKey=DEVELOPER_KEY)

In [3]:
def get_video_info_by_id(video_id, youtube, return_list=False):
    """
    Get information about a video using YouTube Data API v3.

    Parse the following information:
        video_id (str) - video id
        title (str) - video title
        description (str) - video description
        channelId (str) - channel id
        channelTitle (str) - channel title
        publishedAt (str) - video publication date
        tags (str) - tags if author specified, else None
        viewCount (int) - number of views, None if hidden
        likeCount (int) - number of likes, None if hidden
        dislikeCount (int) - number of dislikes, None if hidden
        commentCount (int) - number of comments
        comments (str) - up to 20 video comments (ordered by relevance)

    Parameters:
        video_id (str): YouTube video id
        youtube: googleapiclient.discovery.build object
        return_list (bool): If True, tags and comments are returned as lists of string
                            If False - as one concatenated (space-separated) string

    Returns:
        list, containing all items in the same order OR None in cases:
            if the request returns no items for video_id,
            if commentCount is missing (comments are turned off),
            or if the comments request raises an exception
    """

    def _int_or_none(stats, key):
        # statistics fields can be hidden; a missing key becomes None
        return int(stats[key]) if key in stats else None

    # youtube request for information about video
    response = youtube.videos().list(
        part="snippet, statistics",
        id=video_id).execute()

    if not response['items']:
        # empty list - video isn't available anymore
        return None

    item = response['items'][0]
    snippet = item['snippet']
    # a missing 'statistics' section is treated like all counters being hidden
    statistics = item.get('statistics', {})

    # public fields that are always expected in the snippet
    publishedAt = snippet['publishedAt']
    channelId = snippet['channelId']
    title = snippet['title']
    description = snippet['description']
    channelTitle = snippet['channelTitle']

    # next fields can be hidden
    viewCount = _int_or_none(statistics, 'viewCount')
    likeCount = _int_or_none(statistics, 'likeCount')
    # is private from 13 December 2021
    dislikeCount = _int_or_none(statistics, 'dislikeCount')

    # tags are unavailable if author didn't specify them
    if 'tags' in snippet:
        tags_list = snippet['tags']
        tags = tags_list if return_list else ' '.join(tags_list)
    else:
        tags = None

    # if commentCount is unavailable, comments are turned off
    if 'commentCount' not in statistics:
        return None
    commentCount = int(statistics['commentCount'])

    if commentCount == 0:
        comment_texts = []
    else:
        # youtube request for comments information
        try:
            comment_response = youtube.commentThreads().list(
                part="snippet",
                maxResults=20,
                order="relevance",
                textFormat="plainText",
                videoId=video_id).execute()
        except Exception:
            # comments are unavailable
            # NOTE: this deliberately broad handler also hides any other
            # failure of the comments request
            return None

        comment_texts = [thread['snippet']['topLevelComment']['snippet']['textDisplay']
                         for thread in comment_response['items']]

    # honour return_list for every case, including videos with zero comments
    # (previously a list was returned even when a string was requested)
    comments = comment_texts if return_list else ' '.join(comment_texts)

    return [video_id, title, description, channelId, channelTitle,
            publishedAt, tags, viewCount, likeCount, dislikeCount,
            commentCount, comments]

In [4]:
def save_data(data, filename):
    """
    Save data to file using pickle.

    Parameters:
        data: any picklable object
        filename (str): path of the file to write (overwritten if it exists)
    """

    # the context manager closes (and flushes) the file even if pickling fails
    with open(filename, "wb") as file:
        pickle.dump(data, file)

## Read ids

In [5]:
# read ids collected using 'collect_unique_video_ids()' function
# (the file is read as text and split into one list entry per line)
with open('video_IDs/unique_ids_from_kaggle.txt', 'r') as f:
    ids = f.read().splitlines()

# preview the first 10 ids
ids[:10]

['s9FH4rDMvds',
 'jbGRowa5tIk',
 '3EfkCrXKZNs',
 'gBjox7vn3-g',
 'npoUGx7UW7o',
 'Vu6PNpYKu2U',
 'ly8jXKq_9AE',
 'QAUqqcEU0Xc',
 'eA4FRvf6vdM',
 '8f70QZQB4UA']

In [6]:
def request_loop(video_ids, filename, youtube, save_iter=False):
    """
    Iterate over video_ids and execute `get_video_info_by_id()` function
    Save data to filename_{}.p file using pickle.

    The loop stops at the first id whose request raises an exception; whatever
    was collected up to that point is still saved to `{filename}_{n}_final.p`.

    Parameters:
        video_ids (list of strings): list of YouTube video ids
        filename (str): path to file to save data using pickle
        youtube: googleapiclient.discovery.build object
        save_iter (bool/int): If an integer, save data every save_iter iterations

    Returns:
        number of successful iterations (int): the index of the id that raised
        an exception, or len(video_ids) if every id was processed
    """
    import re  # local import: only needed to scrub the API key from error text

    youtube_data = []
    total = len(video_ids)
    if not video_ids:
        # empty list
        return 0

    # assume every id succeeds; lowered to the failing index on an exception
    successful = total

    for counter, video_id in enumerate(video_ids):
        try:
            curr = get_video_info_by_id(video_id, youtube)
        except Exception as e:
            # the error text can contain the request URL including the API key,
            # so mask the key before it ends up in the notebook output
            message = re.sub(r'([?&]key=)[^&\s]+', r'\1REDACTED', str(e))
            print(message + '\n')
            print("This is most likely Http Error 403 due to exceeded quota")
            print("To know more visit:" +
                  " https://developers.google.com/youtube/v3/getting-started#quota")
            successful = counter
            break

        # append all data in the list (None results are kept and filtered later)
        youtube_data.append(curr)

        # save data every save_iter iterations
        if save_iter and counter != 0 and counter % save_iter == 0:
            save_data(youtube_data, f"{filename}_{counter+1}.p")
            print(f'\tSaved in "{filename}_{counter+1}.p"')

        print(f"{counter+1}/{total}: collect information about {video_id}")

    if youtube_data:
        # if list is not empty
        # save data in the end of the loop or if exception occurs
        save_data(youtube_data, f"{filename}_{counter+1}_final.p")
        print(f'\tSaved in "{filename}_{counter+1}_final.p"')

    return successful

## Specify IDs

In [7]:
# select the first 10 ids to request and show how many were selected
video_ids = ids[:10]
print(len(video_ids))
video_ids

10


['s9FH4rDMvds',
 'jbGRowa5tIk',
 '3EfkCrXKZNs',
 'gBjox7vn3-g',
 'npoUGx7UW7o',
 'Vu6PNpYKu2U',
 'ly8jXKq_9AE',
 'QAUqqcEU0Xc',
 'eA4FRvf6vdM',
 '8f70QZQB4UA']

## Main request loop

In [8]:
for i in range(5):
    # 5 tries to get data; each try writes files prefixed with its own number
    successful_iterations = request_loop(video_ids,
                                         f"data/{i+1}_youtube_data",
                                         youtube, save_iter=False)

    # continue after the ids handled in this try and
    # skip one more video, that raised exception
    # in case an unknown exception occurred
    video_ids = video_ids[successful_iterations+1:]

1/10: collect information about s9FH4rDMvds
<HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2C+statistics&id=jbGRowa5tIk&key=REDACTED&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

This is most likely Http Error 403 due to exceeded quota
To know more visit: https://developers.google.com/youtube/v3/getting-started#quota
	Saved in f"data/1_youtube_data_2_final.p"
<HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2C+statistics&id=3EfkCrXKZNs&key=REDACTED&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/

# Convert dataset to dataframe

The following code reads the pickled lists of lists (obtained with the YouTube Data API), converts them to pandas DataFrames, concatenates them, and cleans the result.

In [1]:
import glob
import pickle
import string
import pandas as pd

In [2]:
def process_one_list(filename):
    """
    Read 'filename' using pickle and convert list of lists to a pd.Dataframe

    Empty/None records are removed, rows with any missing value are dropped,
    counters are cast to int32 and 'publishedAt' to timezone-naive
    datetime64[ns] (UTC wall time).

    Parameters:
        filename (str): path to a pickle file with a list of 12-item records

    Returns:
        pd.DataFrame with one row per complete record
    """

    # NOTE: pickle.load can execute arbitrary code - only load files
    # that were produced by this notebook
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    data = list(filter(None, data))  # delete empty list elements

    df = pd.DataFrame(data, columns=['video_id', 'title', 'description', 'channelId', 'channelTitle',
                                     'publishedAt', 'tags',  'viewCount', 'likeCount', 'dislikeCount',
                                     'commentCount', 'comments'])

    # drop NaN values and convert columns to required data types
    df = df.dropna().astype({'viewCount': 'int32',
                             'likeCount': 'int32',
                             'dislikeCount': 'int32',
                             'commentCount': 'int32'})

    # unit-less 'datetime64' is rejected by recent pandas versions, so parse the
    # timestamps explicitly: interpret them as UTC, then drop the timezone
    published = pd.to_datetime(df['publishedAt'], utc=True).dt.tz_localize(None)
    df = df.assign(publishedAt=published.astype('datetime64[ns]'))

    return df

In [3]:
def process_lists(filenames):
    """
    Build one dataframe out of several pickle files.

    Every path in 'filenames' is converted with `process_one_list()`;
    the resulting frames are stacked in the given order and the index
    is renumbered from zero.
    """

    frames = [process_one_list(path) for path in filenames]
    combined = pd.concat(frames)

    return combined.reset_index(drop=True)

In [4]:
# In a `good_data` folder, I accumulated all the files that I managed to collect
# glob returns paths in arbitrary order, so sort them to make the
# concatenation order (and therefore the result) reproducible
pickle_files = sorted(glob.glob('good_data/*'))
pickle_files

['good_data\\0_youtube_data_10_12_3496_final.p',
 'good_data\\0_youtube_data_10_12_4520_final.p',
 'good_data\\11_youtube_data_10_12_2_final.p',
 'good_data\\12_youtube_data_10_12_2_final.p',
 'good_data\\13_12_youtube_data_5102_final.p',
 'good_data\\13_12_youtube_data_5172_final.p',
 'good_data\\13_12_youtube_data_5290_final.p',
 'good_data\\13_youtube_data_10_12_4_final.p',
 'good_data\\16_youtube_data_10_12_2_final.p',
 'good_data\\18_youtube_data_10_12_3_final.p',
 'good_data\\1_youtube_data_11_12_1456_final.p',
 'good_data\\1_youtube_data_11_12_1884_final.p',
 'good_data\\1_youtube_data_11_12_190_final.p',
 'good_data\\1_youtube_data_11_12_190_finddal.p',
 'good_data\\1_youtube_data_11_12_195_final.p',
 'good_data\\1_youtube_data_11_12_198_final.p',
 'good_data\\1_youtube_data_11_12_284_final.p',
 'good_data\\1_youtube_data_11_12_356_final.p',
 'good_data\\1_youtube_data_11_12_4136_final.p',
 'good_data\\1_youtube_data_11_12_417_final.p',
 'good_data\\1_youtube_data_11_12_420_fin

In [5]:
# read every collected pickle file and concatenate them into one dataframe
raw_data = process_lists(pickle_files)
raw_data

Unnamed: 0,video_id,title,description,channelId,channelTitle,publishedAt,tags,viewCount,likeCount,dislikeCount,commentCount,comments
0,Ne4S6glyGec,XXXTENTACION NFT Collection - The Drop,YellowHeart presents: The XXXTENTACION NFT Col...,UCM9r1xn6s30OnlJWb-jc3Sw,XXXTENTACION,2021-05-10 15:51:56,NFT XXXTentacion YellowHeart YellowHeartNFT Bl...,1249548,155559,2909,18121,"LLJ 🕊 ""That bullet killed one person physicall..."
1,4Z6P_2Gyv98,"Version 1.5 ""Beneath the Light of Jadeite"" Tra...",Moonlight falls on the branches as a dragon's ...,UCiS882YPwZt1NfaM0gR0D9Q,Genshin Impact,2021-04-16 12:50:29,Amber amber vtuber genshi genshi game genshi i...,4918271,209673,800,18652,What a time to be playing this for the FIRST t...
2,3lfBDZaMqM4,Haunted Theories with Shane Dawson,NEXT PART - https://www.youtube.com/watch?v=CY...,UCV9_KinVpV-snHe3C3n1hvA,shane,2021-10-14 21:01:34,shane dawson journalism investigative journali...,3423228,183545,17372,18264,PART 3 UP NOW!!! Grab a meal and turn off the ...
3,Fop2oskTug8,iPhone 13 Unboxing & Impressions!,Unboxing every iPhone 13 model and some honest...,UCBJycsmduvYEL83R_U4JriQ,Marques Brownlee,2021-09-21 13:01:01,iPhone 13 iPhone 13 unboxing iPhone 13 review ...,7798485,261306,4773,18326,"OK ""recycled phone design"" got me I don't thin..."
4,ylZq4w90L8U,Israel Adesanya and Paulo Costa meet on the be...,VISIT: http://www.themaclife.com\nFOLLOW: http...,UC789h3eqw0H1HqGmIsI26OA,TheMacLife,2020-09-24 09:41:06,Conor McGregor UFC Dana White MMAFighting MMAJ...,2488203,41374,1221,15644,Izzy got so mad when he talked about his earri...
...,...,...,...,...,...,...,...,...,...,...,...,...
49774,YzD6NS1VSjg,I Drained an Ocean Monument in Minecraft Hardc...,Today I am going to be Draining an Ocean Monum...,UCRlEFn0L2G_DktbyvN0AZ5A,WadZee,2021-03-21 15:05:12,WadZee Draining an Ocean Monument in Minecraft...,9713893,386482,4781,29613,i can't even build a house and this dude drain...
49775,VR2o9OSyD3I,JE FAIS UN RECORD DU MONDE SUR CETTE OUVERTURE...,Hello guys I love you ! (au cas ou y'a des ang...,UCo3i0nUzZjjLuM7VjAVz4zA,Michou,2021-05-02 09:15:00,michou Michou pokemon pokemon display pokemon ...,8289410,644094,5353,31383,INCROYABLE 🔥Encore merci pour ce moment et cet...
49776,Xb6blQtSA8A,TommyInnit makes Minecraft 100000% Funnier..,sub pls : http://bit.ly/TxtGm8\nTwitch : http:...,UCS5Oz6CHmeoF7vSad0qqXfw,DanTDM,2021-03-06 18:00:17,dantdm dan tdm gaming video games minecraft funny,6405442,425338,3366,33799,"Dan: ""I'm actually doing alright!""\nDan: dies ..."
49777,hH-sYuRyZRU,we broke up,thanks in advance for the love. a tough pill t...,UCMk0u-pViW-Ff4F9SAGcI4g,Mike Majlak Vlogs,2020-10-30 20:06:10,logan paul jake paul mike majlak david dobrik ...,4869305,304724,10557,29438,"damn man, I can tell this was a hard video for..."


In [6]:
# raw data contains 49,779 rows

In [7]:
def clean_data(dataset):
    """
    Clean YouTube Dislike dataset:
        - Keep records only with unique video IDs
          with the maximum number of views (the latest request)
        - Delete non-ASCII and non-English characters from text columns
        - Drop videos whose title or channel title has no Latin letter left
        - Rename columns to snake case and reorder them
        - Delete non-relevant columns (only the 12 listed columns are kept)
        - Reset index

    Parameters:
        dataset (pd.DataFrame): raw frame with the columns produced by
                                `process_one_list()`; it is not modified

    Returns:
        pd.DataFrame: cleaned copy with a fresh 0..n-1 index
    """

    # work on a copy with a fresh unique index: idxmax() returns index labels,
    # so duplicated labels in the input would select several rows per video
    clean_df = dataset.reset_index(drop=True)

    # primary_key - unique 'video_id' with the largest number of views
    primary_key = clean_df.groupby("video_id")["viewCount"].idxmax()
    # keep only most relevant records
    clean_df = clean_df.loc[primary_key]

    # characters that are kept; a set makes the membership test constant time
    printable = frozenset(
        '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n')
    # delete non-ascii characters
    for text_column in ['title', 'channelTitle', 'description', 'tags', 'comments']:
        # for all rows in the column keep only characters from 'printable'
        clean_df[text_column] = clean_df[text_column].apply(
            lambda text: ''.join(char for char in text if char in printable))

    # if there is not a single letter left in the title of the video
    # the video is definitely not in English
    latin_letter = '[a-zA-Z]'
    clean_df = clean_df[clean_df['title'].str.contains(latin_letter)]
    # explicit copy so the assignments below modify this frame, not a view
    clean_df = clean_df[clean_df['channelTitle'].str.contains(latin_letter)].copy()

    # after deleting some characters, the description or tags could become empty
    clean_df.loc[clean_df['tags'] == '', 'tags'] = ' '
    clean_df.loc[clean_df['description'] == '', 'description'] = ' '

    # bring the dataset into the same shape of Kaggle dataset
    # rename columns
    clean_df = clean_df.rename(columns={'channelId': 'channel_id',
                                        'channelTitle': 'channel_title',
                                        'publishedAt': 'published_at',
                                        'viewCount': 'view_count',
                                        'likeCount': 'likes',
                                        'dislikeCount': 'dislikes',
                                        'commentCount': 'comment_count'})

    # and change the order
    clean_df = clean_df.reindex(columns=['video_id', 'title', 'channel_id', 'channel_title',
                                         'published_at', 'view_count', 'likes', 'dislikes',
                                         'comment_count', 'tags', 'description', 'comments'])

    return clean_df.reset_index(drop=True)

In [8]:
# deduplicate videos, strip non-ASCII text and rename/reorder the columns
dataset = clean_data(raw_data)
dataset

Unnamed: 0,video_id,title,channel_id,channel_title,published_at,view_count,likes,dislikes,comment_count,tags,description,comments
0,--0bCF-iK2E,Jadon Sancho Magical Skills & Goals,UC6UL29enLNe4mqwTfAyeNuw,Bundesliga,2021-07-01 10:00:00,1048888,19515,226,1319,football soccer ftbol alemn Bundesliga season ...,Enjoy the best skills and goals from Jadon San...,"Respect to Dortmund fans,must be sad losing hi..."
1,--14w5SOEUs,Migos - Avalanche (Official Video),UCGIelM2Dj3zza3xyV3pL3WQ,MigosVEVO,2021-06-10 16:00:00,15352638,359277,7479,18729,Migos Avalanche Quality Control Music/Motown R...,"Watch the the official video for Migos - ""Aval...",Migos just makes me want to live my live to th...
2,--40TEbZ9Is,Supporting Actress in a Comedy: 73rd Emmys,UClBKH8yZRcM4AsRjDVEdjMg,Television Academy,2021-09-20 01:03:32,925281,11212,401,831,,Hannah Waddingham wins the Emmy for Supporting...,Hannah's energy bursts through any screen. Wel...
3,--4tfbSyYDE,JO1'YOUNG (JO1 ver.)' PERFORMANCE VIDEO,UCsmXiDP8S40uBeJYxvyulmA,JO1,2021-03-03 10:00:17,2641597,39131,441,3745,PRODUCE101JAPAN JO1 TheSTAR STA...,JO1'YOUNG (JO1 ver.)' PERFORMANCE VIDEO\n\n---...,youngVer>< REN is really PERFECT. It's not ju...
4,--DKkzWVh-E,Why Retaining Walls Collapse,UCMOqf8ab-42UUQIdVoKwjlQ,Practical Engineering,2021-12-07 13:00:00,715724,32887,367,1067,retaining wall New Jersey highway Direct Conne...,One of the most important (and innocuous) part...,Keep up with all my projects here: https://pr...
...,...,...,...,...,...,...,...,...,...,...,...,...
37417,zzd4ydafGR0,Lil Tjay - Calling My Phone (feat. 6LACK) [Off...,UCEB4a5o_6KfjxHwNMnmj54Q,Lil Tjay,2021-02-12 05:03:49,120408275,2180780,35871,81360,Lil Tjay Steady Calling My Phone Calling My Ph...,"Official video for ""Calling My Phone"" by Lil T...",'DESTINED 2 WIN' OUT NOW !! https://liltjay.ln...
37418,zziBybeSAtw,PELICANS at LAKERS | FULL GAME HIGHLIGHTS | Ja...,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,2021-01-16 05:39:05,2841917,20759,1049,2624,NBA G League Basketball game-0022000187 Lakers...,PELICANS at LAKERS | FULL GAME HIGHLIGHTS | Ja...,Montrezl Harrell is going crazy with the rebou...
37419,zzk09ESX7e0,[MV] (MAMAMOO) - Where Are We Now,UCuhAUMLzJxlP1W7mEk0_6lA,MAMAMOO,2021-06-02 09:00:10,13346678,720854,4426,90616,MAMAMOO WAW WAW MAMAMOO WAW Where Are We Now...,[MV] (MAMAMOO) - Where Are We Now\n\nInstagra...,I honestly do not know why this song hit so ha...
37420,zzmQEb0Em5I,FELLIPE ESCUDERO- Master Podcast #12,UC8NjnNWMsRqq11NYvHAQb1g,Master Podcast,2020-10-20 20:59:30,252057,19198,1234,1471,master masterpodcast lord lord vinheteiro z z ...,DOCTOR HAIR\nhttps://www.thedoctorhair.com/?fb...,Foi um prazer passar esta tarde com vocs debat...


In [9]:
# summary of the cleaned dataset: column dtypes and non-null counts
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37422 entries, 0 to 37421
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   video_id       37422 non-null  object        
 1   title          37422 non-null  object        
 2   channel_id     37422 non-null  object        
 3   channel_title  37422 non-null  object        
 4   published_at   37422 non-null  datetime64[ns]
 5   view_count     37422 non-null  int32         
 6   likes          37422 non-null  int32         
 7   dislikes       37422 non-null  int32         
 8   comment_count  37422 non-null  int32         
 9   tags           37422 non-null  object        
 10  description    37422 non-null  object        
 11  comments       37422 non-null  object        
dtypes: datetime64[ns](1), int32(4), object(7)
memory usage: 2.9+ MB


In [10]:
# check that group by worked correctly:
# the number of unique video ids should equal the number of rows
len(dataset['video_id'].unique())

37422

In [11]:
# save dataset as CSV (with a header row, without the index column)
dataset.to_csv('data/youtube_dislike_dataset.csv', index=False, header=True)