In [None]:
import os

import pandas as pd
from googleapiclient.discovery import build

In [None]:
# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable so the
# credential never has to be pasted into (and saved with) the notebook.
# Falls back to the original placeholder string when the variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', 'Your API key goes here')

In [None]:
# Build the client for version 3 of the 'youtube' service, authenticated with the API key
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=api_key,
)

In [None]:
# Read the combined video datafile from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data/DataFile.csv')

In [None]:
def _lookup(item, *keys):
    """Follow ``keys`` through nested containers in ``item``; return None if a step is missing."""
    node = item
    for key in keys:
        try:
            node = node[key]
        except (KeyError, TypeError):
            # KeyError: the key is absent. TypeError: an intermediate value is not
            # subscriptable (e.g. None). Any other exception is left to propagate.
            return None
    return node


def extract_data(stats):
    """Parse the JSON data returned by the API call into a dictionary of column lists.

    Parameters
    ----------
    stats : dict
        Response whose ``'items'`` entry is an iterable of per-video records. Each
        record is read for ``id``, ``statistics.viewCount``, ``statistics.likeCount``,
        ``statistics.dislikeCount``, ``statistics.commentCount`` and
        ``contentDetails.duration``.

    Returns
    -------
    dict
        Keys ``'Video_ID'``, ``'Views'``, ``'Likes'``, ``'Dislikes'``,
        ``'Num_Comments'`` and ``'Duration'`` (in that order), each mapping to a list
        with one entry per item. A field that is missing from an item is stored as
        ``None`` so that all lists stay the same length.

    Raises
    ------
    KeyError
        If ``stats`` has no ``'items'`` entry.
    """
    # Output column -> path of keys to follow inside each item
    # NOTE(review): the public API reportedly no longer returns dislikeCount for
    # videos the caller does not own, so 'Dislikes' may be all None - confirm.
    fields = (
        ('Video_ID', ('id',)),
        ('Views', ('statistics', 'viewCount')),
        ('Likes', ('statistics', 'likeCount')),
        ('Dislikes', ('statistics', 'dislikeCount')),
        ('Num_Comments', ('statistics', 'commentCount')),
        ('Duration', ('contentDetails', 'duration')),
    )

    final_data = {column: [] for column, _ in fields}

    for stat in stats['items']:
        for column, path in fields:
            final_data[column].append(_lookup(stat, *path))

    # Return the column lists as a plain dict (the caller builds the DataFrame)
    return final_data

In [None]:
# Collect the statistics for every video in the datafile, querying the API in batches.
# Each batch becomes one DataFrame; they are concatenated once at the end rather than
# growing `video_data` inside the loop.
STATS_COLUMNS = ['Video_ID','Views','Likes','Dislikes','Num_Comments','Duration']
BATCH_SIZE = 50  # the max number of records allowed in a query

video_ids = df.Video_ID.tolist()
num_records = len(video_ids)

batches = []

# Walk the ids in slices of BATCH_SIZE; the last slice holds whatever is left over,
# so no separate "remaining records" pass is needed.
for start in range(0, num_records, BATCH_SIZE):
    sample = video_ids[start:start + BATCH_SIZE]

    # API call
    r = youtube.videos().list(part='contentDetails, statistics',id=sample)
    stats = r.execute()

    # Extract the data into a dictionary and transform it into a DataFrame
    batches.append(pd.DataFrame(extract_data(stats)))

    # Print the status of the process (count of ids actually queried so far)
    done = start + len(sample)
    print(f'{round((done/num_records) * 100,2)}% complete, {done}/{num_records}')

# Fold everything into the final DataFrame; fall back to an empty frame with the
# expected columns when the datafile contained no videos.
if batches:
    video_data = pd.concat(batches, ignore_index=True)
else:
    video_data = pd.DataFrame(columns=STATS_COLUMNS)

In [None]:
# Write the collected statistics to disk as UTF-8 CSV, leaving out the index column
video_data.to_csv(
    path_or_buf='Data/DataFile_Stats.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Load the statistics file that was just written back into a DataFrame
stats = pd.read_csv(filepath_or_buffer='Data/DataFile_Stats.csv')

In [None]:
# Display the re-loaded stats table as this cell's output so it can be checked for issues
stats

In [None]:
# Re-load the base video info file, replacing whatever `df` currently holds
df = pd.read_csv(filepath_or_buffer='Data/DataFile.csv')

In [None]:
# Join the video info with its statistics on the shared Video_ID column.
# This is an inner join (the pandas default), so only ids present in both frames are kept.
df = df.merge(stats, how='inner', on='Video_ID')

In [None]:
# Display the merged table as this cell's output to inspect the result of the join
df

In [None]:
# Write the merged table to disk as UTF-8 CSV, leaving out the index column
df.to_csv(
    path_or_buf='Data/Amalgamated_Data_File.csv',
    index=False,
    encoding='utf-8',
)