In [1]:
from googleapiclient.discovery import build
from tqdm import tqdm
import pandas as pd
import os

## YouTube API service
First we need to create a service from which we can access the APIs and make requests

In [2]:
# creation of the YouTube Data API service
# The API key is a credential, so it is read from the environment instead of
# being hardcoded: a key written in a notebook cell leaks as soon as the
# notebook is shared or committed.
# NOTE(review): any key that was ever saved in this cell should be revoked
# and regenerated in the Google Cloud console.
api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        'Set the YOUTUBE_API_KEY environment variable to your YouTube Data API key.'
    )
api_version = 'v3'
service_name = 'youtube'

youtube_service = build(service_name, api_version, developerKey=api_key)

These two functions look up the channel ID of a video and then retrieve channel info such as:
- channel name
- channel description
- channel view count
- channel video count
- channel subscriber count

In [3]:
# function that finds channel id from video
def get_channel_id(service, video_id):
    """Return the ID of the channel that published a video.

    Args:
        service: API client object exposing ``videos().list(...).execute()``.
        video_id: ID of the video to look up.

    Returns:
        The ``snippet.channelId`` of the first item in the response, or
        ``None`` when the response contains no items.
    """
    request = service.videos().list(
        part='snippet',
        id=video_id
    )
    response = request.execute()

    # Treat a missing 'items' key the same way as an empty list, so that a
    # video that cannot be found never raises KeyError.
    items = response.get('items') or []
    if not items:
        return None

    return items[0]['snippet']['channelId']

In [4]:
# function that retrieves info about the channel
def get_channel_data(service, channel_id):
    """Fetch the name, description and statistics of a channel.

    Args:
        service: API client object exposing ``channels().list(...).execute()``.
        channel_id: ID of the channel to look up.

    Returns:
        A dict with the keys ``channelId``, ``title``, ``channelDescription``,
        ``viewCount``, ``subscriberCount`` and ``videoCount``, or ``None`` when
        the response contains no items. A statistics field that is absent
        from the response (for example a hidden subscriber count) is reported
        as ``None``.
    """
    request = service.channels().list(
        part = 'snippet, statistics',
        id = channel_id
    )
    response = request.execute()

    # 'items' can be missing altogether or present but empty; both mean
    # that no channel was returned.
    items = response.get('items') or []
    if not items:
        return None

    channel = items[0]
    snippet = channel['snippet']
    # Use .get() for the counters so that one channel with a hidden or
    # missing statistic does not abort a multi-thousand-row run with KeyError.
    statistics = channel.get('statistics', {})

    return {
        'channelId': channel_id,
        'title': snippet['title'],
        'channelDescription': snippet['description'],
        'viewCount': statistics.get('viewCount'),
        'subscriberCount': statistics.get('subscriberCount'),
        'videoCount': statistics.get('videoCount'),
    }

## Preprocessing
- read the .csv file (only ```Url_youtube``` and ```Channel``` columns)
- drop duplicated channels: the data we ask for will be the same every time
- substitute NaN values
- extract video ID from the URL

In [5]:
# load only the two columns this task needs: the video URL and the channel name
video_channel_df = pd.read_csv(
    '../data/Spotify_Youtube.csv',
    usecols=['Url_youtube', 'Channel'],
)

In [6]:
# One row per channel, with missing values replaced and the URL reduced to the video ID.
video_channel_df = (
    video_channel_df
    # keep the first row of every channel
    .drop_duplicates(subset='Channel')
    .reset_index(drop=True)
    # '_' placeholder: avoids str-vs-float errors when a value is NaN
    .fillna('_')
    # the video ID is whatever follows the last '?v=' in the URL
    .assign(Url_youtube=lambda frame: frame['Url_youtube'].map(lambda url: url.rpartition('?v=')[2]))
)

## API quota
The YouTube Data API has a default quota of 10,000 units per day; each `list` request used here costs 1 unit, so we can make at most 10k requests per day.\
After the ```drop_duplicates```, this dataset has 6715 rows. Since we need 2 requests per row (video and channel), we need to split the data into 2 parts (and run the following function on two separate days...).

In [7]:
# the first part gets the first half of the rows (rounded down), the second part gets the rest
half_rows = len(video_channel_df) // 2
first_half = video_channel_df.iloc[:half_rows]
second_half = video_channel_df.iloc[half_rows:]
video_channel_df_parts = [first_half, second_half]

## Save .csv part file
This function saves data about a part of the dataset.\
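In the code we record an ```error``` code for every row: ```0``` means no error, while ```1```, ```2``` and ```3``` mark the three types of anomalies: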
1. ```error = 0``` indicates no errors
2. ```error = 1``` indicates rows for which we don't have Youtube data in our dataset
3. ```error = 2``` indicates rows for which we have YouTube data, but the video has been removed (recall that this dataset refers to Feb '23)
4. ```error = 3``` indicates rows for which we have YouTube data, the video exists but is not associated with any channel

In [8]:
def save_csv_part(service, parts, part_id):
    """Fetch channel data for one partition of the dataset and save it as CSV.

    For every row of ``parts[part_id]`` the video ID in ``Url_youtube`` is
    resolved to a channel, the channel data is downloaded, and one output row
    is produced. Rows that cannot be resolved keep the ``'_'`` placeholder in
    every field and carry a non-zero ``error`` code:

    - 0: channel data retrieved
    - 1: the row has no YouTube data (``Url_youtube`` is ``'_'``)
    - 2: no channel ID could be found for the video
    - 3: a channel ID was found but no channel data was returned

    Args:
        service: API client passed on to ``get_channel_id`` and ``get_channel_data``.
        parts: sequence of DataFrames, each with a ``Url_youtube`` column.
        part_id: index into ``parts``; the output file name uses ``part_id + 1``.

    The result is written to ``../data/youtubeapi_channels_part<part_id + 1>.csv``;
    nothing is returned.
    """
    placeholder = '_'
    field_names = [
        'channelId',
        'title',
        'channelDescription',
        'viewCount',
        'subscriberCount',
        'videoCount',
    ]

    def placeholder_record(error_code):
        # every field is '_' and only the error code differs
        record = dict.fromkeys(field_names, placeholder)
        record['error'] = error_code
        return record

    def build_record(video_id):
        # no YouTube data for this row -> err 1
        if video_id == placeholder:
            return placeholder_record(1)

        # no channel id for the video -> err 2
        found_channel_id = get_channel_id(service, video_id)
        if found_channel_id is None:
            return placeholder_record(2)

        # channel id without channel data -> err 3
        channel_info = get_channel_data(service, found_channel_id)
        if channel_info is None:
            return placeholder_record(3)

        record = {name: channel_info[name] for name in field_names}
        record['error'] = 0
        return record

    partition = parts[part_id]
    records = [
        build_record(video_id)
        for video_id in tqdm(partition['Url_youtube'], total=partition.shape[0])
    ]

    # explicit column list keeps the header stable even for an empty partition
    channels = pd.DataFrame(records, columns=field_names + ['error'])
    channels.to_csv(f'../data/youtubeapi_channels_part{part_id + 1}.csv', index=False)

## Save the data
Run the first function call, then wait for the quota update to be able to run the second function call. See here for [quota details](https://developers.google.com/youtube/v3/determine_quota_cost).

Quota should reset at midnight Pacific Time, so 8 AM in Rome time zone.\
I can't find this specification in the YouTube Data API documentation, but I found [this error message](https://stackoverflow.com/questions/61512256/youtube-data-api-10-000-quota-reached-with-just-a-few-hundred-put-updates). Now the message only tells users that they have exceeded their quota, without mentioning when it will reset.

In [13]:
# first half of the dataset -> youtubeapi_channels_part1.csv
save_csv_part(service=youtube_service, parts=video_channel_df_parts, part_id=0)

100%|███████████████████████████████████████████████████████████████████████████████| 3357/3357 [05:00<00:00, 11.17it/s]


In [9]:
# second half of the dataset -> youtubeapi_channels_part2.csv
save_csv_part(service=youtube_service, parts=video_channel_df_parts, part_id=1)

100%|███████████████████████████████████████████████████████████████████████████████| 3358/3358 [04:48<00:00, 11.62it/s]


## Merge into one single file
Now that we have both .csv files we can concat them, save a single .csv file and delete the part files.

In [6]:
part1 = pd.read_csv('../data/youtubeapi_channels_part1.csv')
part2 = pd.read_csv('../data/youtubeapi_channels_part2.csv')

# Attach the original channel names by POSITION, not by index label.
# Assigning a Series to a column aligns on the index: the part files read back
# from CSV are labelled 0..n-1, but the second partition is an iloc slice that
# keeps its original labels (half_rows..N-1), so a direct assignment would
# leave almost every row of part2 as NaN. to_numpy() drops the index, and the
# assignment raises if the row counts do not match.
part1['originalChannel'] = video_channel_df_parts[0]['Channel'].to_numpy()
part2['originalChannel'] = video_channel_df_parts[1]['Channel'].to_numpy()

# ignore_index gives the merged frame a clean 0..N-1 index
complete_df = pd.concat([part1, part2], ignore_index=True)
complete_df.to_csv('../data/youtubeapi_channels_complete.csv', index=False)

In [None]:
# the part files are no longer needed once the complete file has been written
part_file_paths = (
    '../data/youtubeapi_channels_part1.csv',
    '../data/youtubeapi_channels_part2.csv',
)
for part_file_path in part_file_paths:
    if os.path.exists(part_file_path):
        os.remove(part_file_path)