In [48]:
#importing the libraries
import pandas as pd
import json
from googleapiclient.discovery import build

In [49]:
#service name and version that identify the api to build a client for
api_service_name = "youtube"
api_version = "v3"

#the key is stored in api_key.csv: first row of the 'api_key' column
key_csv = pd.read_csv('api_key.csv')
key = str(key_csv['api_key'][0])

#building the api client, authenticated with the developer key
youtube = build(api_service_name, api_version, developerKey=key)

In [50]:
#requesting the region list from the api (snippet part only)
request_region = youtube.i18nRegions().list(part="snippet")
#executing the api request
response_region = request_region.execute()
#serialising the raw response; write region_str to a file (e.g. 'Region_json.json')
#if a copy of the region data needs to be kept on disk
region_str = json.dumps(response_region)

#mapping every region code (snippet 'gl') to its region name (snippet 'name');
#despite the name this is a dict, the name is kept because later cells use it
region_list = {
    item['snippet']['gl']: item['snippet']['name']
    for item in response_region['items']
}
#region codes in the same order as the keys of the mapping above
region_code_list = list(region_list)

In [51]:
#building the category request: snippet part, categories of the 'US' region
request_category = youtube.videoCategories().list(part="snippet", regionCode='US')
#running the request and keeping the raw response
response_cat = request_category.execute()
#serialising the category response to a json string
category_str = json.dumps(response_cat)

In [52]:
#saving the serialised category data (not the region data) to a json file
with open('Category_json.json', 'w') as category_file:
    category_file.write(category_str)

In [53]:
#init the database list; it is rebuilt from scratch on every run of this cell
db = []
#iterating through every region code together with its region name
for region_code, region_name in region_list.items():
    #sending a request to the api for the most popular videos of the region
    #NOTE(review): hl asks for localised metadata, but no localised field is
    #read below - confirm whether "es_US" is still needed
    request = youtube.videos().list(
        part="snippet,contentDetails,statistics,status",
        chart="mostPopular",
        regionCode=region_code,
        maxResults=50,
        hl="es_US",
    )
    #executing the api request
    response = request.execute()

    #flattening every returned video into one record
    for video in response['items']:
        snippet = video['snippet']
        db.append({
            #running 1-based id across all regions; derived from the list size
            #so no global named `id` shadows the builtin id() any more
            'id': len(db) + 1,
            'video_name': snippet['title'],
            'channel_name': snippet['channelTitle'],
            'channel_id': snippet['channelId'],
            'date': snippet['publishedAt'],
            'duration': video['contentDetails']['duration'],
            'region_code': region_code,
            'region_name': region_name,
            'category': snippet['categoryId'],
            #kept as the raw statistics dict returned by the api
            'statistics': video['statistics'],
        })

In [54]:
#building the dataframe from the list of per-video records (one row per dict)
df = pd.DataFrame(db)

In [55]:
#displaying the first five rows of the dataframe
df.head(5)

Unnamed: 0,id,video_name,channel_name,channel_id,date,duration,region_code,region_name,category,statistics
0,1,ملخص مباراة البرتغال والأوروغواي - البرتغال ته...,beIN SPORTS,UCJUCcJUeh0Cz2xyKwkw5Q1w,2022-11-28T21:28:44Z,PT4M56S,AE,United Arab Emirates,17,"{'viewCount': '2581021', 'likeCount': '49702',..."
1,2,ITZY “Cheshire” M/V @ITZY,JYP Entertainment,UCaO6TYtlC8U5ttz62hTrZgg,2022-11-30T08:58:09Z,PT3M8S,AE,United Arab Emirates,10,"{'viewCount': '3375142', 'likeCount': '537367'..."
2,3,تحدي الثلاثين نسخة كأس العالم|كلاسيكو عبدالله ...,M12 ALFOUZAN - قدم الكرة,UCqe2V_eejw-b5SeOw63TQNw,2022-11-30T10:00:04Z,PT28M42S,AE,United Arab Emirates,17,"{'viewCount': '282450', 'likeCount': '44190', ..."
3,4,ملخص مباراة ويلز وإنكلترا | المنتخب الإنكليزي ...,beIN SPORTS,UCJUCcJUeh0Cz2xyKwkw5Q1w,2022-11-29T21:23:19Z,PT5M2S,AE,United Arab Emirates,17,"{'viewCount': '1388618', 'likeCount': '27587',..."
4,5,الكورة مع السلامة | الحلقة الثانية - كولومبيا 🇨🇴,Da7ee7 - الدحيح,UCqW7G8SmyeEeQYzLOk5tdSg,2022-11-29T18:01:24Z,PT26M50S,AE,United Arab Emirates,28,"{'viewCount': '372223', 'likeCount': '38760', ..."


In [56]:
#init a function to fetch the statistics of one channel from the api
def subsciber_count(text):
    """Return the ``statistics`` object of a single YouTube channel.

    Sends a ``channels().list`` request through the module-level ``youtube``
    client and returns the ``statistics`` entry of the first item in the
    response.

    Args:
        text: Channel id to look up; it is converted with ``str()`` before
            being passed as the ``id`` filter.

    Returns:
        The value stored under ``items[0]['statistics']`` in the response.

    Raises:
        KeyError: If the response has no ``'items'`` key, or its first item
            has no ``'statistics'`` key.
        IndexError: If ``'items'`` is an empty list.
    """
    #only the statistics part is read below, so snippet and contentDetails
    #are no longer requested
    request = youtube.channels().list(
        part="statistics",
        id=str(text)
    )
    #executing the api request
    response = request.execute()
    return response['items'][0]['statistics']

In [57]:
#collecting the distinct channel ids present in the dataframe
channel_id_list = list(set(df['channel_id']))
#fetching the stats of every distinct channel once (one api call per channel id)
channel_stats = {
    channel_id: subsciber_count(channel_id)
    for channel_id in channel_id_list
}
#attaching the stats dict to each row through its channel id
df['channel_stat'] = df['channel_id'].map(channel_stats)

In [58]:
#displaying the first five rows, now including the channel_stat column
df.head(n=5)

Unnamed: 0,id,video_name,channel_name,channel_id,date,duration,region_code,region_name,category,statistics,channel_stat
0,1,ملخص مباراة البرتغال والأوروغواي - البرتغال ته...,beIN SPORTS,UCJUCcJUeh0Cz2xyKwkw5Q1w,2022-11-28T21:28:44Z,PT4M56S,AE,United Arab Emirates,17,"{'viewCount': '2581021', 'likeCount': '49702',...","{'viewCount': '662059394', 'subscriberCount': ..."
1,2,ITZY “Cheshire” M/V @ITZY,JYP Entertainment,UCaO6TYtlC8U5ttz62hTrZgg,2022-11-30T08:58:09Z,PT3M8S,AE,United Arab Emirates,10,"{'viewCount': '3375142', 'likeCount': '537367'...","{'viewCount': '18109940685', 'subscriberCount'..."
2,3,تحدي الثلاثين نسخة كأس العالم|كلاسيكو عبدالله ...,M12 ALFOUZAN - قدم الكرة,UCqe2V_eejw-b5SeOw63TQNw,2022-11-30T10:00:04Z,PT28M42S,AE,United Arab Emirates,17,"{'viewCount': '282450', 'likeCount': '44190', ...","{'viewCount': '70894383', 'subscriberCount': '..."
3,4,ملخص مباراة ويلز وإنكلترا | المنتخب الإنكليزي ...,beIN SPORTS,UCJUCcJUeh0Cz2xyKwkw5Q1w,2022-11-29T21:23:19Z,PT5M2S,AE,United Arab Emirates,17,"{'viewCount': '1388618', 'likeCount': '27587',...","{'viewCount': '662059394', 'subscriberCount': ..."
4,5,الكورة مع السلامة | الحلقة الثانية - كولومبيا 🇨🇴,Da7ee7 - الدحيح,UCqW7G8SmyeEeQYzLOk5tdSg,2022-11-29T18:01:24Z,PT26M50S,AE,United Arab Emirates,28,"{'viewCount': '372223', 'likeCount': '38760', ...","{'viewCount': '34045688', 'subscriberCount': '..."


In [59]:
#writing the pre processed dataframe to disk without the index column
df.to_csv(path_or_buf='youtube_data_preprocessed.csv', index=False)