#### All imports specified here

In [1]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import json
from urllib import parse
import googleapiclient.discovery
import googleapiclient.errors
import ast
import os
import re

#### All global variables are declared here

In [2]:
# Root folder for the dataset files; the trailing slash matters because other
# cells build file names by plain string concatenation.
all_ds_path = "DS/"

# Input files stored directly inside the dataset folder.
wh_json_path = os.path.join(all_ds_path, "watch-history.json")
tags_html_tbl_path = os.path.join(all_ds_path, "topics.html")

# Folders that hold the per-batch API response CSVs.
vd_resp_path, cd_resp_path = "VideoDetailsResponses/", "ChannelDetailResponses/"

#### All function definitions are present here

In [None]:
def get_video_id_from_url(youtubeurl):
    """Return the value of the ``v`` query parameter of a YouTube URL.

    Args:
        youtubeurl: URL string such as ``https://www.youtube.com/watch?v=ID``.

    Returns:
        The first ``v`` value found in the query string.

    Raises:
        KeyError: If the URL carries no ``v`` query parameter.
    """
    query_string = parse.urlparse(youtubeurl).query
    query_params = parse.parse_qs(query_string)
    return query_params['v'][0]

def create_youtube_client(api_key):
    """Build a client for version ``v3`` of the ``youtube`` API service.

    Args:
        api_key: Developer key passed to the discovery builder.

    Returns:
        The service object returned by ``googleapiclient.discovery.build``.
    """
    return googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

def get_video_details(videoId):
    """Fetch the details of one video through the module-level ``youtube`` client.

    Args:
        videoId: Value sent as the ``id`` parameter of ``videos().list``.

    Returns:
        The ``items`` entry of the API response.
    """
    # NOTE(review): topicDetails is requested in `part` but is not listed in
    # `fields`, so it is presumably filtered out of the response - confirm
    # whether that is intended.
    requested_parts = "snippet,contentDetails,statistics,topicDetails,status"
    response_fields = "items(id,snippet(title,description,defaultLanguage,defaultAudioLanguage,channelId,channelTitle,tags,categoryId,liveBroadcastContent,publishedAt),contentDetails,statistics,status)"

    video_request = youtube.videos().list(
        part=requested_parts,
        id=videoId,
        fields=response_fields,
    )
    return video_request.execute()['items']

def get_channel_details(channelId):
    """Fetch the details of one channel through the module-level ``youtube`` client.

    Args:
        channelId: Value sent as the ``id`` parameter of ``channels().list``.

    Returns:
        The ``items`` entry of the API response.
    """
    # NOTE(review): topicDetails is requested in `part` but is not listed in
    # `fields`, so it is presumably filtered out of the response - confirm
    # whether that is intended.
    requested_parts = "snippet,topicDetails,statistics,brandingSettings,status"
    response_fields = "items(id,snippet(title,description,defaultLanguage,customUrl,publishedAt,country),brandingSettings/channel/keywords,statistics,status)"

    channel_request = youtube.channels().list(
        part=requested_parts,
        id=channelId,
        fields=response_fields,
    )
    return channel_request.execute()['items']

def get_all_categories_details(cats, region_code="IN"):
    """Fetch video category details through the module-level ``youtube`` client.

    Args:
        cats: Comma-separated category ids to look up. When empty (``""``,
            an empty sequence or ``None``) the categories of ``region_code``
            are listed instead.
        region_code: Region used for the listing when ``cats`` is empty.
            Defaults to ``"IN"``, the value that used to be hard-coded.

    Returns:
        The ``items`` entry of the API response.
    """
    query = {"part": "snippet", "fields": "items(id,snippet)"}
    # The original code sends exactly one of the two filters, never both.
    if cats:
        query["id"] = cats
    else:
        query["regionCode"] = region_code

    response = youtube.videoCategories().list(**query).execute()
    return response['items']

def get_all_language_details():
    """Fetch the ``i18nLanguages`` listing through the module-level ``youtube`` client.

    Returns:
        The ``items`` entry of the API response.
    """
    language_request = youtube.i18nLanguages().list(part="snippet", fields="items(id,snippet)")
    return language_request.execute()['items']

def get_all_region_details():
    """Fetch the ``i18nRegions`` listing through the module-level ``youtube`` client.

    Returns:
        The ``items`` entry of the API response.
    """
    region_request = youtube.i18nRegions().list(part="snippet", fields="items(id,snippet)")
    return region_request.execute()['items']

def flatten_json_to_series(json):
    """Flatten nested JSON into dotted column names and squeeze the result.

    Args:
        json: A dict or a list of dicts, e.g. the ``items`` list of an API
            response. The parameter name shadows the ``json`` module inside
            this function only; it is kept so existing callers still work.

    Returns:
        Whatever ``DataFrame.squeeze()`` yields for the normalized frame: a
        Series for a single record with several fields, a DataFrame when
        there are several records (or none at all).
    """
    try:
        # Public location since pandas 1.0; the pandas.io.json path is deprecated.
        normalize = pd.json_normalize
    except AttributeError:
        # Older pandas only ships the function under pandas.io.json.
        from pandas.io.json import json_normalize as normalize
    return normalize(json).squeeze()

def convert_csv_json_df(df,start):
    """Expand stringified API responses into one flattened row per record.

    Args:
        df: DataFrame with a column named ``c0`` whose cells hold the Python
            literal text of an API response (as written by ``to_csv``).
        start: Label given to the first output row; following rows are
            numbered consecutively from it.

    Returns:
        A DataFrame with one row per parsed response that produced data and
        one column per flattened field. Empty when ``df`` has no rows.
    """
    flattened = []
    for offset, row in enumerate(df.itertuples()):
        # literal_eval only accepts Python literals, so the stored text is
        # parsed without executing arbitrary code.
        record = ast.literal_eval(row.c0)
        column = pd.DataFrame(flatten_json_to_series(record))
        # The single column built from the squeezed Series is labelled 0;
        # relabel it with the row number this record belongs to.
        flattened.append(column.rename(columns={0: start + offset}))

    if not flattened:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copied every previously collected column on each iteration.
    return pd.concat(flattened, axis=1, sort=False).T

def prepare_csvs_details_list(fld_path):
    """List the response CSVs of a folder together with their row ranges.

    File names are expected to end in ``<start>:<end>_json.csv`` or
    ``<start>_<end>_json.csv`` (for example
    ``video_details_800_1600_json.csv``). Entries that do not follow that
    pattern are skipped instead of raising.

    Args:
        fld_path: Folder that contains the response CSV files.

    Returns:
        A list of ``[path, start, end]`` lists ordered by file modification
        time, where ``start`` and ``end`` are the digit strings taken from
        the file name.
    """
    range_pattern = re.compile(r"(\d+)[:_](\d+)_json\.csv$")

    paths = [os.path.join(fld_path, name) for name in os.listdir(fld_path)]
    paths.sort(key=os.path.getmtime)

    flist = []
    for path in paths:
        # Match on the file name only, so digits in the folder path cannot
        # be mistaken for the start of the range.
        found = range_pattern.search(os.path.basename(path))
        if found is None:
            continue
        flist.append([path, found.group(1), found.group(2)])

    return flist

def combine_csvs(final_df,wh_df,col_name,fld_path):
    """Join every saved response CSV back onto the rows it was fetched for.

    Args:
        final_df: Frame the combined batches are appended to (callers pass
            an empty DataFrame). It is not modified in place.
        wh_df: Source frame whose row slice ``[start:end]`` corresponds to
            the CSV named with that range.
        col_name: Column of each CSV that holds the stringified response.
        fld_path: Folder that contains the response CSV files.

    Returns:
        A new DataFrame with all batches stacked, the ``index`` column
        removed when present, sorted by index.
    """
    for csv_path, start_txt, end_txt in prepare_csvs_details_list(fld_path):
        print("processing ", csv_path, final_df.shape)
        start, end = int(start_txt), int(end_txt)
        wh_df_part = wh_df[start:end]

        responses = pd.read_csv(csv_path)[[col_name]].rename(columns={col_name: "c0"})
        details = convert_csv_json_df(responses, start)

        # Column-wise concat lines the flattened details up with the source
        # rows through their shared index labels.
        batch = pd.concat([wh_df_part, details], axis=1)
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        final_df = pd.concat([final_df, batch], sort=False)

    if 'index' in final_df.columns:
        final_df = final_df.drop(columns=['index'])

    return final_df.sort_index()

def assign_parent_tag(tags_df,tag_cat_indcs):
    """Attach to every tag row the name of the parent row that precedes it.

    The frame is modified in place and also returned.

    Args:
        tags_df: Frame whose column ``0`` holds the tag id (or, on parent
            rows, the parent name) and whose column ``1`` holds the tag name
            (null on parent rows).
        tag_cat_indcs: Index labels of the parent rows in ``tags_df``.

    Returns:
        ``tags_df`` with a new ``tag_parent`` column, the rows whose column
        ``1`` is null removed, and columns ``0``/``1`` renamed to
        ``tag_id``/``tag_name``.
    """
    # Work with index labels only. The previous loop read the parent by label
    # but wrote by position, which was only right when the index happened to
    # start at 1 and otherwise left the last row without a parent.
    parent_names = tags_df.loc[tag_cat_indcs, 0]
    # Put each parent name on its own row, then carry it down over the rows
    # that follow until the next parent row.
    tags_df['tag_parent'] = parent_names.reindex(tags_df.index).ffill()

    tags_df.drop(tags_df[tags_df[1].isnull()].index,inplace=True)
    tags_df.rename(columns={0:"tag_id",1:"tag_name"},inplace=True)
    return tags_df



#### Create youtube client using API Key

In [None]:
# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# real credential never has to be saved inside the notebook; the masked literal
# stays as the fallback, which keeps the previous behavior when it is unset.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "******************************")
# Module-level client used by all get_* helper functions.
youtube = create_youtube_client(API_KEY)

#### Read youtube watch history json file exported from google takeout

In [None]:
# Load the watch-history export located at wh_json_path into a dataframe.
json_df = pd.read_json(wh_json_path)
# json_df

#### Remove entries for removed videos, YouTube Music visits, and entries with empty subtitles (channel details are not available)

In [None]:
# Rows without a titleUrl or without subtitles are treated as unusable
# (per the original note: removed videos, YouTube Music visits, and entries
# whose channel details are not available).
title_url_null = json_df['titleUrl'].isnull()
subtitles_null = json_df['subtitles'].isnull()
is_unusable = title_url_null | subtitles_null

# Keep the rejected rows around for inspection and drop them by index label.
json_bad_df = json_df.loc[is_unusable]
json_clean_df = json_df.drop(index=json_bad_df.index)

#### Create usable dataframe of youtube watch history

In [None]:
# Keep only the watch time and the URL; reset_index() renumbers the rows and
# moves the previous labels into an 'index' column.
wh_df = json_clean_df.loc[:, ['time', 'titleUrl']].reset_index()
# Derive the video id from each watch URL.
wh_df['videoId'] = wh_df['titleUrl'].map(get_video_id_from_url)

## Getting All Video Details from Youtube API

#### Get video details of a part of whole video dataframe by hitting youtube video api. 
###  <font color='red'> Do not run the cell below if you already have the dataset: it downloads the data through the YouTube API, which has a cost associated with it. </font>

In [None]:
# Row range of the watch history handled by this run; edit before re-running.
df_start, df_end = 0, 800

# The range is encoded in the file name so the batches can be stitched back later.
vd_csv_path = "{}video_details_{}:{}_json.csv".format(vd_resp_path, df_start, df_end)

# One API request per video id in the selected slice.
wh_df_part = wh_df[df_start:df_end].copy()
wh_df_part['video_details'] = wh_df_part['videoId'].map(get_video_details)

wh_df_part.to_csv(vd_csv_path, index=False)

#### Combine all individual collected video details csv

In [None]:
# Stitch every saved video-details batch back onto the watch history,
# starting from an empty frame.
final_vd_df = combine_csvs(pd.DataFrame(), wh_df, "video_details", vd_resp_path)

#### Export all collected video details to csv

In [None]:
# Write the combined video details into the dataset folder, without the index column.
final_vd_df.to_csv(all_ds_path + "mohit_youtube_wh_ds.csv",index=False)

## Getting all Unique Channels Details from Youtube API

#### Get all unique channel ids from videos in final watch history dataframe

In [None]:
# One row per distinct channel id found in the video details.
# NOTE(review): unique() keeps a missing value as one of the entries if any
# video row has no channel id - confirm whether that row is wanted.
chnl_lst_df = pd.DataFrame({'channelId': final_vd_df['snippet.channelId'].unique()})

#### Get channel details for a part of the whole channel id dataframe by hitting the YouTube channels API
### <font color='red'> Do not run the cell below if you already have the dataset: it downloads the data through the YouTube API, which has a cost associated with it.</font>

In [None]:
# Row range of the channel list handled by this run; edit before re-running.
df_start, df_end = 4500, 5025

# The range is encoded in the file name so the batches can be stitched back later.
cd_csv_path = "{}channel_details_{}:{}_json.csv".format(cd_resp_path, df_start, df_end)

# One API request per channel id in the selected slice.
chnl_lst_df_part = chnl_lst_df[df_start:df_end].copy()
chnl_lst_df_part['channel_details'] = chnl_lst_df_part['channelId'].map(get_channel_details)

chnl_lst_df_part.to_csv(cd_csv_path, index=False)

In [None]:
# chnl_lst_df_part

#### Combine all individual collected channel details csv

In [None]:
# Stitch every saved channel-details batch back onto the channel list,
# starting from an empty frame, then export the result.
channel_final_df = combine_csvs(pd.DataFrame(), chnl_lst_df, "channel_details", cd_resp_path)

channel_details_csv = all_ds_path + "mohit_channel_details.csv"
channel_final_df.to_csv(channel_details_csv, index=False)

In [None]:
# channel_final_df

### Getting details about unique categories

#### Getting unique category ids from videos dataframe

In [None]:
# Distinct category ids present in the video details.
unq_cat_lst = list(final_vd_df['snippet.categoryId'].unique())
# Skip the entry that prints as 'nan' and convert the rest to integers.
results = [int(cat_id) for cat_id in unq_cat_lst if str(cat_id) != 'nan']
# Comma-separated, sorted and de-duplicated ids, ready for the API call.
cats_string = ",".join(str(cat_id) for cat_id in sorted(set(results)))

#### Getting information about category from category id using youtube api

In [None]:
# Look up only the category ids collected in cats_string.
cat_response = get_all_categories_details(cats_string)
# cat_response
# flatten_json_to_series squeezes its result, so cat_df is a DataFrame only
# when the response holds more than one category.
cat_df = flatten_json_to_series(cat_response)
cat_df.to_csv(all_ds_path + "mohit_all_categories.csv",index=False)

In [None]:
# An empty id string makes get_all_categories_details list the categories of
# its built-in region ("IN") instead of looking up specific ids.
# NOTE: this rebinds cat_response and cat_df, replacing the values produced
# from cats_string.
cat_response = get_all_categories_details("")
# cat_response
cat_df = flatten_json_to_series(cat_response)
cat_df.to_csv(all_ds_path + "all_in_categories.csv",index=False)

### Getting details about topics of a channel (topic ids)

#### Reading the static topic details listed at https://developers.google.com/youtube/v3/docs/channels#topicDetails.topicIds[]

In [None]:
# Take the first table found in the saved HTML page and discard the row
# labelled 0 (presumably the table's header row - verify against the file).
tags_df = pd.read_html(tags_html_tbl_path)[0].drop(0)

#### Getting indices of parent tags from tags dataframe

In [None]:
# Index labels of the rows whose second column is empty; these rows are used
# as the parent-tag rows in the next step.
tag_cat_indcs = tags_df.index[tags_df[1].isnull()].tolist()

#### Assign parent categories using info from dataframe itself

In [None]:
# assign_parent_tag edits tags_df in place and returns that same frame, so
# final_tags_df and tags_df refer to one object after this call.
final_tags_df = assign_parent_tag(tags_df,tag_cat_indcs)
final_tags_df.to_csv(all_ds_path + "tags.csv",index=False)

### Getting details about all youtube languages

In [None]:
# Fetch the language listing from the API, flatten it and export it.
lang_response = get_all_language_details()
# lang_response
# flatten_json_to_series squeezes its result, so lang_df is a DataFrame only
# when the response holds more than one item.
lang_df = flatten_json_to_series(lang_response)
lang_df.to_csv(all_ds_path + "lang_details.csv",index=False)

### Getting details about all youtube regions 

In [None]:
# Fetch the region listing from the API, flatten it and export it.
region_response = get_all_region_details()
# region_response
# flatten_json_to_series squeezes its result, so regions_df is a DataFrame
# only when the response holds more than one item.
regions_df = flatten_json_to_series(region_response)
regions_df.to_csv(all_ds_path + "region_details.csv",index=False)
# regions_df

## Start Analysis of Data
### Start from here if you already have the data

In [3]:
def _read_dataset(file_name):
    """Read one CSV file from the dataset folder into a dataframe."""
    return pd.read_csv(all_ds_path + file_name)


# Reload every dataset exported by the collection cells above.
final_vd_df = _read_dataset("mohit_youtube_wh_ds.csv")
channel_final_df = _read_dataset("mohit_channel_details.csv")
cat_df = _read_dataset("mohit_all_categories.csv")
final_tags_df = _read_dataset("tags.csv")
lang_df = _read_dataset("lang_details.csv")
regions_df = _read_dataset("region_details.csv")

In [None]:
# final_vd_df

In [4]:
# Columns left out of the distinct-value count.
excluded_columns = [
    'snippet.tags',
    'contentDetails.regionRestriction.allowed',
    'contentDetails.regionRestriction.blocked',
]

# Number of distinct values in each remaining column (shown as the cell output).
final_vd_df[final_vd_df.columns.difference(excluded_columns)].nunique()

contentDetails.caption                       2
contentDetails.contentRating.ytRating        1
contentDetails.definition                    2
contentDetails.dimension                     2
contentDetails.duration                   2359
contentDetails.licensedContent               2
contentDetails.projection                    2
id                                       14789
snippet.categoryId                          17
snippet.channelId                         5024
snippet.channelTitle                      5019
snippet.defaultAudioLanguage                34
snippet.defaultLanguage                     19
snippet.description                      13152
snippet.liveBroadcastContent                 2
snippet.publishedAt                      14444
snippet.title                            14745
statistics.commentCount                   5496
statistics.dislikeCount                   5806
statistics.favoriteCount                     1
statistics.likeCount                     10931
statistics.vi