# Youtube API Exploratory file

In [1]:
# import dependencies
import pandas as pd
from googleapiclient.discovery import build
from config import api_key
import numpy as np

In [2]:
youtube = build('youtube', 'v3', developerKey=api_key)

In [3]:
def channel_information_grabber(video_list, title):
    # create empty dataframe
    top_channels_df = pd.DataFrame()

    # for loop to get channel details
    for channel in video_list:
        response = youtube.channels().list(
            part=['snippet', 'statistics', 'topicDetails', 'contentDetails'],
            id=channel
        ).execute()
        # append response to dataframe
        top_channels_df = top_channels_df.append(response['items'], ignore_index=True)

    # seperate the snippet column into its own dataframe
    snippet_df = pd.json_normalize(top_channels_df['snippet'])
    snippet_df = snippet_df.drop(columns=['thumbnails.default.width', 'thumbnails.default.height', 'thumbnails.medium.url', 'thumbnails.medium.width', 'thumbnails.medium.height', 'thumbnails.high.url', 'thumbnails.high.width', 'thumbnails.high.height', 'localized.title', 'localized.description'])

    # seperate the statistics column into its own dataframe
    statistics_df = pd.json_normalize(top_channels_df['statistics'])

    # seperate the topicDetails column into its own dataframe
    topicDetails_df = pd.json_normalize(top_channels_df['topicDetails'])

    # seperate the contentDetails column into its own dataframe
    contentDetails_df = pd.json_normalize(top_channels_df['contentDetails'])

    top_channels_df = top_channels_df.merge(snippet_df, left_index=True, right_index=True)
    top_channels_df = top_channels_df.merge(statistics_df, left_index=True, right_index=True)
    top_channels_df = top_channels_df.merge(topicDetails_df, left_index=True, right_index=True)
    top_channels_df = top_channels_df.merge(contentDetails_df, left_index=True, right_index=True)
    
    # change data types
    top_channels_df['publishedAt'] = pd.to_datetime(top_channels_df['publishedAt'])
    top_channels_df['viewCount'] = top_channels_df['viewCount'].astype('int64')
    top_channels_df['subscriberCount'] = top_channels_df['subscriberCount'].astype('int64')
    top_channels_df['videoCount'] = top_channels_df['videoCount'].astype('int64')

    # Remove the url from Topic Categories
    # top_channels_df['topicCategories'] = top_channels_df['topicCategories'].str.replace('https://en.wikipedia.org/wiki/', '')
    
    # drop columns that are not needed
    top_channels_df = top_channels_df.drop(columns=['kind', 'etag', 'snippet', 'statistics', 'topicDetails', 'country', 'hiddenSubscriberCount', 'contentDetails', 'relatedPlaylists.likes'])

    # sort by view count
    top_channels_df = top_channels_df.sort_values(by='viewCount', ascending=False)

    # add 'title' column
    top_channels_df['category_title'] = title

    # export as csv
    top_channels_df.to_csv(f'{title}_df.csv', index=False)

    return top_channels_df

In [4]:
def get_50_videos(channel_id):
    # get most recent uploads
    response = youtube.channels().list(
        part=['contentDetails'],
        id=channel_id
    ).execute()

    # get the playlist id
    playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    # get the videos in the playlist
    response = youtube.playlistItems().list(
        part=['contentDetails'],
        playlistId=playlist_id,
        maxResults=50
    ).execute()

    # get the video ids
    video_ids = []
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])

    return video_ids

In [5]:
def video_details_grabber(video_id):
    response = youtube.videos().list(part=['snippet', 'statistics', 'topicDetails', 'contentDetails'], id=video_id).execute()

    # get channelID
    try:
        channel_id = response['items'][0]['snippet']['channelId']
    except:
        channel_id = np.nan

    # get video title
    try:
        video_title = response['items'][0]['snippet']['title']
        video_title_clean = ''.join(e for e in video_title if e.isalnum() or e == ' ')
        video_title_clean = video_title_clean.replace('  ', ' ')
    except:
        video_title = 'N/A'
        video_title_clean = 'N/A'

    # get video id
    try:
        video_id = response['items'][0]['id']
    except:
        video_id = np.nan

    # get the date time the video was published
    try:
        video_published = response['items'][0]['snippet']['publishedAt']
    except:
        video_published = None

    # get number of views
    try:
        video_views = response['items'][0]['statistics']['viewCount']
    except:
        video_views = 0

    # is the video made for kids?
    try:
        video_madeforkids = response['items'][0]['status']['madeForKids']
    except:
        video_madeforkids = False

    # get video description
    try:
        video_description = response['items'][0]['snippet']['description']
    except:
        video_description = ''

    # get like count
    try:
        video_likes = response['items'][0]['statistics']['likeCount']
    except:
        video_likes = 0

    # get nummber of comments
    try:
        video_comment_count = response['items'][0]['statistics']['commentCount']
    except:
        video_comment_count = 0

    # get length of video
    try:
        video_length = response['items'][0]['contentDetails']['duration']
        video_length = video_length.replace('PT', '')
        video_length = video_length.replace('H', ':')
        video_length = video_length.replace('M', ':')
        video_length = video_length.replace('S', '')
    except:
        video_length = None

    # get video description
    try:
        video_description = response['items'][0]['snippet']['description']
    except:
        video_description = ''

    # get video tags
    try:
        video_tags = response['items'][0]['snippet']['tags']
    except:
        video_tags = ''

    # retrieve youtube video results
    # video_response = youtube.commentThreads().list(part='snippet,replies', videoId=video_id).execute()

    # create empty dataframe
    df = pd.DataFrame(columns=[
        'channel_id',
        'video_title', 'video_title_clean', 'video_id', 'published', 'video_views', 'video_madeforkids', 
        'video_likes', 'video_comment_count', 'video_length', 'video_description', 'video_tags'])

    # append video details to dataframe
    df = df.append({
        'channel_id': channel_id,
        'video_title': video_title,
        'video_title_clean': video_title_clean,
        'video_id': video_id,
        'published': video_published,
        'video_views': video_views,
        'video_madeforkids': video_madeforkids,
        'video_likes': video_likes,
        'video_comment_count': video_comment_count,
        'video_length': video_length,
        'video_description': video_description,
        'video_tags': video_tags
    }, ignore_index=True)
    
    return df

In [6]:
top_channels = ['UCbCmjCuTUZos6Inko4u57UQ', 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UCX6OQ3DkcsbYNE6H8uQQuVA', 'UCk8GzjMOrta8yxDcKfylJYw', 'UCJplp5SjeGSdVdwsfb9Q7lQ', 
'UCJ5v_MCY6GNUBTO8-D3XoAg', 'UCvlE5gTbOvjiolFlEm-c_Ow', 'UC295-Dw_tDNtZXFeAPAW6Aw', 'UCIwFjwMjI0y7PDBVEO9-bkQ', 'UCcdwLMPsaU2ezNSJU1nFoBQ', 
'UCRijo3ddMTht_IHyNSNXpNQ', 'UC3gNmTGu-TTbFPpfSs5kNkg', 'UCfM3zsQsOnfWNUppiycmBuw', 'UC0C-w0YjGpqDXGB8IHb662A', 'UC9CoOnJkIBMdeijd9qYoT_g', 
'UC4NALVCmcmL5ntpV0thoH6w', 'UCqECaJ8Gagnn7YCbPEzWH6g', 'UCRx3mKNUdl8QE06nEug7p6Q', 'UCiGm_E4ZwYSHV3bcW1pnSeQ', 'UC4rlAVgAK0SGk-yTfe48Qpw']

In [7]:
random_channels=[
    'UCEGGyGmo0NbAPmw1zVNdXbg',
    'UCIWC3bm2eKj0wLGFULhzKdA',
    'UC-yI6rR_EGSY8cVs4qHo0BQ',
    'UCGZrQMFEhuo6cV4wenQ6xLQ',
    'UCwo2cn9mViQ8Y9xKH3ZPKDA',
    'UC9QMc_tF7lIdTQLpezdOCxA',
    'UCIPX0CTQmfGb7PoRG0MNsnQ',
    'UCMffaLPtQHcuQ6UY83qtM0g',
    'UCECGTK7DkU7I8SSfBItn3uw',
    'UC1zhHVE2KIwnqfJ5r77cM7A',
    'UCwcEhBb53uJO4ogeBLLh2yQ',
    'UCPXW0F7HwfXhZY6-rZwJt4w',
    'UCfxUyLb5D-KpCt15hfGbtrA',
    'UCfV7GnI8E_RnxMjc5-x8mAQ',
    'UCykt3TNvH5xJwA3O_WVvQpA',
    'UCfKqi5ZUXqXxErUgTdMkSDQ',
    'UCLq4U2sExYRjYd0tMdmWUmA',
    'UC9LGi7FRFTxNT7WK6jr6OXw',
    'UC5TKWxKKl1IRfTZsVgS0ziA',
    'UCmHCmV4rVl_2_wg2l60nzjA',
    'UCMM9Z0_Ur9U_hpmTzGaHvaA',
    'UClgD65bB6SJPTPv20qIVKDw',
    'UCMQEmETLCqv088mFEIlLQyQ',
    'UCj5BN5C0lmsq3QptHFEvipg',
    'UCTNAGaw3TauzN7pkJTaZZtg',
    'UCXwFtLCUu10dJKKWD_TD2LQ',
    'UC8Ewe7WqGg01KRNjJCO5cjg',
    'UCL-yKZ_kCuKBHyeRfePpvuw',
    'UCs0Pz9jzByzbhQGsMoW4jcw',
    'UC_lf4_Wss_uW0KGny_A3erg',
    'UC_E5qa-s0VEocKVV1KPtNgw',
    'UCG-PXXY7gyI7LOMeE0rl_KA',
    'UCIzKAXlRpnPzulWndJ0SOSA',
    'UCzOwecflGQFfHvMp-vMJCgQ',
    'UCEHB7jMymvg63b6x30qx5Jw',
    'UCitSKTq6Ghg2bBKC4YdKd_Q',
    'UCaBFtf4I61T9UO9WKhUG-dA',
    'UCS-SLEeQ1F7k7mpmhnKVMKw',
    'UCdBjEJpySOp2fY6f3x6TfPg',
    'UCged4xNWHYsJTGPQ5rn6HmA',
    'UCf9Ua7W95KDm28d16otkm0Q',
    'UCjlgDApB1OrU_3-1dLMHOZg',
    'UCfOjz88tFouLzzDWqN9apKg',
    'UC22gUNj0sgOlBAR_zLfdh5A',
    'UCVaXclURQZlakiTMzuwHvRw',
    'UC6YN4FNhAKN3MDO5DbJSnOA',
    'UC4z1t_toTtWFG6PGpcgzWcw',
    'UCHU5LSiGQsCbdPKKreHcITw',
    'UCrWtfbfTrZn9penvY3xwWig',
    'UCAlZ-9e75wau2hY_wWFliNA',
    'UC2NTBsmAC2ePOrUkpRvogoA',
    'UCJDIvab5y2yIBzzkbIgy8WQ',
    'UCrdJ-n4brmIZy975U-4iwsQ',
    'UCxAIx1VAAKW0F7L5Xaqensg',
    'UCQqaNnVhS1w_iTeFaIJsXog',
    'UCf7J0vxbg6SsIjY9587PEiQ',
    'UCmTSflg4X32Qkxuz5Mw1k8w',
    'UCXQC_SxY3XXbvbombLEu5hg',
    'UCnODsMthEUVnqFSisw8E4ow',
    'UCitLqDolQHZ_tldN-bXGDwg',
    'UCtxFOnPpYdQuDog6fGlp1Eg',
    'UCBjoR2uNSYyBVRKq-ZLinuw',
    'UCGffDssIzCydUTNgFNIJyxQ',
    'UCTM1z1vpulAL3zt61PDDzkw',
    'UCjGC05n79MPgDcnhtsG5_cQ',
    'UCK45CFLlvd9pP7IzVYDWIMw',
    'UCx29pmyVOjk7ZinBNUDVgnQ',
    'UC2CtsATmHPFq8Ky5olCu3jQ',
    'UCaPrv_BwRFvpIsSnXSmGt0w',
    'UCC7jlYxfWti7WAW8r7ef1RQ'
]

In [8]:
top_education_channels = [
'UC1zZE_kJ8rQHgLTVfobLi_g', 
'UCpVm7bg6pXKo1Pr6k5kxG9A', 
'UCX6b17PVsYBQ0ip5gyeme-Q', 
'UCsooa4yRKGN_zEE8iknghZA',
'UCXhSCMRRPyxSoyLSPFxK7VA',
'UCGi_crMdUZnrcsvkCa8pt-g',
'UCZYTClx2T1of7BRZ86-8fow',
'UCHnyfMqiRRG1u-2MsSQLbXA',
'UCYenDLnIHsoqQ6smwKXQ7Hg',
'UC4a-Gbdw7vOaccHmFo40b9g']
top_cooking_channels = [
'UCJFp8uSYCjXOMnkUyb3CQ3Q',
'UCYjk_zY-iYR8YNfJmuzd70A',
'UCpSgg_ECBj25s9moCDfSTsA',
'UCsP7Bpw36J666Fct5M8u-ZA',
'UCJHA_jMfCvEnv-3kRjTCQXw', 
'UCNbngWUqL2eqRw12yAwcICg', 
'UC8gFadPgK2r1ndqLI04Xvvw', 
'UCRxAgfYexGLlu1WHGIMUDqw', 
'UCbpMy0Fg74eXXkvxJrtEn3w', 
'UCfyehHM_eo4g5JUyWmms2LA']
top_fitness_channels = [
'UCiP6wD_tYlYLYh3agzbByWQ',
'UCIJwWYOfsCfz6PjxbONYXSg', 
'UCM1Nde-9eorUhq-teaWlgUA', 
'UCBINFWq52ShSgUFEoynfSwg', 
'UCEtMRF1ywKMc4sf3EXYyDzw', 
'UCyqR7WkL8i1b6xtSssDmW9w', 
'UCGMOauU8dOd4mv2bT3Tx57w',
'UCEQi1ZNJiw3YMRwni0OLsTQ',
'UC4GJndVHEhdmqLFBHOCi97A', 
'UCiH4auDlkM0tgn9ewT3B1Vw']
top_history_channels = [
'UC9MAhZQQd9egwWCxrwSIsJQ', 
'UClfEht64_NrzHf8Y0slKEjw', 
'UC510QYlOlKNyhy_zdQxnGYw',
'UCNIuvl7V8zACPpTmmNIqP2A', 
'UCggHoXaj8BQHIiPmOxezeWA',
'UC88lvyJe7aHZmcvzvubDFRg',
'UCodbH5mUeF-m_BsNueRDjcw',
'UCv_vLHiWVBh_FR9vbeuiY-A',
'UCx-dJoP9hFCBloY9qodykvw',
'UCHdluULl5c7bilx1x1TGzJQ']
top_science_channels = [
'UCC552Sd-3nyi_tk2BudLUzA',
'UCsXVk37bltHxD1rDPwtNM8Q', 
'UC6107grRI4m0o2-emgoDnAA', 
'UCUHW94eEFW7hkUMVaZz4eDg', 
'UC06E4Y_-ybJgBUMtXx8uNNw', 
'UCmQXOAse-VnzuXHebX5I77g', 
'UCxo8ooAqXiObjuaIy10ud0A', 
'UCvJiYiBUbw4tmpRSZT2r1Hw', 
'UCJcycnanWtyOGcz34jUlYZA',
'UC9uD-W5zQHQuAVT2GdcLCvg']
top_news_channels = [
'UCn8zNIfYAQNdrFRrr8oibKw',
'UCttspZesZIDEwwpVIgoZtWQ',
'UCfwx98Wty7LhdlkxL5PZyLA',
'UCupvZG-5ko_eiXAupbDfxWw',
'UCLXo7UDZvByw2ixzpQCufnA',
'UCE2606prvXQc_noEqKxVJXA',
'UC9k-yiEpRHMNVOnOi_aQK8w',
'UCBi2mrWuNuyYy4gbM6fU18Q',
'UC1yBKRuGpC1tSM73A0ZjYjQ',
'UC16niRr50-MSBwiO3YDb3RA']
top_music_channels = [
'UC0C-w0YjGpqDXGB8IHb662A',
'UCfM3zsQsOnfWNUppiycmBuw',
'UCYvmuw-JtVrTZQ-7Y4kd63Q',
'UCqECaJ8Gagnn7YCbPEzWH6g',
'UCb2HGwORFBo94DmRx4oLzow',
'UC9CoOnJkIBMdeijd9qYoT_g',
'UCpDJl2EmP7Oh90Vylx0dZtA',
'UCa10nxShhzNrCE1o2ZOPztg',
'UCoUM-UJ7rirJYP8CQ0EIaHA',
'UCEdvpU2pFRCVqU6yIPyTpMQ']
top_comedy_channels = [
'UCY30JRSgfhYXA6i6xX1erWg',
'UCV9_KinVpV-snHe3C3n1hvA',
'UC9gFih9rw0zNCK3ZtoKQQyA',
'UC8-Th83bH_thdKZDJCrn88g',
'UCxSz6JVYmzVhtkraHWZC7HQ',
'UCfm4y4rHF5HGrSr-qbvOwOg',
'UCPDis9pjXuqyI7RYLJ-TTSA',
'UCB0d0JLn1WcGYcwwZ87d2LA',
'UCPDXXXJj9nax0fr0Wfc048g',
'UCi9cDo6239RAzPpBZO9y5SA']
top_travel_channels = [
'UCHJuQZuzapBh-CuhRYxIZrg',
'UCyEd6QBSgat5kkC6svyjudA',
'UCdPambxHRj0kdFPNoJFM98A',
'UCXsQlHGuoWqukC9vz-uonrg',
'UCd5xLBi_QU6w7RGm5TTznyQ',
'UCGaOvAFinZ7BCN_FDmw74fQ',
'UC8hI77bH0VraIw6p2PHwivQ',
'UC_ptyMRLOsS1Uj0a34a_xCA',
'UCJsSEDFFnMFvW9JWU6XUn0Q',
'UCchgIh8Tc4sTmBfnMQ5pDdg']


In [9]:
topicId_list = {
'/m/04rlf': 'Music',
'/m/05fw6t':   "Children's music",
'/m/02mscn':   "Christian music",
'/m/0ggq0m':   "Classical music",
'/m/01lyv':   "Country",
'/m/02lkt':   "Electronic music",
'/m/0glt670':    "Hip hop music",
'/m/05rwpb':   "Independent music",
'/m/03_d0':    "Jazz",
'/m/028sqc':   "Music of Asia",
'/m/0g293':   "Music of Latin America",
'/m/064t9':    "Pop music",
'/m/06cqb':   "Reggae",
'/m/06j6l':   "Rhythm and blues",
'/m/06by7':   "Rock music",
'/m/0gywn':   "Soul music",
'/m/0bzvm2':    "Gaming",
'/m/025zzc':    "Action game",
'/m/02ntfj':    "Action-adventure game",
'/m/0b1vjn':    "Casual game",
'/m/02hygl':    "Music video game",
'/m/04q1x3q':    "Puzzle video game",
'/m/01sjng':   "Racing video game",
'/m/0403l3g':    "Role-playing video game",
'/m/021bp2':    "Simulation video game",
'/m/022dc6':    "Sports game",
'/m/03hf_rm':    "Strategy video game",
'/m/06ntj':    "Sports",
'/m/0jm_':   "American football",
'/m/018jz':    "Baseball",
'/m/018w8':    "Basketball",
'/m/01cgz':    "Boxing",
'/m/09xp_':    "Cricket",
'/m/02vx4':    "Football",
'/m/037hz':    "Golf",
'/m/03tmr':    "Ice hockey",
'/m/01h7lh':    "Mixed martial arts",
'/m/0410tth':    "Motorsport",
'/m/066wd':    "Professional wrestling",
'/m/07bs0':    "Tennis",
'/m/07_53':   "Volleyball",
'/m/02jjt':    "Entertainment",
'/m/095bb':    "Animated cartoon",
'/m/09kqc':    "Humor",
'/m/02vxn':    "Movies",
'/m/05qjc':    "Performing arts",
'/m/019_rr':    "Lifestyle",
'/m/032tl':    "Fashion",
'/m/027x7n':    "Fitness",
'/m/02wbm':    "Food",
'/m/0kt51':    "Health",
'/m/03glg':    "Hobby",
'/m/068hy':    "Pets",
'/m/041xxh':    "Physical attractiveness [Beauty]",
'/m/07c1v':    "Technology",
'/m/07bxq':    "Tourism",
'/m/07yv9':    "Vehicles",
'/m/01k8wb':    "Knowledge",
'/m/098wr':    "Society"
}

In [12]:
random_channel_df = channel_information_grabber(random_channels, 'random_channels')

random_channel_df.to_csv('random_channels.csv', index=False)

In [33]:
channel_information_grabber(top_channels, 'top_channels')
channel_information_grabber(top_news_channels, 'top_news_channels')
channel_information_grabber(top_music_channels, 'top_music_channels')
channel_information_grabber(top_comedy_channels, 'top_comedy_channels')
channel_information_grabber(top_travel_channels, 'top_travel_channels')
channel_information_grabber(top_cooking_channels, 'top_cooking_channels')
channel_information_grabber(top_education_channels, 'top_education_channels')
channel_information_grabber(top_fitness_channels, 'top_fitness_channels')
channel_information_grabber(top_history_channels, 'top_history_channels')
channel_information_grabber(top_science_channels, 'top_science_channels')



Unnamed: 0,id,title,description,customUrl,publishedAt,thumbnails.default.url,defaultLanguage,viewCount,subscriberCount,videoCount,topicIds,topicCategories,relatedPlaylists.uploads,category_title
1,UCsXVk37bltHxD1rDPwtNM8Q,Kurzgesagt – In a Nutshell,Animation videos explaining things with optimi...,@kurzgesagt,2013-07-09 20:17:20+00:00,https://yt3.ggpht.com/ytc/AMLnZu8giocL5QtOHe8h...,,2129399806,19500000,166,"[/m/02vxn, /m/098wr, /m/01k8wb, /m/02jjt, /m/0...","[https://en.wikipedia.org/wiki/Film, https://e...",UUsXVk37bltHxD1rDPwtNM8Q,top_science_channels
0,UCC552Sd-3nyi_tk2BudLUzA,AsapSCIENCE,Making science make sense.\n\nCreated by:\n\nM...,@asapscience,2012-05-28 17:33:43+00:00,https://yt3.ggpht.com/ytc/AMLnZu8txaNXV0qVSyaV...,,1790254768,10300000,445,"[/m/01k8wb, /m/019_rr]","[https://en.wikipedia.org/wiki/Knowledge, http...",UUC552Sd-3nyi_tk2BudLUzA,top_science_channels
2,UC6107grRI4m0o2-emgoDnAA,SmarterEveryDay,I explore the world using science. That's pre...,@smartereveryday,2006-04-04 04:12:24+00:00,https://yt3.ggpht.com/ytc/AMLnZu8B3Q1hLo0DTXMl...,,1074535210,10600000,359,"[/m/019_rr, /m/03glg, /m/07c1v]",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,UU6107grRI4m0o2-emgoDnAA,top_science_channels
7,UCvJiYiBUbw4tmpRSZT2r1Hw,Science Channel,"Learn about outer space, leading scientific ex...",@sciencechannel,2006-12-04 10:29:00+00:00,https://yt3.ggpht.com/VEx_GysV1ngnO_e2l-r2fiR8...,,1020971516,4400000,1908,"[/m/01k8wb, /m/019_rr, /m/0f2f9, /m/02jjt]","[https://en.wikipedia.org/wiki/Knowledge, http...",UUvJiYiBUbw4tmpRSZT2r1Hw,top_science_channels
4,UC06E4Y_-ybJgBUMtXx8uNNw,TheBackyardScientist,"Aloha YouTubers, I am The Backyard Scientist!\...",@thebackyardscientist,2013-04-01 16:41:52+00:00,https://yt3.ggpht.com/ytc/AMLnZu-pMhLCPGuSocGv...,,734126790,5410000,175,"[/m/019_rr, /m/01k8wb, /m/03glg]",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,UU06E4Y_-ybJgBUMtXx8uNNw,top_science_channels
9,UC9uD-W5zQHQuAVT2GdcLCvg,Science Insider,What you want to know about science. A section...,@insiderscience,2015-01-13 21:17:08+00:00,https://yt3.ggpht.com/ytc/AMLnZu8Sc12g4Xy4hHQS...,,542825326,2370000,527,"[/m/01k8wb, /m/0kt51, /m/019_rr]","[https://en.wikipedia.org/wiki/Knowledge, http...",UU9uD-W5zQHQuAVT2GdcLCvg,top_science_channels
3,UCUHW94eEFW7hkUMVaZz4eDg,minutephysics,Simply put: cool physics and other sweet scien...,@minutephysics,2011-06-20 18:20:09+00:00,https://yt3.ggpht.com/ytc/AMLnZu9e3dAiZi5wVlke...,en,513233182,5580000,268,"[/m/01k8wb, /m/019_rr]","[https://en.wikipedia.org/wiki/Knowledge, http...",UUUHW94eEFW7hkUMVaZz4eDg,top_science_channels
8,UCJcycnanWtyOGcz34jUlYZA,IncredibleScience,Experience incredible and cool science experim...,@incrediblescience,2009-10-27 18:33:19+00:00,https://yt3.ggpht.com/ytc/AMLnZu-ePytD00hE3EdC...,,350138531,847000,623,"[/m/01k8wb, /m/019_rr, /m/03glg]","[https://en.wikipedia.org/wiki/Knowledge, http...",UUJcycnanWtyOGcz34jUlYZA,top_science_channels
6,UCxo8ooAqXiObjuaIy10ud0A,Beyond Science,Hi! Welcome to Beyond Science!\nThis is where ...,@beyondscience,2013-10-27 20:31:16+00:00,https://yt3.ggpht.com/ytc/AMLnZu_5eztpR9min996...,,349942608,2040000,480,"[/m/01k8wb, /m/019_rr]","[https://en.wikipedia.org/wiki/Knowledge, http...",UUxo8ooAqXiObjuaIy10ud0A,top_science_channels
5,UCmQXOAse-VnzuXHebX5I77g,Charlie McDonnell,,@charlieissocoollike,2007-04-03 17:20:49+00:00,https://yt3.ggpht.com/j4diJtDd3rfqKiNyhViEuco5...,,0,2120000,0,,,UUmQXOAse-VnzuXHebX5I77g,top_science_channels


In [12]:
# Create a list of all the channels
all_channels = top_channels + top_education_channels + top_cooking_channels + top_fitness_channels + top_history_channels + top_science_channels + top_news_channels + top_music_channels + top_comedy_channels + top_travel_channels

len(all_channels)


110

In [13]:
# get the 50 videos from each channel in all_channels
video_list = []
for channel in all_channels:
    video_list.append(get_50_videos(channel))

# flatten the list
video_list = [item for sublist in video_list for item in sublist]

# convert to csv
df = pd.DataFrame(video_list)
df.to_csv('video_list.csv', index=False)

video_list




['lmH5uqwaFq8',
 '0SY0Yn0yF9o',
 'sNyF7BvVfxs',
 'K4kqqCzF-BA',
 'gfZmvllWVwY',
 'Ay3m-zZ4KLs',
 'mbPNkDEN3Ps',
 'kSx1tS52ov4',
 'o3eMkWX8E7U',
 'o1YenjwOp-A',
 'hqFfJBOrvHw',
 'QYTYKUMOuwo',
 'biXYtxZnChQ',
 'tkTYVy5sdIg',
 '-4nWoQwSHGE',
 'zhoVe42Pxb0',
 'lkBlyggtYhA',
 '3rKIRUm5oV4',
 '3QFTTxD7SuY',
 'DqhH2OUOl1M',
 'SSe98txCv00',
 'uHJN15kVTLY',
 'zHvKlszHzCg',
 'dxjciWnjd0M',
 '3LEW8jG-Q5Y',
 'usqngNJYnug',
 '3e_KEVMUvuo',
 'NSlWgxDFbe8',
 'f_8Bce7Y_p4',
 'G_MubFgSHdI',
 '-BztdXchohU',
 'tztm8t46_hI',
 'ep2qRkyIjek',
 'GbRm5DFfl88',
 'Xxa-wbT93QU',
 'Xf4XemBwzDY',
 'dfRBIE3AE34',
 'LA2q3QwhG54',
 'FGcXy0MZ3Ys',
 'iWwDq7-TCo4',
 'j7DH5splyoY',
 'U5Inxa3jK0Y',
 'Zre4vqUAgHU',
 'hqzvHfy-Ij0',
 'LCwxpu1uV60',
 'Lig3cP174Jo',
 'uCNLiwNFvuc',
 'wwosJXWGWRY',
 'TF-RmVyoEbM',
 'FtAZUKtC0Fk',
 '8XR9OzAeDqQ',
 'LzCA5zHayyk',
 'UhjjjqmGAkI',
 'cLFJjMokld8',
 'vwfQq9IA8TA',
 'P8XP4oamvzk',
 'sIw9GwePGKg',
 'KASLwYxrILs',
 'DYdrFALAlZQ',
 'JDsuYy1qsiA',
 'BQ35bZAPUnk',
 'fJ2_06vAZkw',
 'qQiwgE

In [14]:
len(video_list)

5420

In [14]:
# get the 50 videos from each channel in random_channels
video_list = []
for channel in random_channels:
    video_list.append(get_50_videos(channel))

# flatten the list
video_list = [item for sublist in video_list for item in sublist]

# convert to csv
random_df = pd.DataFrame(video_list)
random_df.to_csv('random_video_list.csv', index=False)

video_list


['YiqtVoky35M',
 'T9_h8ImN7rI',
 'UpvvygeZm58',
 '0LqgbbdcQlw',
 '7kk1TFekZ_w',
 'Zt-SrA1PKv4',
 'So2asp_-AXM',
 'R7EInXxh62k',
 'p25S5apc0sc',
 '8guqewzJgYU',
 '9iYBPpH9O1c',
 'Hc3DofIbxfA',
 'jpP4yfvQUpQ',
 'NgmS-WwTbGg',
 'rQ58tz2HER4',
 '_nwO2rLcU-Y',
 'QfJoKGeNlPU',
 '5m0dwsp1jl8',
 'T3XxsKFM0wk',
 'ipI9Bi4tqT4',
 'kOO0ddq9wBE',
 'BaqskyXzzWY',
 'SAfLN4yIFFU',
 'o2qAs-w3pVw',
 'GRruBFshwxU',
 'J6Xt_KbQK0A',
 'SJfYPj5WRkU',
 'GCzfeBjusAs',
 '-uM-kdkm3Q8',
 'lDpXGU077UE',
 'KtAu8wOl2GU',
 '9MPZTVIbZbA',
 '16A-iZ9WnKw',
 'Hp2gyG7L56s',
 '8P8ZJJHSNS8',
 'YGrv57UDZK0',
 'Z3-OmZtsYJo',
 '6frEsF6Iugc',
 'jrl9W8cXrng',
 'nPwyWbFbw1o',
 'q9rLOpiBqzY',
 'i_-zmTDhFiQ',
 '1FSC8UameIc',
 'CDnTZ6kJT90',
 'f1Y9JplqNys',
 'F5kUqW46-Vw',
 'wmZwE8_SQJg',
 'PNUdnUX-roo',
 '7SP1n1NOVwg',
 'OrU4LxclyUI',
 'mP7xwwuWRiU',
 'M_cWfT0G7lQ',
 'NwSP977itdM',
 'RAAGS885iEk',
 'ScjpcLAhIzI',
 'JaISpQeBUiI',
 'QID_58LSjPM',
 'hGg0HjcoP9w',
 '1LmE2mb1VaA',
 'AsRvjMiR2mQ',
 '6LFBDlJrKfM',
 'UTvR-dU0SXk',
 'biPJuu

In [15]:
len(video_list)

3273

In [17]:
# get information about each video

video_df = pd.DataFrame(columns=[
    'channel_id',
    'video_title', 'video_title_clean', 'video_id', 'published', 'video_views', 'video_madeforkids', 
    'video_likes', 'video_comment_count', 'video_length', 'video_description', 'video_tags'])

# get information about each video
for video in video_list:
    video_df = video_df.append(video_details_grabber(video), ignore_index=True)

video_df





Unnamed: 0,channel_id,video_title,video_title_clean,video_id,published,video_views,video_madeforkids,video_likes,video_comment_count,video_length,video_description,video_tags
0,UCEGGyGmo0NbAPmw1zVNdXbg,Python String Index Function #shorts #python #...,Python String Index Function shorts python pro...,YiqtVoky35M,2022-11-03T16:29:11Z,84,False,1,0,51,Python String Index Function #shorts #python...,"[python, python for beginners, python programm..."
1,UCEGGyGmo0NbAPmw1zVNdXbg,Python Split Function - Get last word of Stri...,Python Split Function Get last word of String...,T9_h8ImN7rI,2022-11-03T16:27:53Z,100,False,5,0,1:,Python Split Function - Get last word of Stri...,"[python, python for beginners, python programm..."
2,UCEGGyGmo0NbAPmw1zVNdXbg,Python unpacking operator * #shorts #python #p...,Python unpacking operator shorts python progra...,UpvvygeZm58,2022-11-03T15:43:52Z,46,False,2,0,1:,Python unpacking operator * #shorts #python \...,"[python, python for beginners, python programm..."
3,UCEGGyGmo0NbAPmw1zVNdXbg,Python using * with strings and numbers #short...,Python using with strings and numbers shorts p...,0LqgbbdcQlw,2022-11-03T15:40:05Z,96,False,2,0,58,Python using * with strings #shorts #python ...,"[python, python for beginners, python programm..."
4,UCEGGyGmo0NbAPmw1zVNdXbg,Python Multiply String with Number #shorts #py...,Python Multiply String with Number shorts pyth...,7kk1TFekZ_w,2022-11-03T15:36:12Z,36,False,0,0,33,Python Multiply String with Number #shorts #...,"[python, python for beginners, python programm..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3268,UCC7jlYxfWti7WAW8r7ef1RQ,In Paris use the Navigo pass like locals and s...,In Paris use the Navigo pass like locals and s...,xRi8SGcRDOY,2022-03-28T15:00:24Z,311,False,4,0,9,,
3269,UCC7jlYxfWti7WAW8r7ef1RQ,How to save money in Paris by using the Paris ...,How to save money in Paris by using the Paris ...,C-34pIsWZPk,2022-03-27T15:00:04Z,4642,False,166,79,4:49,"We all want to save money while traveling, so ...","[how to save money in paris, using the paris m..."
3270,UCC7jlYxfWti7WAW8r7ef1RQ,Magical Malta should 100% be on your bucket li...,Magical Malta should 100 be on your bucket lis...,L2GdB1gB1ZM,2022-03-26T15:00:09Z,5091,False,133,7,16,Save for your bucket list! Malta is a must vis...,
3271,UCC7jlYxfWti7WAW8r7ef1RQ,Is France on your bucket list? #shorts #france...,Is France on your bucket list shorts france tr...,89IewFGQQ6E,2022-03-25T18:00:03Z,31,False,1,0,16,Is France on your bucket list? Be sure to slow...,


In [18]:
# export to csv
video_df.to_csv('random_video_df.csv', index=False)


In [21]:
# import the video_df_success.csv
video_df = pd.read_csv('video_df_success.csv')

# drop null columns
video_df = video_df.dropna(axis=1, how='all')

print(video_df.columns)
print(video_df.shape)
print(video_df.describe())


Index(['video_id', 'channel_id', 'video_title', 'video_title_clean',
       'published', 'video_views', 'video_madeforkids', 'video_likes',
       'video_comment_count', 'video_length', 'video_description',
       'video_tags'],
      dtype='object')
(5420, 12)
        video_views   video_likes  video_comment_count
count  5.420000e+03  5.420000e+03          5420.000000
mean   6.159120e+06  1.385938e+05          6369.311624
std    3.796450e+07  5.579684e+05         26269.721757
min    0.000000e+00  0.000000e+00             0.000000
25%    6.177075e+04  1.663000e+03            69.000000
50%    3.383950e+05  1.104450e+04           447.500000
75%    2.354940e+06  5.700050e+04          2363.250000
max    2.073001e+09  1.451987e+07        728578.000000


In [27]:


# convert the published column to datetime
video_df['published'] = pd.to_datetime(video_df['published'])

# convert the video_length column to datetime
#video_df['video_length'] = pd.to_datetime(video_df['video_length'])

# convert the video_madeforkids column to boolean
video_df['video_madeforkids'] = video_df['video_madeforkids'].astype(bool)

# convert the video_views column to integer
video_df['video_views'] = video_df['video_views'].astype('int64')

# convert the video_likes column to integer
video_df['video_likes'] = video_df['video_likes'].astype('int64')

# convert the video_comment_count column to integer
video_df['video_comment_count'] = video_df['video_comment_count'].astype('int64')

#check datatypes
video_df.dtypes



video_id                            object
channel_id                          object
video_title                         object
video_title_clean                   object
published              datetime64[ns, UTC]
video_views                          int64
video_madeforkids                     bool
video_likes                          int64
video_comment_count                  int64
video_length                        object
video_description                   object
video_tags                          object
dtype: object

In [28]:
# export the cleaned csv
video_df.to_csv('video_df_cleaned.csv', index=False)