# YouTube video IDs

This notebook collects the unique video IDs found in a Kaggle dataset of trending YouTube videos and shows how to generate random video IDs.

In [1]:
import os
import random
import pandas as pd

# Collect unique video IDs from the *rsrishav* Kaggle dataset

In [2]:
# Per-country trending-video CSV files taken from the Kaggle dataset at
# https://www.kaggle.com/rsrishav/youtube-trending-video-dataset

# All eleven country files, ordered alphabetically by country code.
filenames_all = [
    f'../youtube_kaggle_dataset/data/{country}_youtube_trending_data.csv'
    for country in ('BR', 'CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US')
]

# Subset restricted to Canada, Great Britain and the USA.
filenames_CA_US_GB = [
    f'../youtube_kaggle_dataset/data/{country}_youtube_trending_data.csv'
    for country in ('CA', 'GB', 'US')
]

In [3]:
def collect_unique_video_ids(filenames, sortby='comment_count'):
    """
    Collect the unique 'video_id' values found across several CSV files.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the CSV files to read. Each file must contain a 'video_id'
        column and, when sorting is requested, the 'sortby' column.
    sortby : str or False, default 'comment_count'
        Column used to rank the ids. Every id is ranked by the maximum value
        of this column over all of its rows, largest first. Any falsy value
        (e.g. False) disables sorting.

    Returns
    -------
    list
        The unique video ids: ordered by 'sortby' (descending) when it is
        given, otherwise in order of first appearance. Rows whose 'video_id'
        is missing are ignored. An empty 'filenames' gives an empty list.

    Raises
    ------
    KeyError
        If 'video_id' (or the 'sortby' column) is absent from the data.
    """

    wanted_columns = {'video_id', sortby} if sortby else {'video_id'}

    # Only the id column (plus the ranking column) is ever used, so skip the
    # rest of each file. A callable is passed to `usecols` because, unlike a
    # list of names, it does not raise inside read_csv when a column is absent;
    # a missing column therefore still surfaces as the KeyError raised below.
    frames = [
        pd.read_csv(filename, usecols=lambda column: column in wanted_columns)
        for filename in filenames
    ]

    # pd.concat raises ValueError when given nothing to concatenate.
    if not frames:
        return []

    all_df = pd.concat(frames)

    if sortby:
        # groupby(...).max() returns a Series indexed by video_id
        # (groupby leaves out rows whose video_id is missing)
        video_ids = all_df.groupby('video_id')[sortby].max().sort_values(ascending=False)
        return video_ids.index.tolist()

    # dropna keeps this branch consistent with the sorted one: a missing
    # video_id is not an id and would otherwise come back as a float NaN.
    return all_df['video_id'].dropna().unique().tolist()

In [4]:
def save_ids(ids, filename):
    """
    Write ids to a text file, one id per line.

    Parameters
    ----------
    ids : iterable of str
        The ids to store; each one is written followed by a newline.
    filename : str
        Destination path. An existing file is overwritten. Missing parent
        directories are created.
    """

    # Create the output folder on first use; otherwise open() raises
    # FileNotFoundError when the target directory does not exist yet.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w') as f:
        f.writelines(video_id + '\n' for video_id in ids)
            
def read_ids(filename):
    """ Load a text file and return its lines (without line endings) as a list of ids """

    with open(filename, 'r') as handle:
        contents = handle.read()

    # splitlines() drops the line terminators and yields [] for an empty file
    return contents.splitlines()

In [5]:
# Every distinct video id across all country files
# (sorting disabled, so ids keep their order of first appearance)
all_unique_ids = collect_unique_video_ids(filenames_all, sortby=False)

# Quick sanity check: how many ids were found, and a small sample of them
print(len(all_unique_ids))
print(all_unique_ids[:10])

save_ids(all_unique_ids, "video_IDs/unique_ids_from_kaggle.txt")

199728
['s9FH4rDMvds', 'jbGRowa5tIk', '3EfkCrXKZNs', 'gBjox7vn3-g', 'npoUGx7UW7o', 'Vu6PNpYKu2U', 'ly8jXKq_9AE', 'QAUqqcEU0Xc', 'eA4FRvf6vdM', '8f70QZQB4UA']


In [6]:
# Distinct video ids from the Canada, Great Britain and USA files,
# ranked by their highest 'comment_count' (largest first)
unique_ids_GB_CA_US = collect_unique_video_ids(
    filenames_CA_US_GB,
    sortby='comment_count',
)

# Quick sanity check: how many ids were found, and the top-ranked ones
print(len(unique_ids_GB_CA_US))
print(unique_ids_GB_CA_US[:10])

save_ids(unique_ids_GB_CA_US, "video_IDs/unique_ids_GB_CA_US.txt")

31848
['WMweEpGlu_U', 'gdZLi9oWNZg', '-5q5mZbe3V8', '2L6gsn7rGqI', '2IkoKhr6Tss', 'CuklIb9d3fI', 'vRXZj0DzXIA', 'awkkyBH2zEo', 'zFT3f9biz68', 'CKZvWhCqx1s']


# Generate random YouTube video IDs

In [7]:
def generate_random_video_id():
    """
    Build a random string shaped like a YouTube video id.

    The result has 11 characters, each drawn independently (with replacement)
    from the digits, the ASCII letters, "-" and "_".
    """

    # random.choices only needs an indexable sequence, so the string is used as-is
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_'
    picked = random.choices(alphabet, k=11)
    return "".join(picked)

In [9]:
# Show ten freshly generated sample ids, one per line
print("\n".join(generate_random_video_id() for _ in range(10)))

GFXQNCkE-hM
mU4dPofU61v
CMhGb5e0rU7
_TnXVBBlc77
SeCzy-EBTA_
8dP33JGWTyH
QZu6jM8j0z8
wlkntKbNRYp
2KS9KR8iL6Q
I3uG7PVxo_2
