In [1]:
import os
import pickle
from collections import defaultdict

import pandas as pd
import pyyoutube
import requests
from tqdm import notebook
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
def load_api_keys(env_var="YOUTUBE_API_KEYS"):
    """Read YouTube Data API keys from an environment variable.

    The variable is expected to hold a comma-separated list of keys. Surrounding
    whitespace is stripped and empty entries are dropped.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        A list of key strings, in the order they appear in the variable
        (empty if the variable is unset or blank).
    """
    raw = os.environ.get(env_var, "")
    return [part.strip() for part in raw.split(",") if part.strip()]


# SECURITY: credentials must never be stored in the notebook itself. Any key that
# was previously committed here should be considered leaked and be revoked.
# Keys are tried in list order when the quota of the current one runs out.
keys = load_api_keys()
if not keys:
    print("No API keys configured: set YOUTUBE_API_KEYS to a comma-separated list of keys.")

In [3]:
# Build the client from the first key; get_info/get_uploads swap in a later key
# (via update_key) when a request fails with a quota error.
if not keys:
    raise RuntimeError("No YouTube API keys available: `keys` is empty, cannot create the API client.")
api = pyyoutube.Api(api_key=keys[0])

In [4]:
def update_key(api, key_list, probe_channel_id="UC0aanx5rpr7D1M7KCFYzrLQ"):
    """Return a client built from the first working key at or after the current one.

    Each candidate key is validated by issuing one `get_channel_info` request
    for `probe_channel_id`; the first key whose request does not raise wins.

    Args:
        api: The client currently in use. Its `_api_key` attribute determines
            where in `key_list` the search starts. If the key is not in the
            list (or `api` has no such attribute) the search starts at index 0.
        key_list: Candidate API keys, in the order they should be tried.
        probe_channel_id: Channel id used for the validation request.

    Returns:
        A new `pyyoutube.Api` for the first functional key, or None when every
        remaining key failed the probe.
    """
    print("Updating API key...")

    try:
        start = key_list.index(api._api_key)
    except (ValueError, AttributeError):
        # Unknown key or no usable client: fall back to probing every key
        start = 0

    # The current key itself is probed first; presumably it fails right away
    # when its quota is exhausted, after which the later keys are tried.
    for candidate_key in key_list[start:]:
        candidate = pyyoutube.Api(api_key=candidate_key)
        try:  # see if this key is functional
            candidate.get_channel_info(channel_id=probe_channel_id)
        except Exception:  # not functional, try the next one (Ctrl-C still interrupts)
            continue
        return candidate

    # if no key was functional, report it and let the caller decide
    print("No keys remaining...")
    return None

In [5]:
def _nested_get(mapping, *path):
    """Walk `path` through nested dicts; return None as soon as a level is missing."""
    node = mapping
    for step in path:
        if not isinstance(node, dict):
            return None
        node = node.get(step)
    return node


def get_info(api, channel_id):
    """Get general info of a channel.

    Args:
        api: Client exposing `get_channel_info(channel_id=...)`.
        channel_id: Id of the channel to look up.

    Returns:
        A `(results, api)` tuple. `results` is a defaultdict(list) holding
        single-element lists under the keys "channel", "channel_description",
        "country", "keywords" and "uploads", or None when the channel could not
        be retrieved. Fields absent from the API response are stored as None.
        `api` is the client to use for subsequent calls: the one passed in, or
        a replacement created by `update_key` after a quota error. It is never
        None.
    """
    try:
        # Get the channel info from the API
        channel_by_id = api.get_channel_info(channel_id=channel_id)
    except Exception as e:
        if "quota" not in str(e):  # unrelated failure: skip this channel
            print(f"Skipping: {channel_id} due to {e}")
            return None, api

        # Quota-limit reached: switch to another key and retry once
        fresh_api = update_key(api, keys)
        if fresh_api is None:
            # Keep returning the old client so the caller never ends up with None
            print(f"Skipping: {channel_id} because no API key has quota left")
            return None, api
        api = fresh_api
        try:
            channel_by_id = api.get_channel_info(channel_id=channel_id)
        except Exception as retry_error:
            print(f"Skipping: {channel_id} due to {retry_error}")
            return None, api

    if not channel_by_id:  # Channel no longer exists
        print(f"User {channel_id} not found")
        return None, api

    if not channel_by_id.items:
        return None, api

    c = channel_by_id.items[0].to_dict()

    results = defaultdict(list)
    results["channel"].append(channel_id)
    # Optional fields (e.g. country, keywords) may be missing for some channels
    results["channel_description"].append(_nested_get(c, "snippet", "description"))
    results["country"].append(_nested_get(c, "snippet", "country"))
    results["keywords"].append(_nested_get(c, "brandingSettings", "channel", "keywords"))
    results["uploads"].append(_nested_get(c, "contentDetails", "relatedPlaylists", "uploads"))

    return results, api

In [6]:
# Smoke-test get_info on a single channel id; the returned (results, api) tuple
# is displayed as the cell output.
get_info(api, "UCsXVk37bltHxD1rDPwtNM8Q")

(defaultdict(list,
             {'channel': ['UCsXVk37bltHxD1rDPwtNM8Q'],
              'channel_description': ['Videos explaining things with optimistic nihilism. \n\nWe are a small team who want to make science look beautiful. Because it is beautiful. \n\nCurrently we make one animation video per month. Follow us on Twitter, Facebook to get notified when a new one comes out.\n\nFAQ:\n \n- We do the videos with After Effects and Illustrator.'],
              'country': ['DE'],
              'keywords': ['Science universe space infographic evolution education funny death life history ebola Fermi'],
              'uploads': ['UUsXVk37bltHxD1rDPwtNM8Q']}),
 <pyyoutube.api.Api at 0x20f808b4f28>)

In [7]:
def get_uploads(api, channel_id, conspiracy, playlist_id, n):
    """Collect metadata and transcripts for the uploads of one channel.

    Args:
        api: Client exposing `get_playlist_items(playlist_id=..., count=...)`.
        channel_id: Channel id, used only in log messages. Either a string or
            a single-element list (which is what `create_dicts` passes).
        conspiracy: Label copied onto every returned video.
        playlist_id: Id of the uploads playlist; string or single-element list.
        n: Passed to the API as `count`.

    Returns:
        A `(uploads, api)` tuple. `uploads` is a defaultdict(list) of parallel
        lists under "channel", "video_id", "title", "description", "conspiracy"
        and "transcript" (empty string when no transcript could be fetched), or
        None when nothing could be retrieved. `api` is the client to use for
        subsequent calls and is never None.
    """

    def _unwrap(value):
        # Accept both "id" and ["id"] so every caller gets the same behaviour
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    channel_label = _unwrap(channel_id)
    playlist = _unwrap(playlist_id)

    if not playlist:  # nothing to query without an uploads playlist
        print(f"User {channel_label} not found")
        return None, api

    try:
        # Get the playlist info from the API
        ups = api.get_playlist_items(playlist_id=playlist, count=n)
    except Exception as e:
        if "quota" not in str(e):  # unrelated failure: skip this channel
            print(f"Skipping: {channel_label} due to an error (no uploads, timeout, etc.)")
            return None, api

        # Quota-limit reached: switch to another key and retry once
        fresh_api = update_key(api, keys)
        if fresh_api is None:
            # Keep returning the old client so the caller never ends up with None
            print(f"Skipping: {channel_label} because no API key has quota left")
            return None, api
        api = fresh_api
        try:
            ups = api.get_playlist_items(playlist_id=playlist, count=n)
        except Exception:
            print(f"Skipping: {channel_label} due to an error (no uploads, timeout, etc.)")
            return None, api

    if not ups:  # Playlist no longer exists
        print(f"User {channel_label} not found")
        return None, api

    if not ups.items:
        return None, api

    uploads = defaultdict(list)
    for up in ups.items:
        vid = up.to_dict()
        video_id = vid["contentDetails"]["videoId"]

        uploads["channel"].append(vid["snippet"]["channelId"])
        uploads["video_id"].append(video_id)
        uploads["title"].append(vid["snippet"]["title"])
        uploads["description"].append(vid["snippet"]["description"])
        uploads["conspiracy"].append(conspiracy)

        # Add transcript (best effort: any failure leaves it empty, but
        # Ctrl-C is no longer swallowed)
        try:
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            text = " ".join(segment["text"] for segment in segments)
        except Exception:
            text = ""

        uploads["transcript"].append(text)

    return uploads, api

In [8]:
# get_uploads(api, "UC0aanx5rpr7D1M7KCFYzrLQ", False, "UUsXVk37bltHxD1rDPwtNM8Q", 5)

In [9]:
# Load the labelled channel list and discard the unnamed leading column
# (presumably an index saved by an earlier export).
df = (
    pd.read_csv("../data/dataset_boolean.csv")
    .drop(columns=["Unnamed: 0"])
)
df.head()

Unnamed: 0,Channel_name,Conspiracy,Channel_id,LR
0,Simon Parkes,True,UCgzqRRDGThOlH4EHaqSXXPA,R
1,The Stoa,False,UCfI5jzpoUbwP4wkmQ6ZNqbA,C
2,X22 Report,True,UCKXM6nMVQW_VfjwqVAnxvdw,R
3,Ben Burgis,False,UCByZMNYpHFEetI0s3deYH2g,L
4,3D to 5D Consciousness,True,UC3VlH7lPbKlzVda5aJgyUvQ,R


In [10]:
def create_dicts(api, n, source_df=None):
    """Crawl channel info and uploads for every channel in a table.

    Args:
        api: Client handed to `get_info` / `get_uploads`. It is replaced
            whenever one of them returns a different client.
        n: Number of uploads requested per channel (forwarded to `get_uploads`).
        source_df: DataFrame with "Channel_id" and "Conspiracy" columns.
            Defaults to the notebook-level `df`.

    Returns:
        A `(channels_dict, uploads_dict)` tuple of defaultdict(list) objects
        holding column-wise lists: one entry per channel found, and one entry
        per uploaded video, respectively.
    """
    table = df if source_df is None else source_df

    channels_dict = defaultdict(list)
    uploads_dict = defaultdict(list)

    # Go over each channel in the dataset
    for _, row in notebook.tqdm(table.iterrows(), total=len(table)):
        # Find channel information
        channel, new_api = get_info(api, row["Channel_id"])

        # Refresh API if needed (never replace a working client with None)
        if new_api is not None:
            api = new_api

        # Channels that were not found contribute nothing. Skipping here also
        # keeps the previous channel's uploads from being appended a second time.
        if not channel:
            continue

        for field, values in channel.items():
            channels_dict[field].append(values[0])
        channels_dict["conspiracy"].append(row["Conspiracy"])

        # Find uploads of the given channel. The channel and playlist ids are
        # forwarded as the single-element lists produced by get_info.
        uploads, new_api = get_uploads(api, channel["channel"], row["Conspiracy"], channel["uploads"], n)
        if new_api is not None:
            api = new_api

        # If uploads have been found, add them
        if not uploads:
            continue
        for field, values in uploads.items():
            uploads_dict[field].extend(values)

    return channels_dict, uploads_dict

In [11]:
# Run the full crawl over every row of `df`, passing n=10 as the per-channel
# upload count. Skipped channels are reported in the cell output.
channels, uploads = create_dicts(api, 10)

  0%|          | 0/6881 [00:00<?, ?it/s]

Skipping: UCFuuSJn_91xsrANbMd8h67g due to an error (no uploads, timeout, etc.)
Skipping: UCpO8DrZ1mSasrH93Ml-Ay9g due to an error (no uploads, timeout, etc.)
Skipping: UChnbPl3guXu81OXB__HAT-w due to an error (no uploads, timeout, etc.)
Skipping: UCC2Zd5bz-rUnmdsEpkjkXBg due to an error (no uploads, timeout, etc.)
Skipping: UC85FErR-Kn_IX-xrpBpIkMQ due to an error (no uploads, timeout, etc.)
Skipping: UCVJ6bEJChervVShvOTG0qRw due to an error (no uploads, timeout, etc.)
Skipping: UCFvub3YrEUoXl8aPHZBbHqw due to an error (no uploads, timeout, etc.)
Skipping: UCLyV5MJZV-kXunNftlFif1A due to an error (no uploads, timeout, etc.)
Skipping: UCGlLEGoGE1ZUAMQcUFNa8QQ due to an error (no uploads, timeout, etc.)
Skipping: UC2D7vZh77B_dx4-c7o8KGWw due to an error (no uploads, timeout, etc.)
Skipping: UCHAv1g2JODsrkUKfHh1nAwQ due to an error (no uploads, timeout, etc.)
Skipping: UCFpvg5yg5fMda91yiI1owSQ due to an error (no uploads, timeout, etc.)
Skipping: UCv6czyqlTA8Jyvf6m2AxlMA due to an error (

Skipping: UCDWmp9u3xBGwFR1iEIeEyOg due to an error (no uploads, timeout, etc.)
Skipping: UC5LYQ3rerGrxfSfDX7FAWdg due to an error (no uploads, timeout, etc.)
Skipping: UCSwGdg2g8XpyBrXrIpuNQxg due to an error (no uploads, timeout, etc.)
Skipping: UCfsgnppkYXuH9WbOXxUksQg due to an error (no uploads, timeout, etc.)
Skipping: UCOANeOBcvtSOjrDaQgeChBA due to an error (no uploads, timeout, etc.)
Skipping: UCRYK44h6TYZ3rxe-TMskyjA due to an error (no uploads, timeout, etc.)
Skipping: UCLdcgni1mCq1OxUE3HIL8QQ due to an error (no uploads, timeout, etc.)
Skipping: UCFfiBlGEdeZRaZsWV3v55fA due to an error (no uploads, timeout, etc.)
Skipping: UCy2gOV0CFageToTKf_0sCqw due to an error (no uploads, timeout, etc.)
Skipping: UC1KKegXvjrmbA5Jb-vRaFpg due to an error (no uploads, timeout, etc.)
Skipping: UCiU2_P6Jmvym6KXYXTBfwPQ due to an error (no uploads, timeout, etc.)
Skipping: UCsM7VR11bfAH5Dskiq4OBeg due to an error (no uploads, timeout, etc.)
Skipping: UCUwXt2JjBYFykysPD0A6TFw due to an error (

In [12]:
# Turn the column-wise channel lists into a DataFrame
channel_df = pd.DataFrame(channels)

# Persist the channel table; with default arguments to_csv also writes the row index
channel_df.to_csv("../data/channels.csv")

In [15]:
# Turn the column-wise upload lists into a DataFrame (one row per video)
uploads_df = pd.DataFrame(uploads)

# Persist the uploads table; with default arguments to_csv also writes the row index
uploads_df.to_csv("../data/uploads.csv")

In [16]:
# Also store the uploads DataFrame as a pickle, next to the CSV export.
# `pickle` is imported in the import cell at the top of the notebook.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('../data/uploads.pickle', 'wb') as handle:
    pickle.dump(uploads_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Display the uploads table as the cell output for a visual check
uploads_df

Unnamed: 0,channel,video_id,title,description,conspiracy,transcript
0,UCgzqRRDGThOlH4EHaqSXXPA,Eve8U63jOCA,16th January Update,www.simonparkes.org\n\nTelegram:\nhttps://t.me...,True,hello and just to let you know that i'm doing ...
1,UCgzqRRDGThOlH4EHaqSXXPA,T24xqK23NPk,13th January Update,Bitchute Channel:\nhttps://www.bitchute.com/si...,True,hello and welcome to another update it's the 1...
2,UCgzqRRDGThOlH4EHaqSXXPA,IJAR8FYdDcE,12th January Second Update Current News,To Join Connecting Consciousness:\nhttps://con...,True,hello and just a another quick update just to ...
3,UCgzqRRDGThOlH4EHaqSXXPA,NJej_CUkV8E,11th January Second Update Current News,www.simonparkes.org\n\nTo Join Connecting Cons...,True,hello uh just a very very very quick update it...
4,UCgzqRRDGThOlH4EHaqSXXPA,as9IEvaC9XA,11th January Update Current News,www.simonparkes.org\n\nTo Join Connecting Cons...,True,hello and welcome to a sort of flying update i...
...,...,...,...,...,...,...
65679,UCgRvm1yLFoaQKhmaTqXk9SA,tt6yWP2VWOU,"New Zealand's Push for Gun Reform, Explained |...",New Zealand is making some big changes. Their ...,False,New Zealand is making some big changes. [ARDEN...
65680,UCgRvm1yLFoaQKhmaTqXk9SA,t7pmHD8rqOA,The Rise and Fall of Myanmar’s Aung San Suu Ky...,A non-violent freedom fighter? A war crimes ap...,False,A non-violent freedom fighter? War crimes apo...
65681,UCgRvm1yLFoaQKhmaTqXk9SA,K7pzjFOe7X0,Spain's Plan to Quit the Coal Industry Explain...,Climate change is happening. And one industry ...,False,Climate change is happening. And one industry ...
65682,UCgRvm1yLFoaQKhmaTqXk9SA,KpYxFouaTvk,How Costa Rica Avoided Cold War Violence | Now...,Costa Rica has sometimes been called “the Swit...,False,Costa Rica has sometimes been called “the Swi...
