In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium_stealth import stealth

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.neural_network import MLPClassifier

from nltk.stem.porter import *
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize

import pyyoutube
import time
import random
import string
import json
import pafy
import re
import pickle

from scipy import stats
from collections import defaultdict
from tqdm import notebook
from IPython.display import clear_output
from youtube_transcript_api import YouTubeTranscriptApi

import pandas as pd
import numpy as np

# Hide video unavailable warnings
import warnings
warnings.filterwarnings("ignore")

# YouTube API

In [2]:
# Pool of YouTube Data API keys; `update_key` walks this list when the active key
# stops working.
# NOTE(review): these keys are committed in plain text. Anyone who can read the
# notebook can use (and exhaust) them -- consider revoking them and loading the
# keys from environment variables or an untracked config file instead.
keys = ["AIzaSyBHonuF98PzbYOg7Z1ZFnlAaHjl0Gh3Kjg",  
        "AIzaSyDvaaNTMomMcvGwcz-TrvdrgTlvk4TDAeg", 
        "AIzaSyDvD8rnCKonVOnWAtZCfAu22svlgY9dsuU",
        "AIzaSyA1tCsmnGtTrNLDW_SKyWkArihc3o-bCho",
        "AIzaSyDvk4LR8GYYEMtuKwCQWcVWgaBnY2ftW8A"]

In [3]:
# Start with the second key of the pool. `update_key` only moves forward from the
# active key, so keys[0] is never tried by the rotation logic.
api = pyyoutube.Api(api_key=keys[1])

In [4]:
def update_key(api, key_list):
    """Return a pyyoutube client backed by a working key from ``key_list``.

    The search starts at the key ``api`` currently uses and moves forward through
    the list; keys located before the active one are not tried. Each candidate
    is probed with a channel lookup.

    Parameters:
        api: the client whose key stopped working (its ``_api_key`` is read).
        key_list: ordered list of API keys to choose from.

    Returns:
        A new ``pyyoutube.Api`` using the first key that answered the probe,
        or ``None`` when no remaining key works.
    """
    print("Updating API key...")

    try:
        start = key_list.index(api._api_key)
    except ValueError:
        # The active key is not part of the rotation list: consider every key.
        start = 0

    for candidate in key_list[start:]:
        fresh_api = pyyoutube.Api(api_key=candidate)

        try:  # see if this key is functional
            fresh_api.get_channel_info(channel_id="UC0aanx5rpr7D1M7KCFYzrLQ")
        except Exception:
            # Only ordinary errors mean "try the next key"; KeyboardInterrupt and
            # SystemExit are allowed to propagate so the run can be stopped.
            continue

        return fresh_api

    # if no key was functional, exit
    print("No keys remaining...")
    return None

# Loading the data

In [5]:
# Bot accounts. Rows are later iterated with itertuples() and unpacked
# positionally as (index, mail, password, usertype).
mails = pd.read_csv("../data/mails.csv")

In [6]:
# Pool of videos to watch. `vid_to_watch` reads the "video_id" column and uses
# "conspiracy" as a mask (presumably a boolean column -- confirm against the CSV).
videos = pd.read_csv("../data/uploads.csv")

In [7]:
# Training corpus: its "full_text" column is what the TF-IDF vectorizer is fitted
# on, and its "video_id" column is used for the nearest-neighbour lookups.
equal = pd.read_csv("../data/training_videos.csv")

In [8]:
# Vectorize words
# max_df=.75 drops terms found in more than 75% of the documents; min_df=2 drops
# terms found in fewer than two documents. `vector` is the TF-IDF matrix of the
# training corpus and is reused for the nearest-neighbour search.
vectorizer = TfidfVectorizer(max_df=.75, min_df=2)
vector = vectorizer.fit_transform(equal['full_text'])

In [9]:
# Load the trained classifier from disk; when no usable model file exists, train
# a new one and cache it.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load model files that were produced by this notebook.
try:
    with open('MLP_trained.sav', 'rb') as model_file:
        mlp = pickle.load(model_file)
except Exception:
    # Train
    # NOTE(review): X and Y are not defined anywhere in the visible notebook, so
    # this fallback raises NameError on a fresh kernel. Presumably X is the
    # TF-IDF matrix and Y the conspiracy labels -- confirm and define them here.
    mlp = MLPClassifier(hidden_layer_sizes=[10]*10, activation="identity",
                        random_state=0).fit(X, Y)

    # save the model to disk
    with open('MLP_trained.sav', 'wb') as model_file:
        pickle.dump(mlp, model_file)

# Helpers

In [10]:
def process_duration(string):
    """Convert an ISO 8601 duration (e.g. ``"PT1H2M3S"``) to whole seconds.

    The duration is parsed with a regular expression instead of being rewritten
    into an arithmetic expression and passed to ``eval``. Besides the
    hours/minutes/seconds part, an optional week/day part is understood, so
    values such as ``"P1DT2H"`` or ``"P0D"`` no longer fail.

    Parameters:
        string: the duration text, as found in ``contentDetails.duration``.

    Returns:
        The duration in seconds as an ``int``.

    Raises:
        ValueError: if ``string`` is not a supported ISO 8601 duration.
    """
    match = re.fullmatch(
        r"P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        string,
    )
    if match is None:
        raise ValueError(f"Unsupported duration format: {string!r}")

    # Components that are absent from the string count as zero.
    weeks, days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    return weeks * 604800 + days * 86400 + hours * 3600 + minutes * 60 + seconds

In [11]:
def _accessible_length(video_id):
    """Return the length in seconds reported by pafy, or None if it cannot be read."""
    try:
        return pafy.new(f"http://www.youtube.com/watch?v={video_id}").length
    except Exception:  # video removed, private, region-locked, ...
        return None


def vid_to_watch(videos, vectorizer, vector, equal, api, 
                 usertype = 1, experiment_part = 1, user = 0, vid_num = 0):
    """Pick the next video to watch and decide how long to watch it.

    * ``usertype`` 1, or any user in ``experiment_part`` 2: a random
      non-conspiracy video from ``videos``.
    * ``usertype`` 2: a random conspiracy video from ``videos``.
    * ``usertype`` 3 or 4: the ``vid_num``-th neighbour returned by
      ``get_similar_video`` for this ``user``.

    Videos drawn from ``videos`` are re-drawn until pafy reports a length of at
    most one hour; inaccessible videos and videos without a usable length are
    skipped. Note that this loops forever if no video in the pool qualifies.

    Parameters:
        videos: DataFrame with "video_id" and "conspiracy" columns.
        vectorizer, vector, equal, api: forwarded to ``get_similar_video``.
        usertype: watching strategy of the account (1-4).
        experiment_part: 1 or 2; part 2 always watches non-conspiracy videos.
        user: user id, forwarded to ``get_similar_video``.
        vid_num: neighbour rank, forwarded to ``get_similar_video``.

    Returns:
        ``(watch_time, to_watch)``: seconds to watch (at most 3600) and the
        video id.

    Raises:
        ValueError: if ``usertype`` is not 1-4 (outside experiment part 2).
    """
    if experiment_part == 2 or usertype in (1, 2):
        # Experiment part 2 overrides the usertype: everybody watches
        # non-conspiracy videos there. Previously a usertype of 3 or 4 in part 2
        # selected a video but never determined its length.
        wants_conspiracy = usertype == 2 and experiment_part != 2
        mask = videos["conspiracy"] if wants_conspiracy else ~videos["conspiracy"]
        pool = videos[mask]["video_id"]

        # Videos over 1 hour (or without a readable length) will be skipped
        while True:
            to_watch = np.random.choice(pool)
            vid_len = _accessible_length(to_watch)
            if vid_len and vid_len <= 3600:
                break
    elif usertype in (3, 4):  # Manually checked
        # NOTE(review): get_similar_video may hand back a client with a rotated
        # API key; it cannot be passed on because this function's return value
        # has no slot for it.
        to_watch, vid_len, _ = get_similar_video(user, api, equal, vector, vectorizer, vid_num)
    else:
        raise ValueError(f"Unknown usertype: {usertype!r}")

    # Calculate how much of the video will be watched, clamped to 10%-100%
    percentage = min(max(np.random.normal(0.55, 0.25), 0.1), 1)

    # Conspiracy videos might be longer than an hour
    # So they should be watched at max 1 hour
    watch_time = min(percentage * vid_len, 3600)

    return watch_time, to_watch

In [25]:
def get_similar_video(user, api, equal, vector, vectorizer, vid_num):
    """Return a video that is textually similar to the seed video of ``user``.

    The seed is chosen from a fixed list of five videos by ``(user - 11) % 5``.
    Its text is vectorized and the ``vid_num``-th of its five nearest neighbours
    (cosine distance over ``vector``) is selected. The neighbour's duration is
    read from the YouTube API, falling back to pafy and finally to 600 seconds.

    Parameters:
        user: numeric user id.
        api: pyyoutube client; replaced when its quota is exhausted.
        equal: DataFrame with "video_id" and "full_text" columns, row-aligned
            with ``vector``.
        vector: TF-IDF matrix the neighbour search is fitted on.
        vectorizer: fitted vectorizer used to transform the seed text.
        vid_num: rank (0-4) of the neighbour to return.

    Returns:
        ``(vid_id, vid_len, api)``: the chosen video id, its length in seconds
        and the (possibly refreshed) API client.
    """
    # Find the original video to watch
    initial = ["eCR8LMeTJok", "Zmy5ehd645g", "hl11KmzWaVg", "NfaqZNLcpPQ", "Y4lwsqG6XOg"]
    seed_id = initial[(user - 11) % 5]
    starting_vid = equal[equal["video_id"] == seed_id]

    # Find its neighbors
    knn = NearestNeighbors(n_neighbors=5, metric='cosine')
    knn.fit(vector)

    # Get the nth nearest neighbor (n = vid_num)
    trans = vectorizer.transform(starting_vid["full_text"].values)
    nearest = knn.kneighbors(trans, n_neighbors=5, return_distance=False)
    vid_id = equal.iloc[nearest[0][vid_num]]["video_id"]

    # Find its duration. `dct` stays None whenever the API cannot answer, so the
    # pafy fallback below is reached instead of failing on an unbound name.
    dct = None
    try:
        dct = api.get_video_by_id(video_id = vid_id).items[0].to_dict()
    except Exception as e:
        if "quota" in str(e): # if the error was caused due to the quota-limit, refresh the key
            fresh_api = update_key(api, keys)
            if fresh_api is not None:  # keep the old client when no key is left
                api = fresh_api
                try:
                    dct = api.get_video_by_id(video_id = vid_id).items[0].to_dict()
                except Exception:
                    dct = None

    # If the API returned nothing, try using pafy
    if dct:
        vid_len = process_duration(dct["contentDetails"]["duration"])
    else:
        url = f"http://www.youtube.com/watch?v={vid_id}"
        try:
            vid_len = pafy.new(url).length
        except Exception:
            vid_len = 600  # last-resort default length

    return vid_id, vid_len, api

In [13]:
def init_driver():
    """Create a Chrome webdriver and apply the selenium-stealth patches to it.

    Returns:
        The started driver.
    """
    options = webdriver.ChromeOptions()

    # Command-line switches, added in this order (the --headless flag is left
    # disabled).
    chrome_flags = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features",
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "start-maximized",
        "--mute-audio",
        'disable-notifications',
        "user-agent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'",
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    # Experimental options: drop the automation switch/extension and switch off
    # the credential service and password manager preferences.
    experimental_options = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
        ("prefs", {"credentials_enable_service" : False,
                   "profile.password_manager_enabled" : False}),
    )
    for option_name, option_value in experimental_options:
        options.add_experimental_option(option_name, option_value)

    # Start driver
    driver = webdriver.Chrome(options=options)

    # Hide the fact we're using a bot
    fingerprint = dict(
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    stealth(driver, **fingerprint)

    return driver

In [14]:
def _click_optional(driver, xpath):
    """Click the element matching ``xpath``; do nothing when it is absent or not clickable."""
    try:
        driver.find_element_by_xpath(xpath).click()
    except Exception:
        # These dialogs only show up occasionally, so a failed lookup is expected.
        # KeyboardInterrupt is deliberately not caught.
        pass


def login_google(driver, mail, password):
    """Logs in on Google given a driver, email-address and password.
    This code only works if the email-address was made within chromedriver itself.

    When Google shows a page linking to its account-support site, the operator
    is prompted on stdin; answering "no" aborts the login.

    Parameters:
        driver: selenium webdriver to operate.
        mail: e-mail address of the account.
        password: password of the account.

    Returns:
        ``False`` when the operator answered "no" at the prompt, ``True`` otherwise.
    """
    # login on google
    driver.get("https://accounts.google.com/ServiceLogin")
    time.sleep(np.random.uniform(1, 1.5))

    # Fill in mail
    driver.find_element_by_id("identifierId").send_keys(mail)
    driver.find_element_by_id("identifierNext").click()

    time.sleep(np.random.uniform(1, 1.5))

    if driver.find_elements_by_xpath("//a[contains(@href, 'https://support.google.com/accounts/answer/')]"):
        manual = input(f"Sign in manually for bot {mail}")
        if manual == "no":
            return False

    # Fill in password
    driver.find_element_by_xpath("//input[@name = 'password']").send_keys(password)
    driver.find_element_by_xpath('//button[contains(@class, "VfPpkd-LgbsSe")]').click()

    time.sleep(1)

    # If security-check is asked, skip it
    _click_optional(driver, "//div[contains(@class, 'U26fgb O0WRkf')]")

    driver.get("https://youtube.com")
    time.sleep(1.5)

    # Dismiss the optional interstitials YouTube may show after login
    _click_optional(driver, "//a[@id = 'return-to-youtube']")
    _click_optional(driver, "//div[@class = 'VfPpkd-RLmnJb']")

    return True

## Watching the video

In [15]:
def prepare_video(driver):
    """Does everything necessary to start watching a video.

    Confirms the content warning if one is shown, tries to skip up to two ads,
    switches autoplay off and presses play when the player is paused.

    Parameters:
        driver: selenium webdriver currently showing a YouTube watch page.
    """
    time.sleep(1)

    # Some videos get a warning for inappropriate content
    content_warning = driver.find_elements_by_xpath(
        "//paper-button[text() = 'I understand and wish to proceed']")

    if content_warning:
        content_warning[0].click()

    # Skip ad(s) if they are there
    for _ in range(2):
        time.sleep(0.5)
        if driver.find_elements_by_xpath("//img[@class = 'ytp-ad-image']"):
            time.sleep(6)  # wait before looking for the skip button
            try:
                driver.find_element_by_xpath("//div[@class = 'ytp-ad-text ytp-ad-skip-button-text']").click()
            except Exception:
                pass  # not every ad can be skipped

    # Toggle autoplayer
    time.sleep(0.5)
    try:
        autoplay = driver.find_element_by_xpath("//div[@class = 'ytp-autonav-toggle-button']")
    except Exception:
        # Without this default a missing toggle made the check below fail on an
        # unbound local variable.
        autoplay = None

    if autoplay is not None and autoplay.get_attribute("aria-checked") == "true":
        autoplay.click()

    pause = driver.find_element_by_xpath("//button[contains(@class, 'ytp-play-button')]")

    # The button is labelled "Play (k)" while the video is paused
    if pause.get_attribute("aria-label") == "Play (k)":
        pause.click()

In [16]:
def in_seconds(timestamp):
    """Turn a colon-separated timestamp ("m:ss" or "h:mm:ss") into seconds.

    Every field counts sixty times as much as the field to its right, so the
    fields are folded left to right.
    """
    total = 0
    for field in timestamp.split(":"):
        total = total * 60 + int(field)
    return total

In [35]:
def check_video_running(driver, watch_time, vid_nr, n_vids, uid):
    """Report whether the current video should keep playing.

    Prints a progress block (user, video counter, elapsed/target seconds) and
    compares the player's current timestamp with ``watch_time``.

    Parameters:
        driver: selenium webdriver showing the video.
        watch_time: number of seconds the video should be watched.
        vid_nr: index of the current video (for the progress output).
        n_vids: total number of videos (for the progress output).
        uid: user id (for the progress output).

    Returns:
        ``True`` while the elapsed time is at most ``watch_time - 2`` seconds, or
        when the timestamp cannot be read; ``False`` once the target is reached.
    """
    # Make sure the settings are folded out so that the timestamp is visible
    settings = driver.find_element_by_xpath("//button[contains(@class, 'settings')]")
    # The attribute value is text: "false" is a non-empty string and therefore
    # truthy, so compare with "true" explicitly instead of negating the value.
    if settings.get_attribute("aria-expanded") != "true":
        try:
            settings.click()
        except Exception:
            pass

    timestamp = driver.find_element_by_class_name("ytp-time-current").text

    # Sometimes the timestamp cannot be displayed; only parse it when present
    elapsed = in_seconds(timestamp) if timestamp else None
    shown = "?" if elapsed is None else elapsed

    clear_output(wait=True)
    print(f"User : {uid}\nVideo: {vid_nr}/{n_vids}\nTime : {shown}/{int(watch_time)}")

    if elapsed is None:
        return True

    return elapsed <= watch_time - 2

## Strategy 3 and 4

In [36]:
def zipf(alpha, N):
    """Return the Zipf probabilities of ranks 1..N for exponent ``alpha``.

    Rank k gets the weight 1/k**alpha; the weights are divided by their sum so
    that the returned list adds up to 1.
    """
    weights = [1/rank**alpha for rank in range(1, N + 1)]
    normaliser = sum(weights)
    return [weight/normaliser for weight in weights]

In [37]:
def _translate_with_retry(translate_once, fallback):
    """Run ``translate_once``; on failure wait 3 seconds and retry once, then return ``fallback``."""
    try:
        return translate_once()
    # Deal with connection stutters
    except Exception as e:
        print(e)
        # Wait three seconds to let the connection stabilize
        time.sleep(3)
        # Try again, and if it still doesn't work, give up
        try:
            return translate_once()
        except Exception:
            return fallback


def translate_text(text):
    """Translate ``text`` to English when it is detected as another language.

    Texts of 5000 characters or more are translated in chunks of 4999
    characters because the translator only accepts shorter inputs. When language
    detection fails the text is treated as English; when translation fails twice
    the original text is returned unchanged.

    NOTE(review): ``detect`` and ``GoogleTranslator`` are not imported anywhere in
    the visible notebook. If they are missing, the resulting NameError is
    absorbed by the detection fallback and the text is returned untranslated.

    Parameters:
        text: the text to translate.

    Returns:
        The English translation, or ``text`` itself.
    """
    # Detect language
    try:
        lang = detect(text)
    except Exception:
        lang = "en"

    # English text needs no translation
    if lang == "en":
        return text

    # The translator only handles texts of less than 5000 characters, so longer
    # texts have to be split
    if len(text) >= 5000:
        text_split = [text[start:start+4999] for start in range(0, len(text), 4999)]
        return _translate_with_retry(
            lambda: ''.join([GoogleTranslator(source="auto", target='en').translate(txt)
                             for txt in text_split]),
            text,
        )

    return _translate_with_retry(
        lambda: GoogleTranslator(source=lang, target="en").translate(text),
        text,
    )

In [38]:
def stem_text(text):
    """Tokenize ``text``, drop English stop words and return the stemmed words.

    Tokens are compared with the stop-word list before they are lowercased, so
    capitalised stop words are kept. Surrounding punctuation is stripped, tokens
    that become empty are dropped, and the Porter stems are joined with single
    spaces.
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    stems = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        cleaned = token.strip(string.punctuation).lower()
        if cleaned:
            stems.append(stemmer.stem(cleaned))

    return " ".join(stems)

In [90]:
def _first_item_with_key_rotation(api, method_name, **params):
    """Call ``api.<method_name>(**params)`` and return ``(first_item_dict, api)``.

    When the call fails with a quota error the API key is rotated once and the
    call is repeated. ``first_item_dict`` is None when no data could be fetched.
    """
    try:
        return getattr(api, method_name)(**params).items[0].to_dict(), api
    except Exception as e:
        if "quota" not in str(e):
            return None, api

    # if the error was caused due to the quota-limit, refresh the key
    fresh_api = update_key(api, keys)
    if fresh_api is None:
        return None, api
    try:
        return getattr(fresh_api, method_name)(**params).items[0].to_dict(), fresh_api
    except Exception:
        return None, fresh_api


def get_top_conspiracy(recommendations, api, vectorizer, mlp, watched_videos):
    """Finds the most conspiracy-like video out of all recommendations.

    For every recommendation the title, description, transcript, channel
    description and channel keywords are concatenated, translated, reduced to
    ASCII, stemmed, vectorized and scored with ``mlp``. Recommendations whose
    video metadata cannot be fetched are skipped, so a failed request can no
    longer reuse the metadata of the previous video.

    Parameters:
        recommendations: iterable of watch URLs (entries without "?v=" are skipped).
        api: pyyoutube client; replaced when its quota is exhausted.
        vectorizer: fitted vectorizer exposing ``transform``.
        mlp: classifier exposing ``predict_proba``; column 1 is read as the
            conspiracy probability (assumes that class order -- confirm).
        watched_videos: watch URLs of videos that were already watched.

    Returns:
        ``(vid_len, to_watch, api, con_odds, rec_text)``: length in seconds and id
        of the highest-scoring unwatched video, the API client, the video's
        conspiracy probability and the DataFrame with all scored recommendations.

    Raises:
        ValueError: if none of the recommendations could be scored.
        IndexError: if every scored recommendation has already been watched.
    """
    print("Finding top conspiracy video...")

    # Transcript languages to accept
    langs = ("en", "nl", "af", "sq", "de", "am", "ar","hy","az","eu","be","bn","my","bs","bg",
             "ca","ceb","zh-Hant","zh-Hans","co","da","en","eo","et","fil","fi","fr","fy","gl",
             "ka","el","gu","ht","ha","haw","iw","hi","hmn","hu","ga","ig","is","id","it","ja",
             "jv","yi","kn","kk","km","rw","ky","ku","ko","hr","lo","la","lv","lt","lb","mk","mg",
             "ml","ms","mt","mi","mr","mn","ne","no","ny","or","ug","uk","uz","ps","fa","pl","pt",
             "pa","ro","ru","sm","gd","sr","sn","sd","si","sl","sk","su","so","es","sw","tg","ta",
             "tt","te","th","cs","tk","tr","ur","vi","cy","xh","yo","zu","st","sv")
    rec_text = defaultdict(list)
    indeces = []

    # Get text for all recommendations
    for rec_url in recommendations:
        # Find video id
        try:
            vid_id = rec_url.split("?v=")[1].split("&t=")[0]
        except (AttributeError, IndexError):
            continue

        # Find title and description
        vid_dict, api = _first_item_with_key_rotation(api, "get_video_by_id", video_id=vid_id)
        if not vid_dict:
            continue  # no metadata: skip instead of scoring stale data

        snippet = vid_dict["snippet"]
        title = snippet["title"] or ""
        desc = snippet["description"] or ""

        # Find channel keywords and description
        chnl, api = _first_item_with_key_rotation(api, "get_channel_info",
                                                  channel_id=snippet["channelId"])
        if chnl:
            channel_desc = (chnl.get("snippet") or {}).get("description") or ""
            branding = (chnl.get("brandingSettings") or {}).get("channel") or {}
            channel_keywords = branding.get("keywords") or ""
        else:
            # Channel lookup failed: score the video on its own text only
            channel_desc = channel_keywords = ""

        try:
            transcript = " ".join([line["text"] for line in
                                   YouTubeTranscriptApi.get_transcript(vid_id, languages=langs)])
        except Exception:
            transcript = ""  # many videos have no transcript

        indeces.append(vid_id)
        rec_text["full_text"].append(title + " " + desc + " " + transcript
                                     + " " + channel_desc + " " + channel_keywords)
        rec_text["duration"].append(vid_dict["contentDetails"]["duration"])

    if not indeces:
        raise ValueError("None of the recommendations could be scored")

    # Clean text
    rec_text = pd.DataFrame(rec_text, index=indeces)
    rec_text["full_text"] = rec_text["full_text"].apply(translate_text)
    rec_text["full_text"] = rec_text["full_text"].apply(lambda text: text.encode("ascii", "ignore").decode())
    rec_text["full_text"] = rec_text["full_text"].apply(stem_text)

    # Vectorize
    tf_idf = vectorizer.transform(rec_text["full_text"].values)
    predictions = mlp.predict_proba(tf_idf)

    rec_text["non_con_odds"] = predictions[:,0]
    rec_text["con_odds"] = predictions[:,1]

    # Ids of the videos that were watched before
    watched_ids = set()
    for watched_url in watched_videos:
        try:
            watched_ids.add(watched_url.split("&t=")[0].split("?v=")[1])
        except (AttributeError, IndexError):
            pass

    # Most conspiracy-like video that is new to this user
    ranked = rec_text.sort_values(by="con_odds", ascending=False)
    for position in range(len(ranked)):
        top_conspiracy = ranked.iloc[position]
        if top_conspiracy.name not in watched_ids:
            break
    else:
        raise IndexError("Every scored recommendation has already been watched")

    # Get video id and length of the video
    to_watch = top_conspiracy.name
    vid_len = process_duration(top_conspiracy["duration"])

    print(f"Found top conspiracy video: {to_watch} ({top_conspiracy['con_odds']})")
    time.sleep(2)

    return vid_len, to_watch, api, top_conspiracy["con_odds"], rec_text

In [40]:
def choose_recommendation(recommendations, api, vectorizer, mlp, watched_videos):
    """Selects a video to watch based on a top 20 of given recommendations.

    The most conspiracy-like recommendation is watched when its conspiracy
    probability is at least 0.25. Otherwise a recommendation is drawn from a
    Zipf distribution over the list positions; draws without a usable length
    (e.g. livestreams, inaccessible videos) or longer than an hour are
    rejected. If no usable draw is found, the most conspiracy-like video is
    watched after all.

    Parameters:
        recommendations: list of recommended watch URLs, best position first.
        api: pyyoutube client.
        vectorizer: fitted text vectorizer.
        mlp: trained classifier.
        watched_videos: watch URLs of videos that were already watched.

    Returns:
        ``(watch_time, to_watch, api)``: seconds to watch (at most 3600), the id
        of the video and the (possibly refreshed) API client.
    """
    max_draws = 50  # upper bound on random draws, so a bad list cannot loop forever

    # Find most likely conspiracy video. get_top_conspiracy also returns the
    # scored table as a fifth value, which is not needed here.
    vid_len, to_watch, api, con_odds = get_top_conspiracy(
        recommendations, api, vectorizer, mlp, watched_videos)[:4]

    # If it most likely is not a conspiracy video, pick a recommendation
    if con_odds < 0.25:
        # Choose a video according to the zipf-distribution of video-selection (Zhou et al., 2010)
        odds = zipf(0.78, len(recommendations))

        for _ in range(max_draws):
            choice = np.random.choice(recommendations, p = odds)
            try:
                choice_len = pafy.new(choice).length
            except Exception:
                choice_len = None  # inaccessible video: draw again

            # Make sure no livestreams are clicked
            if choice_len and choice_len <= 3600:
                # Keep the id and the length of the same video together
                vid_len = choice_len
                to_watch = choice.replace("https://www.youtube.com/watch?v=", "")
                break

    # Calculate how much of the video will be watched, clamped to 10%-100%
    percentage = min(max(np.random.normal(0.55, 0.25), 0.1), 1)

    # Conspiracy videos might be longer than an hour
    # So they should be watched at max 1 hour
    watch_time = min(percentage * vid_len, 3600)

    return watch_time, to_watch, api

# The experiment

In [41]:
def experiment_part_1(videos, vectorizer, vector, equal, user, api, n):
    """Watches YouTube videos on one YouTube account. Videos are either
       retrieved from a dataset or chosen from the recommendations, depending
       on the usertype of the account.

       videos: the dataset containing videos (labeled as conspiracy True/False)
       vectorizer, vector, equal: fitted vectorizer, TF-IDF matrix and training
           corpus, forwarded to vid_to_watch
       user: one account as a tuple (id, mail, password, usertype)
       api: pyyoutube client
       n: the number of videos to be watched during the experiment

       Reads the trained classifier from the notebook-level variable `mlp`.

       returns three dicts of lists: the homepage recommendations collected after
       each watched video, the videos that have been watched, and the crashes
       (user, video number, reason). Returns (None, None, None) when the login
       was refused. The browser is always closed before returning."""

    # Initialize variables
    experiment_results = defaultdict(list)
    videos_per_user = defaultdict(list)
    crashes = defaultdict(list)
    rec_to_watch = None

    # Find current user's info
    uid, mail, password, usertype = user[0], user[1], user[2], user[3]

    # Initialize driver
    driver = init_driver()

    try:
        # Login to google; without a login there is nothing to measure
        if not login_google(driver, mail, password):
            return None, None, None

        # Start watching videos
        for i in range(1, n + 1):
            try:
                # If the next video is not a recommendation (usertype 1 or 2)
                if not rec_to_watch:
                    # Get a random video from the dataset and watch it
                    watch_time, to_watch = vid_to_watch(videos, vectorizer, vector, equal, api, usertype = usertype,
                                                        experiment_part = 1, user = uid, vid_num = i - 1)

                    driver.get(f"http://youtube.com/watch?v={to_watch}")
                else: # If it is one of the recommendations (usertype 3 or 4)
                    # Watch the recommendation; watch_time was set when it was chosen
                    driver.get(f"http://youtube.com/watch?v={rec_to_watch}")

                # Skip ads, disable autoplay
                prepare_video(driver)

                # Save information of current video being watched
                videos_per_user["user"].append(mail)
                videos_per_user["video_number"].append(i)
                videos_per_user["url"].append(driver.current_url)

                # Watch video
                while check_video_running(driver, watch_time, i, n, uid):
                    time.sleep(1)

                # If we have a usertype that relies on direct recommendations
                if usertype == 3 and i >= 5:
                    try:
                        # Pause the player while the recommendations are scored
                        driver.find_element_by_xpath("//button[contains(@class, 'ytp-play-button')]").click()
                    except Exception:
                        pass
                    video_recs = driver.find_elements_by_xpath(
                        "//a[contains(@class, 'ytd-compact-video-renderer')]")
                    watch_time, rec_to_watch, api = choose_recommendation([rec.get_attribute("href")
                                                                           for rec in video_recs][:20],
                                                                           api, vectorizer, mlp,
                                                                           videos_per_user["url"])

                # Go to the youtube homepage
                driver.get("http://youtube.com")
                time.sleep(1.5)

                # Get videos on youtube home
                channels = driver.find_elements_by_xpath("//a[@id = 'avatar-link']")
                vids = driver.find_elements_by_xpath("//a[@id = 'video-title-link']")

                # If we have a user that relies on homepage recommendations
                if usertype == 4 and i >= 5:
                    watch_time, rec_to_watch, api = choose_recommendation([rec.get_attribute("href")
                                                                           for rec in vids][:20],
                                                                           api, vectorizer, mlp,
                                                                           videos_per_user["url"])

                # Get top 20 recommendations. The links are read first and the
                # four columns are filled together, so the columns keep the same
                # length even when fewer than 20 tiles are shown.
                top_links = [(vid.get_attribute("href"), channel.get_attribute("href"))
                             for vid, channel in zip(vids[:20], channels[:20])]

                for video_href, channel_href in top_links:
                    experiment_results["user"].append(mail)
                    experiment_results["vids_watched"].append(i)
                    experiment_results["video"].append(video_href)
                    experiment_results["channel"].append(channel_href)

            except Exception as e: # if the video somehow doesn't get watched
                # Store where the crash happened and why
                crashes["user"].append(user[0])
                crashes["video number"].append(i)
                crashes["reason"].append(repr(e))
    finally:
        # Close the browser on every exit path, including interrupts
        driver.quit()

    return experiment_results, videos_per_user, crashes

In [48]:
# Run experiment part 1 and store the results per user as CSV files.
logs_1 = pd.DataFrame(columns=["user", "video number", "reason"])
failures = []  # names of result files that could not be written

# Go over every user in the csv
for user in mails.itertuples():
    # NOTE(review): only the account at index 0 is processed; drop this filter
    # to run the experiment for every account.
    if user[0] == 0:
        # Run experiment part 1 for current user
        recommendations, watched_videos, crashes = experiment_part_1(videos, vectorizer, vector, equal, user, api, 15)

        if recommendations is not None:
            # Save results for current user
            try:
                recommendations = pd.DataFrame(recommendations)
                recommendations.to_csv(f"recommendations_user_{user[0]}.csv")
            except Exception:
                failures.append(f"recommendations_user_{user[0]}.csv")

            try:
                watched_videos = pd.DataFrame(watched_videos)
                watched_videos.to_csv(f"watched_videos_user_{user[0]}.csv")
            except Exception:
                failures.append(f"watched_videos_user_{user[0]}.csv")

        # crashes is None when the login was refused and empty when all went well
        if crashes:
            logs_1 = pd.concat([logs_1, pd.DataFrame(crashes)], ignore_index=True)

if len(logs_1):
    logs_1.to_csv("logs_experiment_1.csv")

if failures:
    print(failures)
elif len(logs_1):
    # Do not report success when videos crashed along the way
    print(f"{len(logs_1)} crash(es) logged in logs_experiment_1.csv")
else:
    print("Script executed without errors!")

User : 0
Video: 6/15
Time : 4/86


KeyboardInterrupt: 

In [22]:
def experiment_part_2(videos, user, n):
    """Watches non-conspiracy videos on accounts in filter bubbles to see
       how quickly they can escape the bubble.

       videos: the dataset containing videos (labeled as conspiracy True/False)
       user: one account as a tuple (id, mail, password, ...)
       n: the number of videos to be watched during the experiment

       returns three dataframes: the homepage recommendations collected after
       each watched video, the videos that have been watched, and the crashes
       (user, video number, reason). When the login is refused no video is
       watched and the refusal is recorded as a crash with video number 0.
       The browser is always closed before returning."""

    # Initialize variables
    experiment_2_results = defaultdict(list)
    videos_per_user_2 = defaultdict(list)
    crashes_2 = defaultdict(list)

    # Find current user's info
    uid, mail, password = user[0], user[1], user[2]

    # NOTE(review): get_video_info is not defined in the visible notebook. It is
    # used when it exists; otherwise only the URL is recorded and the remaining
    # video details are left empty.
    video_info = globals().get("get_video_info")

    # Initialize driver
    driver = init_driver()

    try:
        # Login to google
        if not login_google(driver, mail, password):
            crashes_2["user"].append(uid)
            crashes_2["video number"].append(0)
            crashes_2["reason"].append("login failed")
            n = 0  # nothing can be measured without a login

        # Start watching videos
        for i in range(1, n + 1):
            try:
                # Get a random video from the dataset and watch it. In experiment
                # part 2 every account watches non-conspiracy videos, which only
                # needs the `videos` dataset; the text-model arguments are unused.
                watch_time, to_watch = vid_to_watch(videos, None, None, None, None,
                                                    usertype = 1, experiment_part = 2)
                driver.get(f"http://youtube.com/watch?v={to_watch}")

                # Skip ads, disable autoplay
                prepare_video(driver)

                # Save information of current video being watched
                if video_info is not None:
                    views, likes, dislikes, date, url = video_info(driver)
                else:
                    views = likes = dislikes = date = None
                    url = driver.current_url

                videos_per_user_2["user"].append(mail)
                videos_per_user_2["video_number"].append(i)
                videos_per_user_2["url"].append(url)
                videos_per_user_2["views"].append(views)
                videos_per_user_2["likes"].append(likes)
                videos_per_user_2["dislikes"].append(dislikes)
                videos_per_user_2["date"].append(date)

                # Watch video
                while check_video_running(driver, watch_time, i, n, uid):
                    time.sleep(1)

                # Go to the youtube homepage
                driver.get("http://youtube.com")
                time.sleep(1)

                # Get videos on youtube home
                channels = driver.find_elements_by_xpath("//a[@id = 'avatar-link']")
                vids = driver.find_elements_by_xpath("//a[@id = 'video-title-link']")

                # Get top 20 recommendations. The links are read first and the
                # four columns are filled together, so the columns keep the same
                # length even when fewer than 20 tiles are shown.
                top_links = [(vid.get_attribute("href"), channel.get_attribute("href"))
                             for vid, channel in zip(vids[:20], channels[:20])]

                for video_href, channel_href in top_links:
                    experiment_2_results["user"].append(mail)
                    experiment_2_results["vids_watched"].append(i)
                    experiment_2_results["video"].append(video_href)
                    experiment_2_results["channel"].append(channel_href)

            except Exception as e: # if the video somehow doesn't get watched
                # Store where the crash happened and why
                crashes_2["user"].append(user[0])
                crashes_2["video number"].append(i)
                crashes_2["reason"].append(repr(e))
    finally:
        # Close the browser on every exit path, including interrupts
        driver.quit()

    return pd.DataFrame(experiment_2_results), pd.DataFrame(videos_per_user_2), pd.DataFrame(crashes_2)

In [None]:
# Run experiment part 2 and store the results per user as CSV files.
logs_2 = pd.DataFrame(columns=["user", "video number", "reason"])

# Go over every user in the csv
for user in notebook.tqdm(mails.itertuples()):
    # Run experiment part 2
    recommendations_2, watched_videos_2, crashes_2 = experiment_part_2(videos, user, 15)

    # Store the results
    recommendations_2.to_csv(f"recommendations_part_2_user_{user[0]}.csv")
    watched_videos_2.to_csv(f"watched_videos_part_2_user_{user[0]}.csv")

    # Collect the crashes of this run (crashes_2, not the `crashes` left over
    # from experiment part 1)
    if len(crashes_2):
        logs_2 = pd.concat([logs_2, crashes_2], ignore_index=True)

    # NOTE(review): this stops after the first account; remove the break to run
    # the experiment for every account.
    break

if len(logs_2):
    logs_2.to_csv("logs_experiment_2.csv")
else:
    print("Script executed without errors!")