In [1]:
import json
import random
import string
import time
import warnings
from collections import defaultdict

import numpy as np
import pafy
import pandas as pd
from IPython.display import clear_output
from scipy import stats
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium_stealth import stealth
from tqdm import notebook
from webdriver_manager.chrome import ChromeDriverManager

# Hide video unavailable warnings
warnings.filterwarnings("ignore")

# Loading the data

In [2]:
# Accounts used in the experiment, one row per account. The experiment
# functions unpack each row positionally as (index, mail, password, usertype).
mails = pd.read_csv("../data/mails.csv")

In [3]:
# Video dataset to sample from; the helpers below read its "video_id" column
# and use its "conspiracy" column as a boolean mask.
videos = pd.read_csv("../data/uploads.csv")

# Helpers

In [48]:
def _video_length(video_id):
    """Returns the length (in seconds) that pafy reports for a video id,
    or None when the video cannot be loaded"""
    try:
        return pafy.new(f"http://www.youtube.com/watch?v={video_id}").length
    except Exception: # Video no longer accessible
        return None


def vid_to_watch(videos, usertype = 1, experiment_part = 1, user = 0):
    """Finds a video to watch and makes sure it's still accessible
    and at most one hour (3600 seconds) long.

    videos: dataframe with a "video_id" column and a boolean "conspiracy" column
    usertype: 1 watches non-conspiracy videos, 2 watches conspiracy videos,
              any other type starts from a manually checked seed video
    experiment_part: in part 2 every account watches non-conspiracy videos
    user: account id, only used to look up the seed video (index user - 11)

    returns (watch_time, video_id): the number of seconds to watch, and the
    id of the chosen video."""

    # Longest video (in seconds) that will be accepted
    max_len = 3600

    watch_regular = usertype == 1 or experiment_part == 2

    def pick_from_dataset():
        # Non-conspiracy for regular watchers, conspiracy for everybody else
        wanted = ~videos["conspiracy"] if watch_regular else videos["conspiracy"]
        return np.random.choice(videos[wanted]["video_id"])

    if watch_regular or usertype == 2:
        # The length is looked up for every video that comes from the dataset.
        # (Previously it was skipped for usertype >= 3 in experiment part 2,
        # which left vid_len undefined.)
        to_watch = pick_from_dataset()
        vid_len = _video_length(to_watch)
    else: # Manually checked
        # NOTE(review): assumes account ids 11-20 map onto these ten seeds - confirm
        seed_ids = ["sK8FAqWBmLw", "Zmy5ehd645g", "bB6KJacrnHk", "QgHkUtks7yE", "gW_xQmlsln4",
                    "4K28xuhtWt8", "-sLVqc7DfR4", "wVhuTg5Pbdo", "v9nNAYVwYFE", "fgL-mFAxQrI"]
        seed_lens = [559, 352, 227, 59, 2760,
                     583, 231, 126, 2796, 257]

        to_watch = seed_ids[user - 11]
        vid_len = seed_lens[user - 11]

    # Inaccessible videos, videos without a usable length and videos that
    # are too long are replaced by a fresh pick from the dataset
    while not vid_len or vid_len > max_len:
        to_watch = pick_from_dataset()
        vid_len = _video_length(to_watch)

    # Calculate how much of the video will be watched,
    # limited to between 10% and 100%
    percentage = min(max(np.random.normal(0.55, 0.25), 0.1), 1)

    watch_time = percentage * vid_len

    return watch_time, to_watch

In [6]:
def init_driver():
    """Creates a Chrome driver configured through ChromeOptions, applies
    selenium-stealth to it and returns it"""

    chrome_options = webdriver.ChromeOptions()

    # Command-line switches, added in exactly this order
    switches = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features",
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "start-maximized",
        "--mute-audio",
        'disable-notifications',
        "--headless",
    )
    for switch in switches:
        chrome_options.add_argument(switch)

    # Experimental options, including the password-manager preferences
    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
        ("prefs", {"credentials_enable_service" : False,
                   "profile.password_manager_enabled" : False}),
    )
    for option_name, option_value in experimental:
        chrome_options.add_experimental_option(option_name, option_value)

    # Start the browser with the collected options
    browser = webdriver.Chrome(options=chrome_options)

    # Hide the fact we're using a bot
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
    }
    stealth(browser, **stealth_settings)

    return browser

In [7]:
def login_google(driver, mail, password):
    """Logs in on Google given a driver, email-address and password.
    This code only works if the email-address was made within chromedriver itself

    driver: a selenium driver
    mail: the email-address of the account
    password: the password of the account

    Raises whatever selenium raises when the mail or password
    fields/buttons cannot be found."""

    # Elements are located with find_element(By...), which works on old and
    # new selenium releases (the find_element_by_* helpers were removed in 4.3)

    # login on google
    driver.get("https://accounts.google.com/ServiceLogin")
    time.sleep(np.random.uniform(1, 1.5))

    # Fill in mail
    driver.find_element(By.ID, "identifierId").send_keys(mail)
    driver.find_element(By.ID, "identifierNext").click()

    time.sleep(np.random.uniform(1, 1.5))

    # Fill in password
    driver.find_element(By.XPATH, "//input[@name = 'password']").send_keys(password)
    driver.find_element(By.XPATH, '//button[contains(@class, "VfPpkd-LgbsSe")]').click()

    time.sleep(0.5)

    # If security-check is asked, skip it. The prompt is optional, so a
    # missing button is the normal case and a failed click is not fatal.
    skip_buttons = driver.find_elements(By.XPATH, "//div[contains(@class, 'U26fgb O0WRkf')]")
    if skip_buttons:
        try:
            skip_buttons[0].click()
        except Exception:
            pass

In [8]:
def prepare_video(driver):
    """Does everything necessary to start watching a video: confirms the
    content warning, tries to skip up to two ads, switches autoplay off
    and presses play when the player is paused.

    driver: a selenium driver that has just opened a video page"""

    time.sleep(1)

    # Some videos get a warning for inappropriate content
    content_warning = driver.find_elements(
        By.XPATH, "//paper-button[text() = 'I understand and wish to proceed']")

    if content_warning:
        content_warning[0].click()

    # Skip ad(s) if they are there
    for _ in range(2):
        time.sleep(0.5)
        if driver.find_elements(By.XPATH, "//img[@class = 'ytp-ad-image']"):
            # Wait before looking for the skip button
            time.sleep(6)
            skip_button = driver.find_elements(
                By.XPATH, "//div[@class = 'ytp-ad-text ytp-ad-skip-button-text']")
            if skip_button:
                try:
                    skip_button[0].click()
                except Exception: # Skipping is best effort
                    pass

    # Toggle autoplayer. find_elements returns an empty list when the toggle
    # is missing, so the check below no longer fails on an undefined name.
    time.sleep(0.5)
    autoplay = driver.find_elements(By.XPATH, "//div[@class = 'ytp-autonav-toggle-button']")

    if autoplay and autoplay[0].get_attribute("aria-checked") == "true":
        autoplay[0].click()

    pause = driver.find_element(By.XPATH, "//button[contains(@class, 'ytp-play-button')]")

    # The button is labelled "Play (k)" while the video is not playing
    if pause.get_attribute("aria-label") == "Play (k)":
        pause.click()

In [9]:
def in_seconds(timestamp):
    """Converts a YouTube timestamp ("ss", "mm:ss" or "hh:mm:ss") to a
    number of seconds.

    timestamp: the timestamp as text; an empty or blank timestamp counts
               as 0 seconds instead of raising

    returns the number of seconds as an int"""
    if not timestamp or not timestamp.strip():
        return 0

    # Work from the largest unit to the smallest: every next part is worth
    # a sixtieth of the previous one
    seconds = 0
    for part in timestamp.strip().split(":"):
        seconds = seconds * 60 + int(part)

    return seconds

In [10]:
def check_video_running(driver, watch_time, vid_nr, n_vids, uid):
    """Checks if a video is still running and prints the progress.

    driver: a selenium driver that is on a video page
    watch_time: the number of seconds the video should be watched
    vid_nr, n_vids, uid: only used for the progress message

    returns True while the video should keep playing, False once the
    watch time has passed or the player shows that the video has ended."""
    # Make sure the settings are folded out so that the timestamp is visible.
    # The attribute comes back as text, so "false" has to be compared
    # explicitly: a plain truthiness test treats "false" as expanded.
    settings = driver.find_element(By.XPATH, "//button[contains(@class, 'settings')]")
    if str(settings.get_attribute("aria-expanded")).lower() != "true":
        try:
            settings.click()
        except Exception: # Opening the menu is best effort
            pass

    timestamp = driver.find_element(By.CLASS_NAME, "ytp-time-current").text

    # Sometimes the timestamp cannot be displayed; only convert it when it is there
    elapsed = in_seconds(timestamp) if timestamp else None
    shown = elapsed if elapsed is not None else "?"

    clear_output(wait=True)
    print(f"User : {uid}\nVideo: {vid_nr}/{n_vids}\nTime : {shown}/{int(watch_time)}")

    if elapsed is not None:
        return elapsed <= watch_time

    # Check if the video is done (can happen when ads play and the video is very short)
    ended = driver.find_elements(By.XPATH, "//div[contains(@class, 'ended-mode')]")
    return not ended

In [11]:
def zipf(alpha, N):
    """Returns the Zipf probabilities for ranks 1..N with exponent alpha,
    as a list that is normalised by the sum of all rank weights"""
    # Weight of every rank: 1 / rank^alpha
    weights = []
    for rank in range(1, N + 1):
        weights.append(1/rank**alpha)

    # Divide every weight by the total so the probabilities add up to one
    total = sum(weights)
    return [weight/total for weight in weights]

In [12]:
def choose_recommendation(recommendations):
    """Selects a video to watch based on a top 20 of given recommendations

    recommendations: a list of video urls, ordered by rank

    returns (watch_time, video_id): the number of seconds to watch, and the
    url of the chosen video with the "https://www.youtube.com/watch?v="
    prefix removed.

    Raises ValueError when the list is empty or when none of the
    recommendations is a loadable video of at most 3600 seconds."""

    recommendations = list(recommendations)
    if not recommendations:
        raise ValueError("No recommendations to choose from")

    # Choose a video according to the zipf-distribution of video-selection (Zhou et al., 2010)
    odds = zipf(0.78, len(recommendations))

    # Draw a complete weighted order without replacement and take the first
    # suitable video. This picks among the suitable videos with the same
    # relative odds as redrawing until one is suitable, but it always ends.
    order = np.random.choice(len(recommendations), size=len(recommendations),
                             replace=False, p=odds)

    for position in order:
        choice = recommendations[position]

        # Entries without a url cannot be watched
        if not choice:
            continue

        try:
            vid_len = pafy.new(choice).length
        except Exception: # Video cannot be loaded
            continue

        # Make sure no livestreams are clicked
        if vid_len and vid_len <= 3600:
            break
    else:
        raise ValueError("None of the recommendations can be watched")

    # Calculate how much of the video will be watched,
    # limited to between 10% and 100%
    percentage = min(max(np.random.normal(0.55, 0.25), 0.1), 1)

    watch_time = percentage * vid_len

    return watch_time, choice.replace("https://www.youtube.com/watch?v=", "")

In [13]:
def get_video_info(driver):
    """Gets basic information about a video

    driver: a selenium driver that is on a video page

    returns (views, likes, dislikes, date, url). views is an int, or None
    when the view count is not a plain number; likes and dislikes are the
    displayed texts, or None when the page does not show exactly those two
    counters."""

    raw_views = driver.find_element(By.XPATH, "//span[contains(@class, 'view-count')]").text

    # Strip thousands separators and the " views" / " view" suffix
    try:
        views = int(raw_views.replace(",", "").replace(" views", "").replace(" view", ""))
    except ValueError:
        views = None

    # Likes are sometimes disabled, in which case the two counters are not
    # both there. Unpacking is done only after checking the count.
    counters = driver.find_elements(By.XPATH, "//yt-formatted-string[contains(@aria-label, 'like')]")
    likes, dislikes = None, None
    if len(counters) == 2:
        try:
            likes, dislikes = counters[0].text, counters[1].text
        except Exception:
            likes, dislikes = None, None

    date = driver.find_element(By.XPATH, "//div[@id = 'date']/yt-formatted-string").text
    url = driver.current_url

    return views, likes, dislikes, date, url

# The experiment

In [44]:
def experiment_part_1(videos, user, n):
    """Watches YouTube videos on different YouTube accounts in different
       ways. Videos are either retrieved from a dataset or chosen from the
       recommendations. 
       
       videos: the dataset containing videos (labeled as conspiracy True/False)
       user: one account, indexable as (id, mail, password, usertype)
       n: the number of videos to be watched during the experiment
       
       returns three dataframes: one containing the homepage recommendations
       after each watched video, one containing all the videos that have been
       watched, and one containing the crashes (user, video number, reason).

       Errors while watching a video are stored in the crash dataframe;
       errors while logging in are raised after the driver has been closed."""

    # Number of homepage recommendations stored after every video
    n_homepage_recs = 30

    # Initialize variables
    experiment_results = defaultdict(list)
    videos_per_user = defaultdict(list)
    crashes = defaultdict(list)
    rec_to_watch = None

    # Find current user's info
    uid, mail, password, usertype = user[0], user[1], user[2], user[3]

    # Initialize driver
    driver = init_driver()

    # The driver is always closed, also when logging in fails
    try:
        # Login to google
        login_google(driver, mail, password)

        # Start watching videos
        for i in range(1, n + 1):
            try:
                # If the next video is not a recommendation (usertype 1 or 2)
                if not rec_to_watch:
                    # Get a random video from the dataset and watch it
                    watch_time, to_watch = vid_to_watch(videos, usertype, experiment_part = 1, user = uid)
                    driver.get(f"http://youtube.com/watch?v={to_watch}")
                else: # If it is one of the recommendations (usertype 3 or 4)
                    # Watch the recommendation
                    driver.get(f"http://youtube.com/watch?v={rec_to_watch}")

                # Skip ads, disable autoplay
                prepare_video(driver)

                # Save information of current video being watched
                views, likes, dislikes, date, url = get_video_info(driver)
                videos_per_user["user"].append(mail)
                videos_per_user["video_number"].append(i)
                videos_per_user["url"].append(url)
                videos_per_user["views"].append(views)
                videos_per_user["likes"].append(likes)
                videos_per_user["dislikes"].append(dislikes)
                videos_per_user["date"].append(date)

                # Watch video
                while check_video_running(driver, watch_time, i, n, uid):
                    time.sleep(1)

                # If we have a usertype that relies on direct recommendations
                if usertype == 3:
                    video_recs = driver.find_elements(
                        By.XPATH, "//a[contains(@class, 'ytd-compact-video-renderer')]")
                    watch_time, rec_to_watch = choose_recommendation([rec.get_attribute("href")
                                                                      for rec in video_recs][:20])

                # Go to the youtube homepage
                driver.get("http://youtube.com")
                time.sleep(1)

                # Get videos on youtube home
                channels = driver.find_elements(By.XPATH, "//a[@id = 'avatar-link']")
                vids = driver.find_elements(By.XPATH, "//a[@id = 'video-title-link']")

                # If we have a user that relies on homepage recommendations
                if usertype == 4:
                    watch_time, rec_to_watch = choose_recommendation([rec.get_attribute("href")
                                                                      for rec in vids][:20])

                # Get top 30 recommendations. All links are read before anything
                # is stored, so a failure cannot leave the result columns with
                # different lengths; fewer than 30 tiles are stored as they are.
                top_recs = [(vid.get_attribute("href"), channel.get_attribute("href"))
                            for vid, channel in zip(vids[:n_homepage_recs],
                                                    channels[:n_homepage_recs])]

                for video_href, channel_href in top_recs:
                    experiment_results["user"].append(mail)
                    experiment_results["vids_watched"].append(i)
                    experiment_results["video"].append(video_href)
                    experiment_results["channel"].append(channel_href)

            except Exception as e: # if the video somehow doesn't get watched
                # Store where the crash happened and why
                crashes["user"].append(user[0])
                crashes["video number"].append(i)
                crashes["reason"].append(repr(e))
    finally:
        driver.quit()

    return pd.DataFrame(experiment_results), pd.DataFrame(videos_per_user), pd.DataFrame(crashes)

In [22]:
def experiment_part_2(videos, user, n):
    """Watches non-conspiracy videos on accounts in filter bubbles to see
       how quickly they can escape the bubble. 
   
       videos: the dataset containing videos (labeled as conspiracy True/False)
       user: one account, indexable as (id, mail, password, ...)
       n: the number of videos to be watched during the experiment
       
       returns three dataframes: one containing the homepage recommendations
       after each watched video, one containing all the videos that have been
       watched, and one containing the crashes (user, video number, reason).

       Errors while watching a video are stored in the crash dataframe;
       errors while logging in are raised after the driver has been closed."""

    # Number of homepage recommendations stored after every video
    n_homepage_recs = 30

    # Initialize variables
    experiment_2_results = defaultdict(list)
    videos_per_user_2 = defaultdict(list)
    crashes_2 = defaultdict(list)

    # Find current user's info
    uid, mail, password = user[0], user[1], user[2]

    # Initialize driver
    driver = init_driver()

    # The driver is always closed, also when logging in fails
    try:
        # Login to google
        login_google(driver, mail, password)

        # Start watching videos
        for i in range(1, n + 1):
            try:
                # Get a random video from the dataset and watch it. Every account
                # watches non-conspiracy videos in this part, so no usertype is
                # passed (the name was never defined inside this function).
                watch_time, to_watch = vid_to_watch(videos, experiment_part = 2)
                driver.get(f"http://youtube.com/watch?v={to_watch}")

                # Skip ads, disable autoplay
                prepare_video(driver)

                # Save information of current video being watched
                views, likes, dislikes, date, url = get_video_info(driver)

                videos_per_user_2["user"].append(mail)
                videos_per_user_2["video_number"].append(i)
                videos_per_user_2["url"].append(url)
                videos_per_user_2["views"].append(views)
                videos_per_user_2["likes"].append(likes)
                videos_per_user_2["dislikes"].append(dislikes)
                videos_per_user_2["date"].append(date)

                # Watch video
                while check_video_running(driver, watch_time, i, n, uid):
                    time.sleep(1)

                # Go to the youtube homepage
                driver.get("http://youtube.com")
                time.sleep(1)

                # Get videos on youtube home
                channels = driver.find_elements(By.XPATH, "//a[@id = 'avatar-link']")
                vids = driver.find_elements(By.XPATH, "//a[@id = 'video-title-link']")

                # Get top 30 recommendations. All links are read before anything
                # is stored, so a failure cannot leave the result columns with
                # different lengths; fewer than 30 tiles are stored as they are.
                top_recs = [(vid.get_attribute("href"), channel.get_attribute("href"))
                            for vid, channel in zip(vids[:n_homepage_recs],
                                                    channels[:n_homepage_recs])]

                for video_href, channel_href in top_recs:
                    experiment_2_results["user"].append(mail)
                    experiment_2_results["vids_watched"].append(i)
                    experiment_2_results["video"].append(video_href)
                    experiment_2_results["channel"].append(channel_href)

            except Exception as e: # if the video somehow doesn't get watched
                # Store where the crash happened and why
                crashes_2["user"].append(user[0])
                crashes_2["video number"].append(i)
                crashes_2["reason"].append(repr(e))
    finally:
        driver.quit()

    return pd.DataFrame(experiment_2_results), pd.DataFrame(videos_per_user_2), pd.DataFrame(crashes_2)

In [23]:
# Crash log of experiment part 1, filled per user
logs_1 = pd.DataFrame(columns=["user", "video number", "reason"])

# Go over every user in the csv
for user in mails.itertuples():
    # Watch 15 videos with the current account
    recommendations, watched_videos, crashes = experiment_part_1(videos, user, 15)

    # Write this account's results to its own csv files
    recommendations.to_csv(f"recommendations_user_{user.Index}.csv")
    watched_videos.to_csv(f"watched_videos_user_{user.Index}.csv")

    # Keep the crashes of this account, if there were any
    if not crashes.empty:
        logs_1 = pd.concat([logs_1, crashes], ignore_index=True)

    # NOTE(review): this stops after the first account, so the remaining
    # rows of the csv are never processed - confirm this is intended
    break

# Only write the log when something went wrong
if logs_1.empty:
    print("Script executed without errors!")
else:
    logs_1.to_csv("logs_experiment_1.csv")

User : 0
Video: 2/2
Time : 1058/1057
Script executed without errors!


In [None]:
# Crash log of experiment part 2, filled per user
logs_2 = pd.DataFrame(columns=["user", "video number", "reason"])

# Go over every user in the csv
for user in notebook.tqdm(mails.itertuples()):
    # Run experiment part 2
    recommendations_2, watched_videos_2, crashes_2 = experiment_part_2(videos, user, 15)
            
    # Store the results
    recommendations_2.to_csv(f"recommendations_part_2_user_{user[0]}.csv")
    watched_videos_2.to_csv(f"watched_videos_part_2_user_{user[0]}.csv")
    
    # Log the crashes of this run (crashes_2), not the ones left over
    # from experiment part 1 (crashes)
    if len(crashes_2):
        logs_2 = pd.concat([logs_2, crashes_2], ignore_index=True)
    
    # NOTE(review): this stops after the first account, so the remaining
    # rows of the csv are never processed - confirm this is intended
    break
    
if len(logs_2):
    logs_2.to_csv("logs_experiment_2.csv")
else:
    print("Script executed without errors!")