In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import regex as re
from jellyfish import jaro_similarity
from tqdm import tqdm

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from time import time
from time import sleep
from time import strftime
from time import gmtime
from selenium.common.exceptions import NoSuchElementException

In [3]:
# The dataset name doubles as the folder name and the CSV file stem.
filename = "covid_vaccine"
path = f"datasets/{filename}/{filename}.csv"

# The CSV carries a leftover index column ("Unnamed: 0"); discard it on load.
video_df = pd.read_csv(path).drop(columns="Unnamed: 0")

In [4]:
# Plain Python list of the video IDs, in dataset order.
video_id_list = video_df["video_id"].to_list()

In [5]:
# Preview the watch URLs of the first five videos.
for video_id in video_id_list[:5]:
    print(f"https://www.youtube.com/watch?v={video_id}")

https://www.youtube.com/watch?v=im3otpqYAiQ
https://www.youtube.com/watch?v=uiwjAj0zfKQ
https://www.youtube.com/watch?v=LfmhYVCCGhc
https://www.youtube.com/watch?v=SU_SSfiYtfM
https://www.youtube.com/watch?v=7MAlEYqWUTk


---

In [6]:
# Automated
def get_video_backlinks(video_id_list):
    """Look up the backlink count of each YouTube video via TheHoth's checker.

    Opens a Firefox session on https://www.thehoth.com/backlinks-checker/ and,
    for every video ID, submits the video's watch URL in the form and reads the
    backlink figure from the result page.  The first time a reCAPTCHA shows up
    the checkbox is clicked and the function pauses so the puzzle can be solved
    by hand; once that succeeded the CAPTCHA step is skipped for later videos.

    Args:
        video_id_list: Sequence of YouTube video IDs (must support ``len()``
            and iteration).

    Returns:
        dict: ``{"video_id": {i: id}, "backlinks": {i: count}}`` where ``i`` is
        the position of the video in ``video_id_list`` and ``count`` is an int.
        The layout can be passed straight to ``pd.DataFrame.from_dict``.

    Raises:
        ValueError: If the text read from the result page is not an integer.
        Exception: Selenium errors (for example an explicit wait that times
            out) are propagated unchanged.

    The browser is always closed before returning or raising.
    """
    start_time = time()

    backlinks = {
        "video_id": dict(enumerate(video_id_list)),
        "backlinks": {},
    }
    captcha_passed = False

    driver = webdriver.Firefox()
    try:
        driver.get("https://www.thehoth.com/backlinks-checker/")

        pbar = tqdm(total=len(video_id_list))
        pbar.set_description("Getting backlinks...")
        try:
            # Give the landing page a moment to finish loading.
            sleep(5)

            for index, video_id in enumerate(video_id_list):
                _submit_backlink_query(driver, video_id)

                if not captcha_passed:
                    captcha_passed = _try_pass_captcha(driver)

                backlinks["backlinks"][index] = _read_backlink_count(driver)
                pbar.update(1)

                # Leave the tool frame so the next iteration can wait for it again.
                driver.switch_to.default_content()
        finally:
            pbar.close()
    finally:
        # Always release the browser, even when a lookup fails midway.
        driver.quit()

    elapsed_time = time() - start_time
    print('Execution time:', strftime("%H:%M:%S", gmtime(elapsed_time)))

    return backlinks


def _submit_backlink_query(driver, video_id):
    """Enter the watch URL of ``video_id`` into the checker form and submit it."""
    # Wait for the tool iframe and make it the active browsing context.
    WebDriverWait(driver, 200).until(
        EC.frame_to_be_available_and_switch_to_it((By.ID, "hothtools"))
    )

    sleep(1)

    # Wait until the loading overlay is no longer visible.
    WebDriverWait(driver, 200).until_not(
        EC.visibility_of_element_located((By.CLASS_NAME, "hoth-loader__container"))
    )

    # Poll for the URL textbox; only a missing element is retried so that
    # fatal driver errors and Ctrl-C are not swallowed by the retry loop.
    while True:
        try:
            targeturl = driver.find_element(By.ID, "targeturl")
            break
        except NoSuchElementException:
            sleep(10)

    # Replace whatever is in the textbox with the video link.
    targeturl.click()
    targeturl.send_keys(Keys.CONTROL + "A")
    targeturl.send_keys(Keys.BACKSPACE)
    targeturl.send_keys(f"https://www.youtube.com/watch?v={video_id}")

    # Select 'This Exact URL'
    driver.find_element(By.ID, "mode").click()
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/select/option[3]"
    ).click()

    # Submit
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/button"
    ).click()

    sleep(5)


def _try_pass_captcha(driver):
    """Click the reCAPTCHA checkbox if one is shown and resubmit the form.

    Best effort: returns True only when the checkbox was clicked and the form
    was submitted again; any failure (including "no CAPTCHA on the page")
    yields False so the caller simply carries on.
    """
    try:
        clicked = False
        for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                driver.switch_to.frame(iframe)
                driver.find_element(By.ID, "recaptcha-anchor").click()
                clicked = True
                break
            except Exception:
                # Go back to the tool frame the iframe handles were found in,
                # so the remaining candidates can still be switched into.
                driver.switch_to.default_content()
                driver.switch_to.frame("hothtools")

        if not clicked:
            return False

        # Giving enough time to manually solve CAPTCHA puzzle
        sleep(15)

        driver.switch_to.default_content()
        driver.switch_to.frame("hothtools")
        driver.find_element(By.XPATH, "//*[@id='submit']").click()
        sleep(3)
        return True
    except Exception:
        return False


def _read_backlink_count(driver):
    """Return the backlink figure shown on the result page (0 if it is absent)."""
    driver.switch_to.default_content()
    driver.switch_to.frame("hothtools")
    try:
        # The textbox becoming clickable again signals that the results are in.
        WebDriverWait(driver, 200).until(
            EC.element_to_be_clickable((By.ID, "targeturl"))
        )
        text = driver.find_element(
            By.XPATH, "/html/body/div[2]/div/div/div[2]/div[2]/div/div/div[1]/div[2]"
        ).text
    except NoSuchElementException:
        text = '0'

    # Figures may contain thousands separators, e.g. "1,234".
    return int(text.replace(',', ''))

In [7]:
# Manual
def set_video_backlinks(video_id_list, video_backlinks):
    """Prompt for the backlink count of each video and store it in a dict.

    Prints short instructions, then shows each video's watch URL as an input
    prompt.  The answer may contain thousands separators ("1,234"); an answer
    that is not a whole number is rejected and the same video is asked again,
    so a typo does not abort the whole session.

    Args:
        video_id_list: Iterable of YouTube video IDs to ask about.
        video_backlinks: Dict updated in place, mapping video ID -> int count.

    Returns:
        None.
    """
    print("Copy the YouTube link next to the textbox and find the # of backlinks from the following website:")
    print("https://www.thehoth.com/backlinks-checker/")
    print("------------------------------")

    for video_id in video_id_list:
        prompt = f"https://www.youtube.com/watch?v={video_id} "
        while True:
            answer = input(prompt)
            try:
                num = int(answer.replace(',', ''))
            except ValueError:
                print(f"'{answer}' is not a whole number, please try again.")
                continue
            video_backlinks[video_id] = num
            break

In [8]:
# Manual
def set_site_backlinks(site_list, site_backlinks):
    """Unfinished manual-entry helper for site backlinks.

    Currently only prints the instruction banner and walks ``site_list``;
    nothing is read from the user and ``site_backlinks`` is left untouched.
    """
    banner = (
        "Copy the YouTube link next to the textbox and find the # of backlinks from the following website:",
        "https://www.thehoth.com/backlinks-checker/",
        "------------------------------",
    )
    print(*banner, sep="\n")

    # Stub: every entry is visited, but no count is collected yet.
    for _site in site_list:
        continue

In [9]:
# Long-running, semi-interactive step: this drives a Firefox window and pauses
# for a manual CAPTCHA solve. The recorded run below took about 20 minutes
# for 150 videos.
backlinks = get_video_backlinks(video_id_list)

Getting backlinks...: 100%|██████████| 150/150 [19:19<00:00,  7.73s/it]


Execution time: 00:19:47


In [11]:
# Turn the {column: {row: value}} result into a table and preview the first rows.
bl_df = pd.DataFrame(backlinks)
bl_df.head()

Unnamed: 0,video_id,backlinks
0,im3otpqYAiQ,26
1,uiwjAj0zfKQ,19
2,LfmhYVCCGhc,14
3,SU_SSfiYtfM,1
4,7MAlEYqWUTk,4


In [12]:
# Persist the backlink table next to the source dataset (index column included).
path = f"datasets/{filename}/video_backlinks.csv"
bl_df.to_csv(path)