In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import regex as re
from jellyfish import jaro_similarity
from tqdm import tqdm

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from time import sleep

In [3]:
# Dataset location: datasets/<name>/<name>.csv
filename = "covid_philippines"
path = f"datasets/{filename}/{filename}.csv"

# The CSV carries a leftover "Unnamed: 0" column, which is discarded on load.
video_df = pd.read_csv(path).drop(columns="Unnamed: 0")

In [4]:
# Peek at the first five rows to sanity-check the loaded columns.
video_df.head(n=5)

Unnamed: 0,video_id,video_title,description,video_dop,view_count,like_count,comment_count,channel_id,video_transcript
0,aLZ85hb4wjE,Normally crowded streets of Philippine capital...,Subscribe to our YouTube channel for free here...,2020-04-02,325340,3291,619,UC4SUWizzKc1tptprBkWjX2Q,
1,sYI97jv-pZg,"PH records 2,725 new COVID cases from Dec. 12 ...",The World Tonight: The daily average of the Ph...,2023-12-18,2329,15,3,UCvi6hEzLM-Z_unKPSuuzKvg,
2,3YFpjgIQqEo,WATCH: DOH Usec. Tayag on rise of COVID-19 cas...,Dateline Philippines: Karmina Constantino talk...,2023-12-07,9812,81,17,UCvi6hEzLM-Z_unKPSuuzKvg,
3,dIsaz_XlmTw,DOH logs 18 cases of JN.1 COVID-19 variant in PH,The Department of Health says it has already d...,2023-12-24,38328,89,81,UCvRAX-ujvZ0eTMLGG2vki9w,the Department of Health on Sunday said it ha...
4,DWxIvQlpJK8,Metro Manila to be placed on lockdown due to c...,Subscribe: https://www.youtube.com/@Rappler/\n...,2020-03-12,107280,750,19,UCdnZdQxYXnbN4uWJg96oGxw,


In [5]:
# Plain Python list of every video id, in DataFrame row order.
video_id_list = video_df["video_id"].to_list()

In [6]:
# Print the watch URL of the first five videos as a quick visual check.
for video_id in video_id_list[:5]:
    print(f"https://www.youtube.com/watch?v={video_id}")

https://www.youtube.com/watch?v=aLZ85hb4wjE
https://www.youtube.com/watch?v=sYI97jv-pZg
https://www.youtube.com/watch?v=3YFpjgIQqEo
https://www.youtube.com/watch?v=dIsaz_XlmTw
https://www.youtube.com/watch?v=DWxIvQlpJK8


---

In [7]:
# Automated
def _submit_video(driver, video_id):
    """Fill the checker form with the video's watch URL and submit it."""
    # The checker form lives inside the "hothtools" iframe.
    WebDriverWait(driver, 300).until(
        EC.frame_to_be_available_and_switch_to_it((By.ID, "hothtools"))
    )
    WebDriverWait(driver, 300).until(
        EC.element_to_be_clickable((By.ID, "targeturl"))
    )

    # Clear whatever the textbox holds from the previous query, then type the URL.
    targeturl = driver.find_element(By.ID, "targeturl")
    targeturl.click()
    targeturl.send_keys(Keys.CONTROL + "A")
    targeturl.send_keys(Keys.BACKSPACE)
    targeturl.send_keys(f"https://www.youtube.com/watch?v={video_id}")

    # Select 'This Exact URL'
    driver.find_element(By.ID, "mode").click()
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/select/option[3]"
    ).click()

    # Submit
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/button"
    ).click()


def _pass_captcha(driver):
    """Tick the reCAPTCHA checkbox, pause for a manual solve, then resubmit.

    Raises an exception when none of the iframes contains the checkbox.
    """
    ticked = False
    for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
        try:
            driver.switch_to.frame(iframe)
            driver.find_element(By.ID, "recaptcha-anchor").click()
            ticked = True
            break
        except Exception:
            # Not the CAPTCHA frame: step back out before trying the next one.
            driver.switch_to.default_content()

    if not ticked:
        raise RuntimeError("No CAPTCHA iframe found.")

    # Giving enough time to manually solve CAPTCHA puzzle
    sleep(15)

    driver.switch_to.default_content()
    driver.switch_to.frame("hothtools")
    driver.find_element(By.XPATH, "//*[@id='submit']").click()
    sleep(3)


def _parse_backlink_count(raw):
    """Turn the scraped counter text into an int, falling back to 0."""
    if isinstance(raw, int):
        return raw
    try:
        return int(str(raw).strip().replace(",", ""))
    except ValueError:
        # Empty or non-numeric text is treated like a failed lookup.
        return 0


def _read_backlink_count(driver):
    """Read the backlink counter from the results page; 0 when it cannot be read."""
    driver.switch_to.default_content()
    driver.switch_to.frame("hothtools")
    try:
        # The textbox becoming clickable again is used as the "results loaded" signal.
        WebDriverWait(driver, 300).until(
            EC.element_to_be_clickable((By.ID, "targeturl"))
        )
        raw = driver.find_element(
            By.XPATH, "/html/body/div[2]/div/div/div[2]/div[2]/div/div/div[1]/div[2]"
        ).text
    except Exception:
        raw = 0
    finally:
        # Leave the driver at the top-level document for the next iteration.
        driver.switch_to.default_content()
    return _parse_backlink_count(raw)


def get_backlinks(video_id_list):
    """Scrape the backlink count of each YouTube video from TheHoth's checker.

    Opens a Firefox window on https://www.thehoth.com/backlinks-checker/,
    submits every video's watch URL in turn and reads the counter from the
    results page. Until a CAPTCHA has been passed once, each submission is
    followed by an attempt to tick the reCAPTCHA checkbox and a 15-second pause
    so the puzzle can be solved by hand.

    Parameters
    ----------
    video_id_list : list of str
        YouTube video ids, processed in order.

    Returns
    -------
    dict
        ``{position: {video_id: count}}`` where ``position`` is the 0-based
        position of the video in ``video_id_list``. A count that cannot be
        read or parsed is recorded as 0. If the run is cut short (Selenium
        error or keyboard interrupt), the entries gathered so far are returned.
    """
    backlinks = {}
    captcha_passed = False

    driver = webdriver.Firefox()
    try:
        driver.get("https://www.thehoth.com/backlinks-checker/")
        driver.maximize_window()

        try:
            for index, video_id in enumerate(video_id_list):
                _submit_video(driver, video_id)
                sleep(5)

                if not captcha_passed:
                    try:
                        _pass_captcha(driver)
                        captcha_passed = True
                    except Exception:
                        # Best effort: no CAPTCHA page was shown (or it could
                        # not be handled), so go straight to the results.
                        pass

                backlinks[index] = {video_id: _read_backlink_count(driver)}
        except (Exception, KeyboardInterrupt) as error:
            # Keep whatever was collected so the manual pass can resume from it.
            print("Error raise by Selenium Webdriver. Ending process and saving data.")
            print(f"Cause: {error!r}")
    finally:
        # Always release the browser; a failed shutdown must not lose the data.
        try:
            driver.quit()
        except Exception as error:
            print(f"Could not close the browser cleanly: {error!r}")

    return backlinks

In [8]:
# Manual
def set_backlinks(video_id_list, backlinks):
    """Prompt for each video's backlink count and store it in ``backlinks``.

    For every id in ``video_id_list`` the watch URL is shown as the input
    prompt; the number typed in is stored as ``backlinks[video_id]``. An answer
    that is not a whole number (thousands separators are tolerated) is rejected
    and the same video is asked for again instead of aborting the whole pass.

    Parameters
    ----------
    video_id_list : list of str
        YouTube video ids to ask about, in order.
    backlinks : dict
        Mutated in place; receives one ``video_id -> int`` entry per video.

    Returns
    -------
    None
    """
    print("Copy the YouTube link next to the textbox and find the # of backlinks from the following website:")
    print("https://www.thehoth.com/backlinks-checker/")
    print("------------------------------")

    for video_id in video_id_list:
        prompt = f"https://www.youtube.com/watch?v={video_id} "
        while True:
            answer = input(prompt).strip().replace(",", "")
            try:
                num = int(answer)
            except ValueError:
                # A typo should not throw away the rest of the manual session.
                print(f"'{answer}' is not a whole number, please try again.")
                continue
            break
        backlinks[video_id] = num

In [9]:
# Automated pass over every video id; the result is keyed by list position,
# i.e. {index: {video_id: count}}.
backlinks = get_backlinks(video_id_list)

In [None]:
# Resume manually from where the automated pass stopped.
# get_backlinks() stores entries as {position: {video_id: count}}, whereas
# set_backlinks() stores {video_id: count}; recover the last processed video id
# from whichever shape the most recent entry has.
if backlinks:
    last_key, last_value = list(backlinks.items())[-1]
    accomplished_until = next(iter(last_value)) if isinstance(last_value, dict) else last_key
    last_index = video_id_list.index(accomplished_until)
else:
    # Nothing was collected, so the manual pass starts from the first video.
    accomplished_until = None
    last_index = 0

# The slice starts at last_index, so the most recently recorded video is
# prompted for again.
# NOTE(review): confirm whether this should be last_index + 1.
set_backlinks(video_id_list[last_index:], backlinks)

In [11]:
# Show the backlink counts collected so far.
backlinks

{0: {'5DvMPgoKZmM': 0}}