In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import regex as re
from jellyfish import jaro_similarity
from tqdm import tqdm

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from time import time
from time import sleep
from time import strftime
from time import gmtime
from selenium.common.exceptions import NoSuchElementException

In [None]:
# Name of the dataset folder under ../datasets/ that the cells below read from and write to.
filename = "covid_philippines"

In [None]:
# Read the per-video table. The "Unnamed: 0" column is discarded
# (presumably an index column saved with the CSV -- confirm against the writer).
path = f"../datasets/{filename}/videos.csv"
video_df = pd.read_csv(path).drop(columns="Unnamed: 0")

In [None]:
# Read the per-channel source-link table. The "Unnamed: 0" column is discarded
# (presumably an index column saved with the CSV -- confirm against the writer).
path = f"../datasets/{filename}/source_links.csv"
sc_df = pd.read_csv(path).drop(columns="Unnamed: 0")

---

In [None]:
# Automated
def _collect_backlink_targets(df, mode):
    """Build the result skeleton and the ordered lookup list for ``get_backlinks``.

    Args:
        df: DataFrame with a ``video_id`` column (``mode='video'``) or with
            ``channel_id``, ``channel_name`` and the five link columns
            (``mode='external'``).
        mode: ``'video'`` or ``'external'``.

    Returns:
        ``(backlinks, targets)`` where ``backlinks`` maps column name to a
        ``{row position: value}`` dict (identifier columns pre-filled, scraped
        columns empty) and ``targets`` is a list of ``(column, row position,
        url)`` tuples in lookup order.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values.
    """
    if mode == 'video':
        video_ids = df['video_id'].tolist()
        backlinks = {'video_id': dict(enumerate(video_ids)), 'backlinks': {}}
        targets = [
            ('backlinks', row, "https://www.youtube.com/watch?v=" + video_id)
            for row, video_id in enumerate(video_ids)
        ]
    elif mode == 'external':
        link_cols = ['LinkedIn', 'Wiki', 'Website', 'Twitter', 'Facebook']
        backlinks = {
            'channel_id': dict(enumerate(df['channel_id'].tolist())),
            'channel_name': dict(enumerate(df['channel_name'].tolist())),
        }
        for col in link_cols:
            backlinks[col] = {}
        # Column-major order: every row of one link column, then the next column.
        targets = [
            (col, row, url)
            for col in link_cols
            for row, url in enumerate(df[col].tolist())
        ]
    else:
        # Fail before a browser is launched instead of hitting a NameError later.
        raise ValueError(
            f"Unknown mode {mode!r}; expected 'video' or 'external'."
        )
    return backlinks, targets


def _query_backlink_count(driver, url, captcha_passed, captcha_wait):
    """Submit one URL to the backlink checker form and read the reported count.

    Args:
        driver: Selenium driver already pointed at the checker page.
        url: The URL to type into the form.
        captcha_passed: Whether a CAPTCHA was already handled in this session;
            when True the CAPTCHA step is skipped.
        captcha_wait: Seconds to pause after ticking the CAPTCHA checkbox so
            the puzzle can be solved manually.

    Returns:
        ``(count, captcha_passed)``: the parsed integer count and the updated
        CAPTCHA flag.

    Raises:
        ValueError: if the result text is not an integer (after removing commas).
        Selenium exceptions (e.g. timeouts) propagate to the caller.
    """
    # Wait for the tool's iframe and for its loading overlay to disappear.
    WebDriverWait(driver, 200).until(
        EC.frame_to_be_available_and_switch_to_it((By.ID, "hothtools"))
    )
    sleep(1)
    WebDriverWait(driver, 200).until_not(
        EC.visibility_of_element_located((By.CLASS_NAME, "hoth-loader__container"))
    )

    # Poll until the URL textbox exists. Only "element missing" is retried, so a
    # dead browser or Ctrl-C surfaces instead of looping forever.
    while True:
        try:
            targeturl = driver.find_element(By.ID, "targeturl")
            break
        except NoSuchElementException:
            sleep(10)

    # Clear whatever is in the textbox, then type the new URL.
    targeturl.click()
    targeturl.send_keys(Keys.CONTROL + "A")
    targeturl.send_keys(Keys.BACKSPACE)
    targeturl.send_keys(url)

    # Select 'This Exact URL'
    driver.find_element(By.ID, "mode").click()
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/select/option[3]"
    ).click()

    # Submit
    driver.find_element(
        By.XPATH, "/html/body/div[2]/div/div/div/form/div[2]/button"
    ).click()

    sleep(5)

    if not captcha_passed:
        # Best effort: a CAPTCHA may or may not be shown, so any failure here is
        # ignored and the result lookup below proceeds regardless.
        try:
            checkbox_clicked = False
            for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
                try:
                    driver.switch_to.frame(iframe)
                    driver.find_element(By.ID, "recaptcha-anchor").click()
                    checkbox_clicked = True
                    break
                except Exception:
                    # Not the CAPTCHA frame; go back out and try the next one.
                    driver.switch_to.default_content()

            if checkbox_clicked:
                # Giving enough time to manually solve CAPTCHA puzzle
                sleep(captcha_wait)

                driver.switch_to.default_content()
                driver.switch_to.frame("hothtools")
                driver.find_element(By.XPATH, "//*[@id='submit']").click()
                captcha_passed = True
                sleep(3)
        except Exception:
            pass

    # Get external backlinks
    driver.switch_to.default_content()
    driver.switch_to.frame("hothtools")
    try:
        WebDriverWait(driver, 200).until(
            EC.element_to_be_clickable((By.ID, "targeturl"))
        )
        count_text = driver.find_element(
            By.XPATH, "/html/body/div[2]/div/div/div[2]/div[2]/div/div/div[1]/div[2]"
        ).text
    except NoSuchElementException:
        # No result element on the page is recorded as zero backlinks.
        count_text = '0'

    # Leave the iframe so the next lookup starts from the top-level document.
    driver.switch_to.default_content()
    return int(count_text.replace(',', '')), captcha_passed


def get_backlinks(df, mode, captcha_wait=15):
    """Scrape backlink counts from thehoth.com's backlink checker with Firefox.

    One lookup is made per URL. Missing URLs (NaN/None) are recorded as 0
    without touching the browser.

    Args:
        df: Input table. ``mode='video'`` needs a ``video_id`` column;
            ``mode='external'`` needs ``channel_id``, ``channel_name``,
            ``LinkedIn``, ``Wiki``, ``Website``, ``Twitter`` and ``Facebook``.
        mode: ``'video'`` (look up each video's YouTube watch URL) or
            ``'external'`` (look up each link column of each channel).
        captcha_wait: Seconds to pause for manually solving a CAPTCHA the first
            time one is encountered.

    Returns:
        dict mapping column name to ``{row position: value}``, suitable for
        ``pd.DataFrame.from_dict``. Identifier columns are copied from ``df``;
        the remaining columns hold integer backlink counts.

    Raises:
        ValueError: for an unsupported ``mode`` or a non-numeric result text.
        Selenium exceptions from the browser session propagate; the browser is
        shut down in every case.
    """
    backlinks, targets = _collect_backlink_targets(df, mode)

    driver = webdriver.Firefox()
    try:
        driver.get("https://www.thehoth.com/backlinks-checker/")

        captcha_passed = False

        pbar = tqdm(total=len(targets))
        pbar.set_description("Getting backlinks...")
        start_time = time()
        try:
            for col, row, url in targets:
                if pd.isna(url):
                    # pd.isna also catches NaNs that are not the np.nan singleton.
                    backlinks[col][row] = 0
                else:
                    backlinks[col][row], captcha_passed = _query_backlink_count(
                        driver, url, captcha_passed, captcha_wait
                    )
                pbar.update(1)
        finally:
            pbar.close()

        elapsed_time = time() - start_time
        print('Execution time:', strftime("%H:%M:%S", gmtime(elapsed_time)))
    finally:
        # quit() ends the whole WebDriver session, even when a lookup failed.
        driver.quit()

    return backlinks

In [None]:
# Scrape one backlink count per video in video_df (launches a Firefox session).
video_backlinks = get_backlinks(video_df, mode='video')

In [None]:
# Scrape backlink counts for each external link column of every channel in sc_df
# (launches a Firefox session).
source_backlinks = get_backlinks(sc_df, mode='external')

In [None]:
# Turn the {column: {row: value}} mapping into a table and preview it.
vbl_df = pd.DataFrame.from_dict(video_backlinks, orient="columns")
vbl_df.head()

In [None]:
# Turn the {column: {row: value}} mapping into a table and preview it.
sbl_df = pd.DataFrame.from_dict(source_backlinks, orient="columns")
sbl_df.head()

In [None]:
# Persist the video backlink table next to the other files of this dataset.
path = f"../datasets/{filename}/video_backlinks.csv"
vbl_df.to_csv(path)

In [None]:
# Persist the source backlink table next to the other files of this dataset.
path = f"../datasets/{filename}/source_backlinks.csv"
sbl_df.to_csv(path)

---

In [None]:
# Reload the saved video backlink counts and summarise their distribution.
path = f"../datasets/{filename}/video_backlinks.csv"
vbl_df = pd.read_csv(path).drop(columns="Unnamed: 0")
vbl_df.loc[:, "backlinks"].describe()

In [None]:
# Reload the saved source backlink counts and summarise each link column,
# one row per column.
path = f"../datasets/{filename}/source_backlinks.csv"
sbl_df = pd.read_csv(path).drop(columns="Unnamed: 0")
sbl_df.loc[:, ['LinkedIn', 'Wiki', 'Website', 'Twitter', 'Facebook']].describe().transpose()