# Exploration: Domain Extractor for Financial-Related Terms
Author: Runting Shao

In [1]:
# Import Python packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import ahocorasick



In [2]:
# Load the combined social-media CSV and collect the distinct values of its
# "domain" column; these are the sites to crawl later on.
train_data = pd.read_csv('TCCSocialMediaData_combined_clean_emotes.csv')
unique_domain_train = pd.unique(train_data["domain"])

# Disabled: merging in the domains of a separate test split.
#test_data = pd.read_csv('Data/Cleaned_Data/TCCSocialMediaData_test_clean.csv')
#unique_domain_test = pd.unique(test_data.domain)
#l1 = [x for x in unique_domain_train]
#l2 = [x for x in unique_domain_test]
#unique_domain = np.unique(l1 + l2).tolist()

# Report how many distinct domains were found.
print(len(unique_domain_train))

2615


In [3]:
# Domains handed to the crawler: every distinct domain found in the training data.
domains = unique_domain_train

In [4]:
# Finance-related search terms grouped by category id. The ids become the
# suffixes of the result columns (sublink_<id>, adcontent_<id>). Terms are
# matched as substrings, so a stem such as "advertis" also hits longer words.
search_terms = {
    # Donate, Be a Patron, etc.
    1: ["donation", "donate", "patron"],
    # Shop, Shopping, Shop with us, etc.
    2: ["store", "shop"],
    # Subscriptions and memberships
    3: ["subscribe", "subscription", "membership"],
    # Advertise, Advertising, Advertisement
    4: ["advertis"],
    # Promotions and price cuts
    5: ["sale", "deal", "discount", "% off", "low price", "coupon"],
    # Free offers
    6: ["free", "no cost"],
    # Money words
    7: ["money", "cash", "dollar"],
    # Transaction verbs
    8: ["pay", "buy", "earn"],
    # Newsletter sign-ups
    9: ["newsletter"],
}

In [5]:
# Count how often the terms of one category occur in a text string, using the
# Aho-Corasick multi-pattern search.
def ahocorasickCount(terms, text):
    """Return the total number of matches of *terms* found in *text*.

    Every match reported by the automaton adds one, so a term that occurs
    several times is counted several times. No case folding is applied here;
    callers that want case-insensitive matching must normalise *text* first.

    Args:
        terms: iterable of search strings belonging to one category.
        text: the string to scan.

    Returns:
        The number of matches, or 0 when there is no usable term or no text.
    """
    # An automaton with no words cannot be searched (pyahocorasick raises on
    # iter() in that state), so skip building it when nothing can match.
    usable_terms = [term for term in terms if term]
    if not usable_terms or not text:
        return 0

    # Build the searcher for this category.
    searcher = ahocorasick.Automaton()
    for i, term in enumerate(usable_terms):
        searcher.add_word(term, i)
    searcher.make_automaton()

    # Each item yielded by iter() is one match of one term.
    return sum(1 for _ in searcher.iter(text))

In [6]:
class finicialTermsExtractor():
    """Crawl domain home pages and count finance-related terms in their links.

    Pages are fetched from ``http://www.<domain>`` and parsed with
    BeautifulSoup. Two counters then work on the parsed pages: ``checkSublink``
    for same-site text and ``checkAdContent`` for third-party link text.
    """

    def __init__(self, userAgent):
        """Remember the User-Agent string to send with every request."""
        self.userAgent = userAgent

    @staticmethod
    def _domainLabel(domain):
        """Return the part of *domain* before its first dot (oann.com -> oann)."""
        return domain.split('.', 1)[0]

    @staticmethod
    def _isInternalLink(href, domain_name):
        """Tell whether *href* is relative, an anchor, or mentions *domain_name*."""
        return href.startswith(('/', '#')) or domain_name in href

    @staticmethod
    def _emptyResult(search_terms):
        """Return a dict with one empty list per category of *search_terms*."""
        return {category: [] for category in search_terms}

    def htmlCrawler(self, domains):
        """Fetch and parse the home page of every domain.

        Input: domains - iterable of domain names (e.g. "oann.com")
        Output: 1. accessable_domain - dict mapping each reachable domain to
                   its BeautifulSoup-parsed HTML
                2. errors - list of domains that could not be fetched or parsed
        """
        errors = []
        accessable_domain = {}
        # The header must be named 'User-Agent' and passed by keyword:
        # the second positional argument of requests.get is `params`, which
        # would put the value into the query string and not the headers.
        headers = {'User-Agent': self.userAgent}
        print("Html Crawl Progress - Getting html for all domains:")
        for d in tqdm(domains):
            fulllink = "http://www." + d
            try:
                req = requests.get(fulllink, headers=headers, timeout=5)
                accessable_domain[d] = BeautifulSoup(req.text, "html.parser")
            except Exception:
                # Best-effort crawl: any failure just marks the domain as
                # inaccessible so the remaining domains are still processed.
                errors.append(d)
        return accessable_domain, errors

    def checkSublink(self, accessable_domains, search_terms):
        """Count search terms in each page's text and same-site link text.

        Input: 1. accessable_domains - dict of domain (key) to parsed html (value)
               2. search_terms - dict of category (key) to list of terms (value)
        Output: result - dict of category (key) to a list of counts, one count
                per domain in the iteration order of accessable_domains
        """
        result = self._emptyResult(search_terms)
        for domain in tqdm(accessable_domains):
            domain_name = self._domainLabel(domain)
            soup = accessable_domains.get(domain)
            # NOTE(review): the searched text starts with the full page text
            # (not lower-cased) and the link text is appended after it, so the
            # counts cover more than sublinks. Kept as-is to preserve existing
            # results - confirm whether only link text was intended.
            text_parts = [soup.get_text()]
            for link in soup.find_all('a'):
                href = link.get("href")
                # Keep links that have an href pointing to the same site and
                # that carry a plain text label.
                if href and link.string and self._isInternalLink(href, domain_name):
                    text_parts.append(link.string.lower())
            all_link_txt = "".join(text_parts)
            for category, terms in search_terms.items():
                # Number of term matches of this category for this domain.
                result[category].append(ahocorasickCount(terms, all_link_txt))
        return result

    def checkAdContent(self, accessable_domains, search_terms):
        """Count search terms in the text of links leading to other sites.

        The searched text is the lower-cased label of every third-party link
        plus the lower-cased alt text of images nested inside such links.

        Input: 1. accessable_domains - dict of domain (key) to parsed html (value)
               2. search_terms - dict of category (key) to list of terms (value)
        Output: result - dict of category (key) to a list of counts, one count
                per domain in the iteration order of accessable_domains
        """
        result = self._emptyResult(search_terms)
        for domain in tqdm(accessable_domains):
            # Domain name - domain: oann.com, domain name: oann
            domain_name = self._domainLabel(domain)
            soup = accessable_domains.get(domain)
            text_parts = []
            for link in soup.find_all('a'):
                href = link.get("href")
                # Only links with an href that does not belong to the domain.
                if not href or self._isInternalLink(href, domain_name):
                    continue
                if link.string:
                    text_parts.append(link.string.lower())
                # Image-only links often carry their wording in the alt text.
                for img in link.find_all('img', alt=True):
                    text_parts.append(img['alt'].lower())
            ad_txt = "".join(text_parts)
            for category, terms in search_terms.items():
                # Number of term matches of this category for this domain.
                result[category].append(ahocorasickCount(terms, ad_txt))
        return result

In [7]:
# Browser-like User-Agent string given to the extractor.
userAgent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53'
)
crawler = finicialTermsExtractor(userAgent=userAgent)
# Fetch and parse the home page of every domain; failures end up in `errors`.
accessable_domain, errors = crawler.htmlCrawler(domains=domains)

Html Crawl Progress - Getting html for all domains:


100%|████████████████████████████████████████████████████████████████████████████| 2615/2615 [1:16:21<00:00,  1.75s/it]


In [8]:
# Show which domains were fetched and parsed successfully.
print(accessable_domain.keys())






In [9]:
# Per-category term counts over each crawled page's text and same-site links.
result_sublink = crawler.checkSublink(
    accessable_domains=accessable_domain,
    search_terms=search_terms,
)

100%|██████████████████████████████████████████████████████████████████████████████| 2482/2482 [00:35<00:00, 69.80it/s]


In [10]:
# Per-category term counts over the text of links pointing to other sites.
result_adcontent = crawler.checkAdContent(
    accessable_domains=accessable_domain,
    search_terms=search_terms,
)

100%|█████████████████████████████████████████████████████████████████████████████| 2482/2482 [00:04<00:00, 555.16it/s]


In [11]:
# Tabulate the sublink counts: one "sublink_<category>" column per category,
# with the crawled domain as the leading column.
df_sublink = pd.DataFrame.from_dict(result_sublink).add_prefix("sublink_")
df_sublink.insert(loc=0, column='domain', value=accessable_domain.keys())
df_sublink.head()

Unnamed: 0,domain,sublink_1,sublink_2,sublink_3,sublink_4,sublink_5,sublink_6,sublink_7,sublink_8,sublink_9
0,tmz.com,0,6,10,2,18,2,14,17,11
1,forbes.com,0,28,11,3,5,2,60,21,17
2,msn.com,0,5,0,0,6,2,5,7,0
3,nbcnews.com,0,46,0,4,17,9,3,21,12
4,newsone.com,0,0,5,0,0,0,0,0,4


In [12]:
# Tabulate the ad-content counts: one "adcontent_<category>" column per
# category, with the crawled domain as the leading column.
df_adcontent = pd.DataFrame.from_dict(result_adcontent).add_prefix("adcontent_")
df_adcontent.insert(loc=0, column='domain', value=accessable_domain.keys())
df_adcontent.head()

Unnamed: 0,domain,adcontent_1,adcontent_2,adcontent_3,adcontent_4,adcontent_5,adcontent_6,adcontent_7,adcontent_8,adcontent_9
0,tmz.com,0,2,0,0,0,0,0,0,0
1,forbes.com,0,0,0,0,0,0,0,0,0
2,msn.com,0,0,0,1,0,0,0,0,0
3,nbcnews.com,0,0,0,1,0,0,0,3,0
4,newsone.com,0,0,0,0,0,0,0,0,0


In [13]:
# Persist both count tables as CSV files in the working directory.
df_sublink.to_csv(path_or_buf="data_sublink.csv")
df_adcontent.to_csv(path_or_buf="data_adcontent.csv")

In [14]:
# List the domains whose home page could not be fetched or parsed.
print(errors)

['seattletimes.com', 'lawenforcementtoday.com', 'cnsnews.com', 'washingtonpost.com', 'gopdailybrief.com', 'pittsburgh.cbslocal.com', 'bipartisanreport.com', 'politi.co', 'disrn.com', 'miamiherald.com', 'losangeles.cbslocal.com', 'newsobserver.com', 'thepatriotjournal.com', 'wtop.com', 'sonsoflibertymedia.com', '2020electioncenter.com', 'americanindependent.com', 'people.com', 'americanjournaldaily.com', 'charlotteobserver.com', 'mcclatchydc.com', 'wired.com', 'sacbee.com', 'usnews.com', 'currently.att.yahoo.com', 'chicago.cbslocal.com', 'heritage.org', 'newyork.cbslocal.com', 'kansascity.com', 'bearingarms.com', 'baltimore.cbslocal.com', 'fresnobee.com', 'mol.im', 'sacramento.cbslocal.com', 'news.sky.com', 'spectator.co.uk', 'modbee.com', 'theduran.com', 'philadelphia.cbslocal.com', 'star-telegram.com', 'gazettenet.com', 'forbiddenknowledgetv.net', 'idahostatesman.com', 'kentucky.com', 'georgiastarnews.com', 'jewishpress.com', 'hereistheevidence.com', 'thestate.com', 'boston.cbslocal.c