In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Read the target URLs from disk: one entry per line, with surrounding
# whitespace (including the trailing newline) removed from each entry.
file: str = 'urls.txt'
with open(file, 'r') as f:
    urls = f.readlines()
webs = list(map(str.strip, urls))

In [3]:
# HTTP headers sent with every request. The User-Agent string identifies the
# client as a desktop Chrome browser.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
headers = {"User-Agent": USER_AGENT}

In [4]:
# Column layout of the feature table: one row per website, one column per
# extracted feature.
variables = [
    "Website",
    "Images",
    "Gifs",
    "Title Size",
    "Links",
    "Languages",
    "h1",
    "h2",
    "h3",
    "HTTPS",
    "Paragraphs",
    "Meta Description",
    "Meta Keywords",
    "JS Scripts",
    "Size KB",
    "Words",
    "Cookies",
    "MENA",
]

# Start from an empty frame with the full column set, then seed the
# "Website" column so that every URL gets its own row.
df = pd.DataFrame(columns=variables)
df["Website"] = webs

In [5]:
def get_numImages(soup):
    """Return how many ``<img>`` elements ``soup.find_all`` reports."""
    image_tags = soup.find_all('img')
    return len(image_tags)

def get_numGifs(soup):
    """Count the ``<img>`` elements whose ``src`` points at a GIF file.

    There is no ``<gif>`` HTML element, so searching for that tag name always
    yields zero. GIFs are embedded through ``<img>`` tags, so this inspects
    the ``src`` attribute of every image instead.

    Args:
        soup: Parsed document exposing ``find_all``; each returned image must
            support ``.get('src')``.

    Returns:
        int: Number of images whose ``src`` path ends in ``.gif``
        (case-insensitive, ignoring any query string or fragment).
    """
    gif_count = 0
    for img in soup.find_all('img'):
        # Images without a src attribute cannot be classified; treat as non-GIF.
        src = img.get('src') or ''
        # Drop "?query" and "#fragment" so "anim.gif?v=2" is still recognised.
        path = src.split('?', 1)[0].split('#', 1)[0]
        if path.lower().endswith('.gif'):
            gif_count += 1
    return gif_count

def is_https(url):
    """Tell whether ``url`` uses the HTTPS scheme.

    The check requires the full ``https://`` prefix, so a string that merely
    begins with the letters "https" (e.g. ``httpsfoo.com``) is not reported
    as secure. The comparison is case-insensitive.

    Args:
        url (str): The URL to inspect.

    Returns:
        bool: ``True`` when ``url`` starts with ``https://``.
    """
    return url.lower().startswith('https://')

def get_titleSize(soup):
    """Return the length of the page title in characters.

    Args:
        soup: Parsed document exposing a ``title`` attribute whose ``string``
            attribute holds the title text.

    Returns:
        int | None: Number of characters in the title, or ``None`` when the
        document has no title or the title has no single string value.
    """
    title = soup.title
    # Explicit checks replace a bare ``except`` so unrelated errors (and
    # KeyboardInterrupt) are no longer silently swallowed.
    if title is None or title.string is None:
        return None
    return len(title.string)

def get_numLinks(soup):
    """Return how many ``<a>`` elements ``soup.find_all`` reports."""
    anchor_tags = soup.find_all('a')
    return len(anchor_tags)

def get_languages(soup):
    """Return the length of the ``lang`` attribute of the ``<html>`` element.

    NOTE(review): the value is the number of characters in the attribute
    (e.g. 2 for "en", 5 for "en-US"), not a count of languages — confirm this
    is the intended feature for the "Languages" column.

    Args:
        soup: Parsed document exposing an ``html`` attribute that supports
            ``.get('lang')``.

    Returns:
        int: ``len`` of the ``lang`` attribute value, or 0 when the document
        has no ``<html>`` element or the element has no ``lang`` attribute.
    """
    html = soup.html
    # Explicit checks replace a bare ``except`` so unrelated errors are not
    # silently converted into a 0.
    if html is None:
        return 0
    lang = html.get("lang")
    if lang is None:
        return 0
    return len(lang)

def get_numH1(soup):
    """Return how many ``<h1>`` elements ``soup.find_all`` reports."""
    h1_tags = soup.find_all('h1')
    return len(h1_tags)

def get_numH2(soup):
    """Return how many ``<h2>`` elements ``soup.find_all`` reports."""
    h2_tags = soup.find_all('h2')
    return len(h2_tags)

def get_numH3(soup):
    """Return how many ``<h3>`` elements ``soup.find_all`` reports."""
    h3_tags = soup.find_all('h3')
    return len(h3_tags)

def get_Paragraphs(soup):
    """Return how many ``<p>`` elements ``soup.find_all`` reports."""
    paragraph_tags = soup.find_all('p')
    return len(paragraph_tags)

def has_metadescription(soup):
    """Report whether at least one ``<meta name="description">`` tag is found."""
    description_tags = soup.find_all('meta', attrs={'name': 'description'})
    return bool(description_tags)

def has_metakeywords(soup):
    """Report whether at least one ``<meta name="keywords">`` tag is found."""
    keyword_tags = soup.find_all('meta', attrs={'name': 'keywords'})
    return bool(keyword_tags)

def javascript_size(soup):
    """Return the total number of characters of inline JavaScript in the page.

    ``len()`` of a BeautifulSoup tag is the number of its child nodes, not the
    size of its text, so the script source is measured through each tag's
    ``string`` attribute instead.

    Args:
        soup: Parsed document exposing ``find_all``; each returned script must
            expose a ``string`` attribute (``None`` when it has no single
            text child, e.g. an external ``<script src=...>``).

    Returns:
        int: Sum of the character lengths of all inline script bodies.
        Scripts whose ``string`` is ``None`` contribute 0.
    """
    return sum(len(script.string or '') for script in soup.find_all('script'))

def num_words(soup):
    """Return the number of whitespace-separated tokens in ``soup.text``."""
    tokens = soup.text.split()
    return len(tokens)

In [6]:
# Position in `webs` from which sites are labelled MENA="TRUE".
# NOTE(review): this relies on the ordering of the URL list — confirm the
# boundary whenever the list is edited.
MENA_START_INDEX = 91

# Seconds to wait on a server before giving up; without a timeout a single
# unresponsive host would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Fetch every site once and record its features in the matching row(s) of
# `df`. Sites that cannot be reached or do not answer 200 are reported and
# skipped, leaving their feature columns empty.
for i, web in enumerate(webs):
    try:
        response = requests.get(web, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        # Only network/HTTP-level failures are treated as "could not connect";
        # programming errors and KeyboardInterrupt still propagate.
        print("Error: Could not connect to ", web)
        continue
    if response.status_code != 200:
        print("Error: ", response.status_code, " for ", web)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    features = {
        "Images": get_numImages(soup),
        "Gifs": get_numGifs(soup),
        "HTTPS": is_https(web),
        "Title Size": get_titleSize(soup),
        "Links": get_numLinks(soup),
        "Languages": get_languages(soup),
        "h1": get_numH1(soup),
        "h2": get_numH2(soup),
        "h3": get_numH3(soup),
        "Paragraphs": get_Paragraphs(soup),
        "Meta Description": has_metadescription(soup),
        "Meta Keywords": has_metakeywords(soup),
        # Size of the raw response body in kibibytes.
        "Size KB": len(response.content) / 1024,
        "JS Scripts": javascript_size(soup),
        "Words": num_words(soup),
        # True when the final response carries a Set-Cookie header.
        "Cookies": 'Set-Cookie' in response.headers,
        "MENA": "TRUE" if i >= MENA_START_INDEX else "FALSE",
    }

    # Build the row selector once instead of once per column.
    row_mask = df["Website"] == web
    for column, value in features.items():
        df.loc[row_mask, column] = value
    
    

Error:  403  for  https://www.surfer.com/
Error:  403  for  https://www.sportspromedia.com/
Error:  403  for  https://www.fei.org/
Error:  403  for  https://www.nascar.com/
Error: Could not connect to  https://www.worldsquash.org/
Error:  403  for  https://www.usatoday.com/sports/
Error: Could not connect to  https://www.kingfut.com/
Error: Could not connect to  https://www.gdnonline.com/


In [29]:
# Load the PageSpeed measurements and remove the 'Unnamed: 0' and 'index'
# columns, which are not needed downstream.
df2 = pd.read_csv('./PageSpeed/PageSpeed_final.csv')
df2 = df2.drop(columns=['Unnamed: 0', 'index'])

# Combine the scraped features (keyed by "Website") with the PageSpeed
# measurements (keyed by "Link"). The default inner join keeps only URLs
# present in both tables.
df3 = df.merge(df2, left_on='Website', right_on='Link')

In [30]:
# Discard every row that has at least one missing value, then remove the
# "Link" join key, which duplicates "Website" after the merge.
df3 = df3.dropna().drop(columns=['Link'])

In [31]:
# PageSpeed metric columns mapped to the unit suffix that is appended to the
# column name after conversion ('' means the column keeps its name).
variables_ps = {
    'LCP_m': 's', 'LCP_o': 's',
    'INP_m': 'ms', 'INP_o': 'ms',
    'FCP_m': 's', 'FCP_o': 's',
    'TTF_m': 's', 'TTF_o': 's',
    'CLS_m': '', 'CLS_o': '',
    'FID_m': 'ms', 'FID_o': 'ms',
}


def _leading_number(value):
    """Return the text before the first space of ``value`` as a float."""
    return float(str(value).split(' ')[0])


# Convert each metric to a plain float by keeping only the text before the
# first space.
for var in variables_ps:
    df3[var] = df3[var].apply(_leading_number)

# Record the unit in the column name, e.g. 'LCP_m' -> 'LCP_m(s)'. A single
# rename call replaces one in-place rename per column.
df3 = df3.rename(
    columns={var: var + f'({v})' for var, v in variables_ps.items() if v != ''}
)


In [33]:
# Persist the merged dataset; the DataFrame index is not written to the file.
df3.to_csv(path_or_buf='final_dataset.csv', index=False)