## Importing necessary packages

In [1]:
# !pip install warcio
import re
import sys
import math
import nltk
import requests
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from warcio.archiveiterator import ArchiveIterator

In [2]:
# Lemmatization of word
WNL = nltk.WordNetLemmatizer()

# Stop words in english language
eng_stop_words = stopwords.words('english')

# Function to extract text from a URL and clean the same
def data_extraction(url):
    
    # Extracting the text from a URL
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    
    # Removing tab space(s) and next line indicators
    cleaned_text = re.sub(r'[\n\t]', '', soup.text).split(" ")
    
    # Removing punctuations
    cleaned_text = [re.sub(r'[^\w\s]', '', word) for word in cleaned_text if word != ""]
    
    # Remmvoing stop words
    cleaned_text = [word for word in cleaned_text if word not in eng_stop_words]
    
    # Lemmatization of words
    cleaned_text = " ".join([WNL.lemmatize(word.lower()) for word in cleaned_text])
    
    return cleaned_text



# Function to identify cosine similarity between docs
def get_cosine(vec1, vec2):
    
    # Numerator value calculation
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    
    # Denominator value calculation
    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [3]:
# List of URL's for articles relating to COVID 19 economic impact
url_list = ["https://www.census.gov/library/stories/2021/03/initial-impact-covid-19-on-united-states-economy-more-widespread-than-on-mortality.html",
            "https://www.brookings.edu/research/explaining-the-economic-impact-of-covid-19-core-industries-and-the-hispanic-workforce/",
            "https://www.brookings.edu/research/ten-facts-about-covid-19-and-the-u-s-economy/",
            "https://www.bbc.com/news/business-51706225",
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7162753/"]

# String to concatenate and store all the text from above list of URLs after data cleaning
final_doc = ""

for url in url_list:
    final_doc = final_doc + data_extraction(url)

# Word Counter    
WORD = re.compile(r"\w+")
t1 = Counter(WORD.findall(final_doc))

In [4]:
regex = re.compile("(youtu\.be/|youtube\.com/(watch\?(.*\&)?v=|(embed|v)/))([^?&\"'>]+)")

file_name = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-10/segments/1581875141396.22/warc/CC-MAIN-20200216182139-20200216212139-00000.warc.gz"

stream = None
try:
    if file_name.startswith("http://") or file_name.startswith("https://"):
        stream = requests.get(file_name, stream=True).raw
    else:
        stream = open(file_name, "rb")

    final_url_output = []    

    for record in ArchiveIterator(stream):
    
        if record.rec_type == "warcinfo":
            continue

        if not ".com/" in record.rec_headers.get_header("WARC-Target-URI"):
            continue

        contents = (record.content_stream().read().decode("utf-8", "replace"))
    
        if contents != '':
            soup = BeautifulSoup(contents, 'html.parser')
            cleaned_text_2 = re.sub(r'[\n\t]', '', soup.text).split(" ")
            cleaned_text_2 = [re.sub(r'[^\w\s]', '', word) for word in cleaned_text_2 if word != ""]
            cleaned_text_2 = [word for word in cleaned_text_2 if word not in eng_stop_words]
            cleaned_text_2 = " ".join([WNL.lemmatize(word.lower()) for word in cleaned_text_2])
            t2 = Counter(WORD.findall(cleaned_text_2))

            cosine_similarity = get_cosine(t1, t2)

            if cosine_similarity > 0.1:
                final_url_output.append(record.rec_headers.get_header('WARC-Target-URI'))

            if len(final_url_output) == 1000:
                break
                
except:
    None

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [5]:
final_url_output[:10]

['http://1057news.com/2018/09/26/13/13/01/roane-county-experiencing-power-outages-and-flooded-roads/',
 'http://107.housedems.com/article/executive-budget-recommendation-summary',
 'http://123thebig3.com/2019/10/',
 'http://2fwww.openbooktoronto.com/ava_homa/main',
 'http://2fwww.openbooktoronto.com/edward_carson/main',
 'http://365daysofme.com/2015/05/19/the-chicken-chronicles-32/',
 'http://4js.com/2018-french-housing-corp-pursues-digital-transformation/?s=',
 'http://academagia.invisionzone.com/topic/3252-a-bittersweet-symphony-the-tale-of-manchester-and-captain-falshaw/?tab=comments',
 'http://actionfigurejunkies.com/pet-sematary-final-trailer/',
 'http://adamsenger07.mihanblog.com/']

In [8]:
import pandas as pd
URL_dict = {'URL_List': final_url_output}
df = pd.DataFrame(URL_dict) 
df.to_csv("Final_Output.csv", index=False)