In [119]:
# !pip install cdx_toolkit

In [2]:
import cdx_toolkit
from bs4 import BeautifulSoup
import requests
import json
import io
import gzip
from textblob import TextBlob

In [48]:
def search_domain(domain, indexes=("2020-29", "2020-24")):
    """Query the Common Crawl URL index for captures matching ``domain``.

    Args:
        domain: Target sent as the ``url`` query parameter. Matching is done
            with ``matchType=domain``.
        indexes: Crawl identifiers (the ``YYYY-WW`` part of
            ``CC-MAIN-YYYY-WW``) to query, in order. Defaults to the two
            crawls this notebook was written against.

    Returns:
        list[dict]: One parsed JSON record per index line, accumulated over
        all queried crawls. A crawl whose request fails or answers with a
        non-200 status contributes no records (a message is printed instead).
    """
    record_list = []
    print ("Trying target domain: {}".format(domain))
    for index in indexes:
        print ("Trying index {}".format(index))
        cc_url = "https://index.commoncrawl.org/CC-MAIN-%s-index" % index
        # Let requests build the query string so the target is URL-encoded.
        query = {"url": domain, "matchType": "domain", "output": "json"}
        try:
            response = requests.get(cc_url, params=query, timeout=60)
        except requests.RequestException as exc:
            print("Request for index {} failed: {}".format(index, exc))
            continue
        if response.status_code != 200:
            # Report the failure instead of skipping the crawl silently.
            print("Index {} returned HTTP {}; skipping.".format(index, response.status_code))
            continue
        # The body is JSON Lines: one record per non-empty line.
        records = [json.loads(line) for line in response.content.splitlines() if line.strip()]
        record_list.extend(records)
        print("Added {} results.".format(len(records)))
    print("Found a total of {} hits.".format(len(record_list)))
    return record_list

In [49]:
# Query the Common Crawl index for the target and keep the list of parsed index records.
# NOTE(review): search_domain queries with matchType=domain, so the "/news" path does not
# appear to restrict the results (the first record shown below is the site root) — confirm.
record = search_domain("https://www.henryford.com/news")

Trying target domain: https://www.henryford.com/news
Trying index 2020-29
Added 12109 results.
Found a total of 12109 hits.


In [50]:
# Inspect the first index record to see which fields each hit carries.
record[0]

{'urlkey': 'com,henryford)/',
 'timestamp': '20200709183308',
 'status': '200',
 'url': 'https://www.henryford.com/',
 'mime': 'text/html',
 'digest': 'L67VRN4WNCAVYQFG4RYQDCGPZZK3SWOQ',
 'charset': 'UTF-8',
 'offset': '918916517',
 'filename': 'crawl-data/CC-MAIN-2020-29/segments/1593655900614.47/warc/CC-MAIN-20200709162634-20200709192634-00208.warc.gz',
 'length': '18857',
 'mime-detected': 'text/html',
 'languages': 'eng'}

In [51]:
# !pip install boto3

In [52]:
def download_page(record):
    """Fetch one archived capture from Common Crawl and return its body.

    Args:
        record: An index record (dict) providing ``'filename'``, ``'offset'``
            and ``'length'`` — the WARC file and the byte range of the capture.

    Returns:
        str: The third section of the capture (what follows the WARC header
        and the HTTP header), decoded as UTF-8 with undecodable bytes
        replaced. An empty string is returned when the download fails, the
        payload is not valid gzip, or the capture does not have all three
        sections.
    """
    import zlib  # only needed to recognise corrupt-gzip errors below

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # Fetch over HTTPS so no S3 credentials are needed.
    prefix = 'https://data.commoncrawl.org/'

    # The Range header asks for just this capture's bytes, not the whole WARC file.
    try:
        resp = requests.get(
            prefix + record['filename'],
            headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
            timeout=60,
        )
    except requests.RequestException as exc:
        print("Download failed for {}: {}".format(record['filename'], exc))
        return ""

    # 206 is the expected reply to a range request; anything else that is not
    # a plain 200 is an error page, not gzip data.
    if resp.status_code not in (200, 206):
        print("Download of {} returned HTTP {}.".format(record['filename'], resp.status_code))
        return ""

    # Each capture is stored as its own gzip member.
    try:
        with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as f:
            data = f.read()
    except (OSError, EOFError, zlib.error) as exc:
        print("Could not decompress capture from {}: {}".format(record['filename'], exc))
        return ""

    # Split on raw bytes first so that a page that is not valid UTF-8 is still
    # returned instead of being discarded by a failed decode.
    parts = data.strip().split(b'\r\n\r\n', 2)
    if len(parts) != 3:
        return ""
    return parts[2].decode("utf-8", errors="replace")

In [53]:
# Download the archived body for every index hit, keeping the order of `record`.
res = [download_page(hit) for hit in record]

In [78]:
import re

In [96]:
# Paragraphs of the first page, normalised to lowercase words, that mention
# (hydroxy)chloroquine AND covid in the same paragraph. The covid test has to be
# made against the paragraph text itself; testing a constant string is always true.
[
    text
    for text in (
        re.sub(r'\W+', ' ', p.text).strip().lower()
        for p in BeautifulSoup(res[0], "html.parser").find_all('p')
    )
    if any(drug in text for drug in ['hydroxychloroquine', 'chloroquine']) and 'covid' in text
]

['detroit treatment with hydroxychloroquine cut the death rate significantly in sick patients hospitalized with covid 19 and without']

In [131]:
# Attempt to find the page URL in the parsed HTML: take the first <body class="henryford">
# and look for a <body> nested inside it. The output below shows that nothing is found.
# NOTE(review): each index record already carries the page address under 'url' (see record[0]).
[i.find('body') for i in BeautifulSoup(res[0], "html.parser").find_all('body', {'class':'henryford'}, limit=1)]

[None]

### I could not recover the URL of each individual page from the parsed HTML. Does the BeautifulSoup output not include it?

Instead, for the domains we have, I report the total number of webpages on which the misinformation is present.

In [144]:
# Count the downloaded pages that look like they promote (hydroxy)chloroquine for COVID.
count = 0
for r in res:
    # Parse each page once; every <p> is normalised to lowercase words.
    text_list = [
        re.sub(r'\W+', ' ', p.text).strip().lower()
        for p in BeautifulSoup(r, "html.parser").find_all('p')
    ]
    # A page is a candidate only if some paragraph names the drug AND covid.
    # The covid test must look at the paragraph itself, not at a constant string.
    is_candidate = any(
        any(drug in t for drug in ['hydroxychloroquine', 'chloroquine']) and 'covid' in t
        for t in text_list
    )
    if not is_candidate:
        continue

    print("Possible webpage with misinformation found!")
    text_combined = ' '.join(text_list)

    print("Confirming if this was the case")
    # Heuristic used by this notebook: positive overall polarity is read as
    # the page promoting the treatment.
    if TextBlob(text_combined).sentiment.polarity > 0:
        print("Confirmed that this was indeed malicious.")
        count += 1
print("For the domains we have, the total number of webpages that have the misinformation present:",count)

Possible webpage with misinformation found!
Confirming if this was the case
Confirmed that this was indeed malicious.
Possible webpage with misinformation found!
Confirming if this was the case
Confirmed that this was indeed malicious.
Possible webpage with misinformation found!
Confirming if this was the case
Confirmed that this was indeed malicious.
For the domains we have, the total number of webpages that have the misinformation present: 3


In [143]:
# !pip install textblob

### TextBlob polarity ranges from -1 to +1, so a value above 0 indicates a positive sentiment

When a webpage has a positive sentiment, it is treated as promoting hydroxychloroquine or chloroquine for the treatment of COVID.
The same code can be run on other websites to check them for this misinformation.

Thus I provide the count of webpages having misinformation in these domains

## Main Code

Running for multiple domains:

In [None]:
def count_misinformation_pages(pages):
    """Count pages that appear to promote (hydroxy)chloroquine for COVID.

    A page is a candidate when at least one of its ``<p>`` paragraphs
    (normalised to lowercase words) mentions hydroxychloroquine/chloroquine
    and covid together. A candidate is counted when the TextBlob polarity of
    all its paragraphs joined together is greater than 0.

    Args:
        pages: Iterable of HTML strings.

    Returns:
        int: Number of candidate pages with positive polarity.
    """
    count = 0
    for html in pages:
        # Parse each page once and reuse the paragraphs for both checks.
        text_list = [
            re.sub(r'\W+', ' ', p.text).strip().lower()
            for p in BeautifulSoup(html, "html.parser").find_all('p')
        ]
        # The covid test must be made against the paragraph itself; testing a
        # constant string is always true and filters nothing.
        is_candidate = any(
            any(drug in t for drug in ['hydroxychloroquine', 'chloroquine']) and 'covid' in t
            for t in text_list
        )
        if not is_candidate:
            continue

        print("Possible webpage with misinformation found!")
        text_combined = ' '.join(text_list)

        print("Confirming if this was the case")
        if TextBlob(text_combined).sentiment.polarity > 0:
            print("Confirmed that this was indeed malicious.")
            count += 1
    return count


for d in ["https://www.henryford.com/news","https://www.hss.edu/","https://www.sciencenews.org/article"]:
    record = search_domain(d)
    res = [download_page(hit) for hit in record]
    count = count_misinformation_pages(res)
    print("For the domain ",d," the total number of webpages that have the misinformation present:",count)