In [1]:
%cd /content/drive/MyDrive/tl_assess

/content/drive/MyDrive/tl_assess


In [2]:
%set_env PYTHONPATH=$PYTHONPATH:/content/drive/MyDrive/tl_assess
!echo $PYTHONPATH

env: PYTHONPATH=$PYTHONPATH:/content/drive/MyDrive/tl_assess
$PYTHONPATH:/content/drive/MyDrive/tl_assess


In [3]:
!pip install textblob warcio sentence-transformers langdetect

Collecting warcio
[?25l  Downloading https://files.pythonhosted.org/packages/24/eb/060b7e1c76abf24692784d5cf9c52ec05ff21249c88515d7f03c676434db/warcio-1.7.4-py2.py3-none-any.whl (40kB)
[K     |████████████████████████████████| 40kB 4.4MB/s 
[?25hCollecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f5/5a/6e41e8383913dd2ba923cdcd02be2e03911595f4d2f9de559ecbed80d2d3/sentence-transformers-0.3.9.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 5.9MB/s 
[?25hCollecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 17.3MB/s 
Collecting transformers<3.6.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████

In [4]:
import re
import io
import sys
import gzip
import pickle
import requests
import traceback
import langdetect
from tqdm import tqdm
from langdetect import detect
from bs4 import BeautifulSoup
from textblob import TextBlob
from warcio.archiveiterator import ArchiveIterator
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
title_re = re.compile("<title>(.+?)</title>")

url_regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

In [6]:
def get_titles_from_warc(url):
    resp = requests.get(url, stream=True)

    for record in ArchiveIterator(resp.raw, arc2warc=True):
        if record.rec_type == 'warcinfo':
            continue
        
        if re.match(url_regex, record.rec_headers.get_header("WARC-Target-URI")) is None:
            continue

        elif record.rec_type is not None and record.rec_type == 'response':
            if record.http_headers is not None and  record.http_headers.get_header('Content-Type') is not None and record.http_headers.get_header('Content-Type') == 'text/html':
                html_content = record.content_stream().read().decode("utf-8", "replace")
                if html_content is not None:
                    page_uri = record.rec_headers.get_header('WARC-Target-URI')
                    if title_re.search(html_content) is not None:
                        title = title_re.search(html_content).group(1)
                        try:
                            if detect(title) == 'en' and ('covid' in title.lower() or 'corona' in title.lower() or 'pandemic' in title.lower()):
                                yield title, page_uri
                        except langdetect.lang_detect_exception.LangDetectException:
                            # traceback.print_exc()
                            continue

In [7]:
search_str = 'Economic impact of Covid-19'
relevent_uri = list()
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

threshold = 0.5

100%|██████████| 1.24G/1.24G [01:08<00:00, 18.3MB/s]


In [8]:
warc_list = [
    'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-50/warc.paths.gz',
    'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/warc.paths.gz',
    # 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-40/warc.paths.gz',
    # 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-34/warc.paths.gz',
    # 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-29/warc.paths.gz',
    # 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-24/warc.paths.gz',
    # 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-16/warc.paths.gz'
]

In [None]:
for warc_all_uri in warc_list:
    try:
        web_response = requests.get(warc_all_uri, stream=True)
        gz_file = web_response.content

        f = io.BytesIO(gz_file)
        with gzip.GzipFile(fileobj=f) as fh:
            for incomplete_uri in fh:
                incomplete_uri = incomplete_uri.decode().replace('\n', '')
                warc_uri = f'https://commoncrawl.s3.amazonaws.com/{incomplete_uri}'
                # print(f'Extracting from: {warc_uri}')
                for title, page_uri in get_titles_from_warc(warc_uri):
                    sentences = [
                        title,
                        search_str
                    ]
                    sen_embeddings = model.encode(sentences)
                    cos_sim = cosine_similarity(sen_embeddings)
                    if cos_sim[0][1] > threshold:
                        print(page_uri)
                        relevent_uri.append(page_uri)
                
                if len(relevent_uri) > 1000:
                    break
    except:
        with open('relevant_uri.pkl', 'wb') as f:
            pickle.dump(relevent_uri, f)
            traceback.print_exc()
        sys.exit(0)

with open('relevant_uri.pkl', 'wb') as f:
    pickle.dump(relevent_uri, f)

https://gartic.com.br/Covid20_
https://infectionrank.org/coronavirus/united-states/florida/charlotte-county/
https://www.batteryenergy.com.au/news/battery-energy-update-covid-19
https://teletype.in/@unmesh20/peXZN-27d
https://www.bozeman-lodge.com/senior-living/mt/bozeman/newsroom/weekly-covid-19-update
https://pw.lacounty.gov/covid19-outdoor-permits/
https://www.gardeniacottagecornwall.co.uk/en/2061403/covid-19
https://www.sinksflowershop.com/covid-19-update
http://electronics-trading.com/COVID-19.htm
https://fossencamping.se/bokningsregler.html
https://www.pittlawpc.com/news/What-You-Need-to-Know-About-Sick-Leave-and-FMLA-During-COVID-19_AE395.html?view=4GR22
http://enghunan.gov.cn/hneng/News/Localnews/202011/t20201121_13963639.html
http://www.yizheng21.com/resources/technical-blog/130-an-update-on-covid-19
https://covid-19.ideas2it.com/
http://covidresearchtrials.com/11072020/output/covid/drug/drug409.html
https://www.happymed.tv/the-wall-street-journal-cum-se-transmite-virusul-covi