[https://neptune.ai/blog/web-scraping-and-knowledge-graphs-machine-learning]

In [1]:
! pip install wikipedia-api

Collecting wikipedia-api
  Using cached Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [6]:
import wikipediaapi
import pandas as pd
import concurrent.futures
from tqdm import tqdm

import spacy
import pandas as pd
import requests

In [7]:
def scrape_wikipedia(name_topic, verbose=True, user_agent='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'):
    """Scrape a Wikipedia page and every page it links to into a DataFrame.

    Parameters
    ----------
    name_topic : str
        Title of the root Wikipedia page.
    verbose : bool, default True
        When true, show a tqdm progress bar while linked pages are fetched.
    user_agent : str
        User-Agent header handed to ``wikipediaapi.Wikipedia``.
        NOTE(review): the default impersonates Googlebot; consider a
        descriptive agent that identifies this project instead.

    Returns
    -------
    pandas.DataFrame or None
        One row per kept page with columns ``page``, ``text``, ``link``,
        ``categories`` (with the ``Category:`` prefix removed) and ``topic``.
        Rows are dropped when their text is 20 characters or shorter, or when
        the page title starts with one of the excluded namespace names.
        ``None`` is returned (after printing a message) when the root page
        does not exist.
    """
    category_prefix = 'Category:'

    def link_to_wikipedia(link):
        """Fetch one linked page; return its record, or None if it is missing or the fetch fails."""
        try:
            page = api_wikipedia.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl, 'categories': list(page.categories.keys())}
        except Exception:
            # Best effort: one broken link must not abort the whole scrape.
            return None
        return None

    def strip_category_prefix(names):
        """Remove the leading 'Category:' from each category title, when present."""
        return [name[len(category_prefix):] if name.startswith(category_prefix) else name
                for name in names]

    api_wikipedia = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI, user_agent=user_agent)
    name_of_page = api_wikipedia.page(name_topic)
    if not name_of_page.exists():
        # Report the requested title rather than the repr of the page object.
        print('Page {} is not present'.format(name_topic))
        return None

    links_to_page = list(name_of_page.links.keys())
    progress = tqdm(desc='Scraped links', unit='', total=len(links_to_page)) if verbose else None
    # The root page is always the first record.
    origin = [{'page': name_topic, 'text': name_of_page.text, 'link': name_of_page.fullurl, 'categories': list(name_of_page.categories.keys())}]

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            links_future = {executor.submit(link_to_wikipedia, link): link for link in links_to_page}
            # Results arrive in completion order, so row order varies between runs.
            for future in concurrent.futures.as_completed(links_future):
                info = future.result()
                if info:
                    origin.append(info)
                if progress is not None:
                    progress.update(1)
    finally:
        # Close the bar even when a worker raised.
        if progress is not None:
            progress.close()

    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk')
    origin = pd.DataFrame(origin)
    # Per-row text length; len(origin['text']) would be the number of rows.
    has_enough_text = origin['text'].str.len() > 20
    # Plain title-prefix match against the namespace names listed above.
    in_excluded_namespace = origin['page'].str.startswith(namespaces, na=True)
    # Copy so the column assignments below do not write into a slice.
    origin = origin[has_enough_text & ~in_excluded_namespace].copy()
    origin['categories'] = origin['categories'].apply(strip_category_prefix)

    origin['topic'] = name_topic
    print('Scraped pages', len(origin))

    return origin

In [8]:
# Scrape the 'COVID 19' article together with every page it links to.
# Network-bound: the recorded run below fetched 2389 links in about 2m39s.
data_wikipedia = scrape_wikipedia('COVID 19')

Scraped links: 100%|██████████████████████████| 2389/2389 [02:39<00:00, 14.95/s]

Scraped pages 2138





In [11]:
# Persist the scraped pages. With pandas defaults the DataFrame index is
# written as an extra unnamed first column; the 'data/' directory is not
# created here, so it must already exist.
data_wikipedia.to_csv('data/scraped_data.csv')

In [15]:
# ! python -m spacy download en_core_web_sm

In [16]:
from spacy import displacy
from spacy.tokens import Span
from spacy.matcher import Matcher

# Shared spaCy pipeline: extract_entities and obtain_relation read this
# module-level `nlp` directly, so this cell must run before they are called.
nlp = spacy.load('en_core_web_sm')

In [17]:
import networkx as ntx
%matplotlib inline

In [19]:
def extract_entities(sents):
    """Extract a rough [subject, object] entity pair from a piece of text.

    The text is parsed with the module-level ``nlp`` pipeline and scanned
    token by token. Compound and modifier tokens seen before a subject or
    object are glued in front of it. Every subject/object token overwrites
    the previous one, so for multi-sentence input the last subject and the
    last object win.

    Parameters
    ----------
    sents : str
        Text to parse (intended to be a single sentence).

    Returns
    -------
    list of str
        ``[subject_phrase, object_phrase]``; either entry is ``""`` when no
        matching token was found.
    """
    # chunk 1: state carried across tokens
    enti_one = ""
    enti_two = ""

    dep_prev_token = ""  # dependency tag of previous token in sentence
    txt_prev_token = ""  # previous token in sentence

    prefix = ""
    modifier = ""

    for tokn in nlp(sents):
        # chunk 2: punctuation is skipped entirely (it does not even become
        # the "previous token")
        if tokn.dep_ == "punct":
            continue

        # check if token is a compound word or not
        if tokn.dep_ == "compound":
            prefix = tokn.text
            # add the current word to it if the previous word is 'compound'
            if dep_prev_token == "compound":
                prefix = txt_prev_token + " " + tokn.text

        # verify if token is a modifier or not
        if tokn.dep_.endswith("mod"):
            modifier = tokn.text
            # add it to the current word if the previous word is 'compound'
            if dep_prev_token == "compound":
                modifier = txt_prev_token + " " + tokn.text

        # chunk 3: subject. Use a substring test: `find(...) == True` only
        # matched labels with "subj" at index 1.
        if "subj" in tokn.dep_:
            enti_one = modifier + " " + prefix + " " + tokn.text
            prefix = ""
            modifier = ""

        # chunk 4: object. Same substring test, so a bare "obj" label
        # (index 0) is recognised as well as "dobj"/"pobj".
        if "obj" in tokn.dep_:
            enti_two = modifier + " " + prefix + " " + tokn.text

        # chunk 5: remember this token for the next iteration
        dep_prev_token = tokn.dep_
        txt_prev_token = tokn.text

    return [enti_one.strip(), enti_two.strip()]

In [21]:
# Run the entity extractor over the first ten scraped texts, with a progress bar.
pairs_of_entities = [
    extract_entities(article_text)
    for article_text in tqdm(data_wikipedia['text'][:10])
]

100%|███████████████████████████████████████████| 10/10 [00:15<00:00,  1.55s/it]


In [23]:
# Display the extracted [subject, object] pairs.
pairs_of_entities

[['2019 Novel Coronavirus', 'COVID syndrome assessment clinics'],
 ['= = =', '= = Notes'],
 ['who', 'Kary Cookies'],
 ['Tablighi thousands', 'Official Nizamuddin Markaz'],
 ['relevant SARS that', 'time'],
 ['Bara Kahu', 'pandemic Malaysia Pakistan'],
 ['stock  market', 'pandemic stock market Russia'],
 ['chimeric protein candidate', 'clinical  trials'],
 ['medical  who', '= = ='],
 ['August Internet method', 'Oral Education education']]

In [33]:
def obtain_relation(sent):
    """Return the relation phrase found around the ROOT token of the text.

    The text is parsed with the module-level ``nlp`` pipeline and matched
    against one pattern: the ROOT token, optionally followed by a
    preposition, an agent and an adjective. The span of the last match
    reported by the matcher is used.

    Parameters
    ----------
    sent : str
        Text to parse (intended to be a single sentence).

    Returns
    -------
    str
        Text of the matched span, or ``""`` when nothing matched (for
        example, for empty input).
    """
    doc = nlp(sent)

    matcher = Matcher(nlp.vocab)

    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher.add("matching_1", [pattern])

    # Keep the match list under its own name instead of rebinding `matcher`.
    matches = matcher(doc)
    if not matches:
        # No ROOT-based match: indexing the empty list would raise IndexError.
        return ""

    _, start, end = matches[-1]
    span = doc[start:end]

    return span.text

In [35]:
# NOTE: ['text'][1] selects the row whose index *label* is 1, not the second
# row; scrape_wikipedia filters rows without resetting the index, so this
# label is only present if that row survived the filter.
# The whole article text is passed here, not a single sentence.
obtain_relation(data_wikipedia['text'][1])

'='

In [36]:
# Show the raw article text stored under index label 1.
data_wikipedia['text'][1]

'A Tablighi Jamaat religious conference that took place at the "Masjid Jamek Sri Petaling" in Kuala Lumpur\'s Sri Petaling district between 27 February to 1 March 2020 became a COVID-19 super-spreader event with more than 3,300 cases being linked to the event. By 19 May 2020, the Malaysian Director-General of Health Noor Hisham Abdullah confirmed that 48% of the country\'s COVID-19 cases (3,347) had been linked to the Kuala Lumpur Tablighi Jamaat cluster. Additionally, nearly 10% of attendees were overseas visitors, causing COVID-19 to spread to other countries in Southeast Asia. On 8 July 2020, this cluster was declared over by the Ministry of Health.Although much more widespread, the Tabligh event was not the first wave of coronavirus in Malaysia.\n\nEvent\nBetween 27 February and 1 March 2020, the Tablighi Jamaat movement organised an international conference at the "Masjid Jamek Sri Petaling" in Sri Petaling, Kuala Lumpur in Malaysia.  The religious gathering was attended by approx