In [1]:
!pip install wikipedia-api

Collecting wikipedia-api
  Downloading https://files.pythonhosted.org/packages/ef/3d/289963bbf51f8d00cdf7483cdc2baee25ba877e8b4eb72157c47211e3b57/Wikipedia-API-0.5.4.tar.gz
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.5.4-cp36-none-any.whl size=13462 sha256=2ed2d6e5f0dae16ed0c22f7e916d71d523ff80b538b4a973a16dd2c2d2d5cca0
  Stored in directory: /root/.cache/pip/wheels/bf/40/42/ba1d497f3712281b659dd65b566fc868035c859239571a725a
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.5.4


In [2]:
import wikipediaapi  # pip install wikipedia-api
import pandas as pd
import concurrent.futures
from tqdm import tqdm

In [3]:
def wiki_scrape(topic_name, verbose=True):
    """Scrape a Wikipedia page and every page it links to into a DataFrame.

    Args:
        topic_name: Title of the seed Wikipedia page.
        verbose: When True, show a tqdm progress bar while the linked pages
            are fetched.

    Returns:
        A pandas DataFrame with the columns 'page', 'text', 'link',
        'categories' and 'topic' (one row per page kept after filtering),
        or None when the seed page does not exist.
    """
    def wiki_link(link):
        """Fetch one linked page; return its record dict, or None if unavailable."""
        try:
            page = wiki_api.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            # Best effort: a single unreadable link must not abort the scrape.
            pass
        return None

    wiki_api = wikipediaapi.Wikipedia(language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic_name)
    if not page_name.exists():
        print('page {} does not exist'.format(topic_name))
        return
    page_links = list(page_name.links.keys())
    progress = tqdm(desc='Links Scraped', unit='', total=len(page_links)) if verbose else None
    sources = [{'page': topic_name, 'text': page_name.text, 'link': page_name.fullurl,
                'categories': list(page_name.categories.keys())}]
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(wiki_link, link) for link in page_links]
            for future in concurrent.futures.as_completed(futures):
                data = future.result()
                if progress is not None:
                    progress.update(1)
                if data:
                    sources.append(data)
    finally:
        if progress is not None:
            progress.close()

    blacklist = ('Template', 'Help:', 'Category:', 'Portal:', 'Wikipedia:', 'Talk:')
    sources = pd.DataFrame(sources)
    # Per-row text length; len(sources['text']) would be the row count, which
    # turns the filter into a constant and never drops short pages.
    long_enough = sources['text'].str.len() > 20
    blacklisted = sources['page'].str.startswith(blacklist)
    # .copy() so the column assignments below do not write into a slice.
    sources = sources[long_enough & ~blacklisted].copy()

    category_prefix = 'Category:'

    def strip_prefix(names):
        # Remove the namespace prefix only where it is actually present.
        return [name[len(category_prefix):] if name.startswith(category_prefix) else name
                for name in names]

    sources['categories'] = sources['categories'].apply(strip_prefix)
    sources['topic'] = topic_name
    print ('Wikipedia pages scraped:', len(sources))
    return sources

In [4]:
# Scrape the 'COVID' page plus every page it links to. The recorded run below
# fetched 1593 links in roughly four minutes.
wiki_data = wiki_scrape('COVID')

Links Scraped: 100%|██████████| 1593/1593 [04:07<00:00,  6.45/s]

Wikipedia pages scraped: 1263





In [8]:
# Plain-text preview of the scraped frame; the recorded output elides the
# middle columns and rows.
print(wiki_data)

                                                page  ...  topic
0                                              COVID  ...  COVID
1     2020 Indonesia large-scale social restrictions  ...  COVID
2             2020 Russia–Saudi Arabia oil price war  ...  COVID
3                                2020 Global Famines  ...  COVID
4          2020 United States anti-lockdown protests  ...  COVID
...                                              ...  ...    ...
1259                                   Zhang Wenhong  ...  COVID
1260                                   Zhong Nanshan  ...  COVID
1261                                        Zoonosis  ...  COVID
1262                                Łukasz Szumowski  ...  COVID
1263                                      Zika virus  ...  COVID

[1264 rows x 5 columns]


In [12]:
# Persist only the 'text' column of the scraped pages to file_name.csv.
wiki_data.to_csv('file_name.csv',columns=['text'])


In [5]:
# Display the article-text column of the scraped frame.
wiki_data.text

0       Coronavirus disease 2019 (COVID-19) is an infe...
1       Large-scale social restrictions or LSSR (Indon...
2       Beginning in mid-April 2020, there were protes...
3       The 2020 Global Famines, also known as the cor...
4       The Russia–Saudi Arabia oil price war of 2020 ...
                              ...                        
1258    Zhong Nanshan  (Chinese: 钟南山; born 20 October ...
1259    Zika virus (ZIKV)  (pronounced  or ) is a memb...
1260    Zhang Wenhong (Chinese: 张文宏, born 27 August 19...
1261    A zoonosis (plural zoonoses, or zoonotic disea...
1262    Łukasz Jan Szumowski (born 3 June 1972 in Wars...
Name: text, Length: 1263, dtype: object

In [20]:
pip install neuralcoref --no-binary neuralcoref

Collecting neuralcoref
[?25l  Downloading https://files.pythonhosted.org/packages/0c/40/8db3db763077fe80b71859f57731261aeb03cc624635f97a3bcfe55ab37b/neuralcoref-4.0.tar.gz (368kB)
[K     |▉                               | 10kB 18.9MB/s eta 0:00:01[K     |█▊                              | 20kB 1.6MB/s eta 0:00:01[K     |██▋                             | 30kB 2.1MB/s eta 0:00:01[K     |███▌                            | 40kB 2.3MB/s eta 0:00:01[K     |████▍                           | 51kB 1.9MB/s eta 0:00:01[K     |█████▎                          | 61kB 2.1MB/s eta 0:00:01[K     |██████▏                         | 71kB 2.3MB/s eta 0:00:01[K     |███████                         | 81kB 2.6MB/s eta 0:00:01[K     |████████                        | 92kB 2.7MB/s eta 0:00:01[K     |████████▉                       | 102kB 2.5MB/s eta 0:00:01[K     |█████████▊                      | 112kB 2.5MB/s eta 0:00:01[K     |██████████▋                     | 122kB 2.5MB/s eta 0:00:0

In [31]:
pip install spacy



In [32]:
import spacy

In [33]:
print(spacy.__version__) 



2.2.4


In [40]:
import pandas as pd
import re
import spacy
import neuralcoref


  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
100%|██████████| 40155833/40155833 [00:04<00:00, 9000977.61B/s] 


In [41]:
# Load the small English pipeline and attach NeuralCoref to it, so that
# Doc._.coref_resolved (read by entity_pairs when coref=True) is available.
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)


def entity_pairs(text, coref=True):
    """Extract (subject, relation, object) triples from free text.

    The text is split into sentences. Only sentences with exactly one subject
    and exactly one object dependency are used; each yields at most one row.

    Args:
        text: Raw text to analyse.
        coref: When True, the text is first replaced by ``doc._.coref_resolved``.
            This requires the module-level ``nlp`` pipeline to provide that
            extension (e.g. via ``neuralcoref.add_to_pipe(nlp)``).

    Returns:
        A pandas DataFrame with the columns 'subject', 'relation', 'object',
        'subject_type' and 'object_type'. Rows containing an empty string are
        dropped.
    """
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    doc = nlp(text)
    if coref:
        doc = nlp(doc._.coref_resolved)  # resolve coreference clusters
    # .text instead of the deprecated Span.string; identical after strip().
    sentences = [span.text.strip() for span in doc.sents]

    ent_pairs = []
    for sentence in sentences:
        sent = nlp(sentence)
        # Merge entities and noun chunks into single tokens so each node of
        # the triple is one token; filter_spans removes overlapping spans.
        spans = spacy.util.filter_spans(list(sent.ents) + list(sent.noun_chunks))
        with sent.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)

        dep = [tok.dep_ for tok in sent]
        n_objects = dep.count('obj') + dep.count('dobj')
        n_subjects = dep.count('subj') + dep.count('nsubj')
        if n_objects != 1 or n_subjects != 1:
            continue

        for tok in sent:
            if tok.dep_ not in ('obj', 'dobj'):  # identify object nodes
                continue
            subjects = [w for w in tok.head.lefts
                        if w.dep_ in ('subj', 'nsubj')]  # identify subject nodes
            if not subjects:
                continue
            # identify relationship by root dependency
            roots = [w for w in tok.ancestors if w.dep_ == 'ROOT']
            if roots:
                root = roots[0]
                relation = root
                # add adposition or particle to relationship; the bounds
                # check avoids IndexError when the root ends the sentence
                if root.i + 1 < len(sent) and root.nbor(1).pos_ in ('ADP', 'PART'):
                    relation = ' '.join((str(root), str(root.nbor(1))))
            else:
                relation = 'unknown'
            subject, subject_type = refine_ent(subjects[0], sent)
            obj, object_type = refine_ent(tok, sent)
            ent_pairs.append([str(subject), str(relation), str(obj),
                              str(subject_type), str(object_type)])

    filtered_ent_pairs = [row for row in ent_pairs
                          if not any(str(x) == '' for x in row)]
    pairs = pd.DataFrame(filtered_ent_pairs, columns=['subject',
                         'relation', 'object', 'subject_type',
                         'object_type'])
    print('Entity pairs extracted:', str(len(filtered_ent_pairs)))
    return pairs


def refine_ent(ent, sent):
    """Clean up a subject/object token and report its entity type.

    Args:
        ent: The (possibly merged) token to refine.
        sent: The parsed sentence that contains ``ent``; only its length is
            used, to bound the look-ahead for numeric entities.

    Returns:
        A ``(ent, ent_type)`` tuple. ``ent`` is a string when it was rewritten,
        otherwise the original token; ``ent_type`` is the token's entity
        label, or 'NOUN_CHUNK' when the token carries no entity label.
    """
    unwanted_tokens = (
        'PRON',  # pronouns
        'PART',  # particle
        'DET',  # determiner
        'SCONJ',  # subordinating conjunction
        'PUNCT',  # punctuation
        'SYM',  # symbol
        'X',  # other
        )
    ent_type = ent.ent_type_  # get entity type
    if ent_type == '':
        ent_type = 'NOUN_CHUNK'
        # Re-parse the chunk text so the words inside it can be filtered
        # individually.
        ent = ' '.join(str(t.text) for t in nlp(str(ent))
                       if t.pos_ not in unwanted_tokens and not t.is_stop)
    elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and str(ent).find(' ') == -1:
        # A lone number/ordinal says little by itself: extend it with the
        # words that follow, up to the next verb or punctuation mark.
        words = []
        for offset in range(len(sent) - ent.i):
            neighbour = ent.nbor(offset)
            if neighbour.pos_ in ('VERB', 'PUNCT'):
                break
            words.append(str(neighbour))
        # Assigned after the loop so the expansion is kept even when the
        # sentence ends before any verb or punctuation is met.
        ent = ' '.join(words).strip()
    return ent, ent_type


In [None]:
import spacy
import neuralcoref
# Smoke test for NeuralCoref on a two-sentence example.
# Load the model by its package name, as the other cells do, rather than
# through the 'en' shortcut link, which only works where that link exists.
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)
doc = nlp('My sister has a dog. She loves him.')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [8]:
import spacy


# Named-entity demo: run the small English pipeline over one scraped article
# and print every entity as a (text, label) pair.
nlp = spacy.load("en_core_web_sm")

text = wiki_data.loc[1, 'text']  # article stored under index label 1
tagged_text = nlp(text)

extracted_entities = [
    (entity.text, entity.label_)
    for entity in tagged_text.ents
]
print(extracted_entities)

[('LSSR', 'ORG'), ('Indonesian', 'NORP'), ('Sosial Berskala Besar', 'PERSON'), ('PSBB', 'ORG'), ('Indonesia', 'GPE'), ('the Ministry of Health', 'ORG'), ('20 June 2020', 'DATE'), ('Jakarta', 'GPE'), ('West Java', 'LOC'), ('three', 'CARDINAL'), ('East Nusa', 'PERSON'), ('Tenggara', 'GPE'), ('North Sulawesi', 'GPE'), ('Papua', 'ORG'), ('West Papua', 'PERSON'), ('PSBB', 'ORG'), ('Bali', 'ORG'), ('North Sumatra', 'PERSON'), ('PSBB', 'ORG'), ('first', 'ORDINAL'), ('Indonesia', 'GPE'), ('2 March 2020', 'DATE'), ('two', 'CARDINAL'), ('Depok', 'GPE'), ('West Java', 'GPE'), ('15 March', 'DATE'), ('117', 'CARDINAL'), ('Joko Widodo', 'PERSON'), ('Indonesians', 'NORP'), ('Jakarta', 'GPE'), ('Banten', 'GPE'), ('West Java', 'LOC'), ('the following day', 'DATE'), ('26 March', 'DATE'), ('Dedy Yon Supriyono', 'PERSON'), ('Tegal', 'GPE'), ('first', 'ORDINAL'), ('Central Java', 'ORG'), ('Ganjar Pranowo', 'PERSON'), ('31 March 2020', 'DATE'), ('Jokowi', 'PERSON'), ('the Ministry of Health', 'ORG'), ('Tera