# Selection based on Wikipedia Events

In [1]:
import os
import re
import pke
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
# English stop words from NLTK, handed to the YAKE keyword extractor.
stoplist = stopwords.words('english')

# Keyword-extraction settings: n-gram size for candidates and number of keyphrases kept.
ngram = 3
top_n_keywords = 15

# please define your own elastic_curl_command (replace XXXnytcorpus_commandXXX if you use ElasticSearch)
# NOTE(review): the "¥" characters look like mis-encoded shell line continuations ("\") — confirm before running.
elastic_curl_command = r"""curl -u XXXnytcorpus_commandXXX?scroll=10m&size=25' ¥ -H 'Content-Type: application/json' ¥ --data-raw '{"query": {"simple_query_string":{"query":"query_keywords_string","fields" : ["body"]}}}'"""

In [2]:
def yake_keyword_extraction(sentence_text,stoplist,ngram=3,top_n_keywords=15):
    """Extract keyphrases from a text with pke's unsupervised YAKE model.

    Args:
        sentence_text: Text the keyphrases are extracted from.
        stoplist: Stop words assigned to the extractor.
        ngram: Value passed as ``n`` to ``candidate_selection``.
        top_n_keywords: Number of best candidates requested from ``get_n_best``.

    Returns:
        list: The first element of every tuple returned by ``get_n_best``
        (the keyphrase), in the order the extractor returned them.
    """
    yake = pke.unsupervised.YAKE()
    # NOTE(review): some pke releases take the stoplist as an argument of
    # load_document/candidate_selection and may reset this attribute — confirm
    # against the installed pke version.
    yake.stoplist = stoplist

    # Load the raw text as an English document without normalization.
    yake.load_document(input=sentence_text, language='en', normalization=None)

    # Pick the candidates, then score them with a context window of 2 words
    # and without stemming.
    yake.candidate_selection(n=ngram)
    yake.candidate_weighting(window=2, use_stems=False)

    # Ask for the best candidates; 0.8 is the redundancy threshold handed to
    # get_n_best for removing near-duplicate keyphrases.
    scored_phrases = yake.get_n_best(n=top_n_keywords, threshold=0.8)

    # Keep only the phrase of each (phrase, score) tuple.
    return [phrase_and_score[0] for phrase_and_score in scored_phrases]

def return_es_result_or(curl_command_string,key_w):
    """Run an OR query over ``key_w`` through a curl command and parse the hits.

    Every keyword is wrapped in escaped double quotes and the quoted keywords
    are joined with ``|``. The resulting string is substituted into the
    command template, the command is executed through the shell, and the first
    line of its output is scanned for document id, score and publication date.

    Args:
        curl_command_string: Shell command template. The literal
            ``query_keywords_string`` is replaced by the keyword query;
            templates that do not contain it fall back to the legacy ``XXX``
            placeholder.
        key_w: Iterable of keyword strings.

    Returns:
        list: ``(doc_id, score, publication_date)`` tuples of strings. Empty
        when no keywords are given or the command prints nothing.
    """
    keywords = list(key_w)
    if not keywords:
        # Without keywords the query would degenerate to a lone '"' and
        # produce a malformed request, so skip the call entirely.
        return []

    # Builds \"kw1\"|\"kw2\"|... — the backslashes keep the quotes escaped
    # inside the JSON string of the request body.
    or_query = "|".join(r"\"" + keyword + r"\"" for keyword in keywords)

    # Prefer the explicit keyword placeholder used by the command template;
    # replacing "XXX" would miss it once the endpoint placeholder is filled in.
    if "query_keywords_string" in curl_command_string:
        placeholder = "query_keywords_string"
    else:
        placeholder = "XXX"
    curl_command = curl_command_string.replace(placeholder, or_query)

    # NOTE: the command runs through the shell and the keywords are neither
    # shell- nor JSON-escaped; only pass keywords from trusted text.
    with os.popen(curl_command) as pipe:
        es_result = pipe.readlines()
    if not es_result:
        return []

    doc_info_list = []
    # Only the first output line is parsed; it is split into one chunk per hit.
    for doc_text in es_result[0].split(r'_type":"_doc')[1:]:
        doc_info = re.findall(r'_id\":\"(\d+)\","_score\":(\d+\.\d+).*"body":".*","articleAbstract".*"publicationDate":"(\d+-\d+-\d+)"',doc_text)
        doc_info_list.extend(doc_info)
    return doc_info_list

In [3]:
# Load the pickled Wikipedia events; the context manager closes the file handle
# as soon as the data is read.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
with open("data/Wiki_WebPage.pickle", "rb") as wiki_file:
    wiki_event_list = pickle.load(wiki_file)
print(len(wiki_event_list))
pd.DataFrame(wiki_event_list, columns=["No.","Time","Event-Text"]).head(5)

2733


Unnamed: 0,No.,Time,Event-Text
0,0,1987-01-02,Chadian–Libyan conflict – Battle of Fada: The ...
1,1,1987-01-03,Aretha Franklin becomes the first woman induct...
2,2,1987-01-04,1987 Maryland train collision: An Amtrak train...
3,3,1987-01-05,U.S. President Ronald Reagan undergoes prostat...
4,4,1987-01-08,The Dow Jones Industrial Average closes for th...


In [4]:
# Enrich every event with its YAKE keywords and the documents retrieved for them.
# Rows are rebuilt from their first three fields (No., Time, Event-Text) instead
# of being appended to in place, so re-running this cell does not keep adding
# columns to rows that were already enriched.
enriched_events = []
for event in wiki_event_list:
    event_text = event[2]
    keywords_list = yake_keyword_extraction(event_text, stoplist, ngram, top_n_keywords)
    doc_info = return_es_result_or(elastic_curl_command, keywords_list)
    enriched_events.append([*event[:3], keywords_list, doc_info])
wiki_event_list = enriched_events
pd.DataFrame(wiki_event_list, columns=["No.","Time","Event-Text", "Keywords", "Doc-Info"]).head(5)
# The last column (Doc-Info) holds the documents retrieved for the event's keywords.
# Each element of doc_info is a triple, e.g. (853, 77.85112, 1987-01-04), representing (doc-id, BM25-score, doc-timestamp).
# The article text can then be looked up by doc-id and used to generate questions.

Unnamed: 0,No.,Time,Event-Text,Keywords,Doc-Info
0,0,1987-01-02,Chadian–Libyan conflict – Battle of Fada: The ...,"[libyan armoured brigade, chadian army destroy...","[(853, 77.85112, 1987-01-04), (1454, 74.34511,..."
1,1,1987-01-03,Aretha Franklin becomes the first woman induct...,"[roll hall, aretha franklin, woman inducted, f...","[(1149404, 92.39003, 1999-10-31), (784991, 72...."
2,2,1987-01-04,1987 Maryland train collision: An Amtrak train...,"[1987 maryland train, maryland train collision...","[(26029, 71.2636, 1987-03-31), (1451, 68.56541..."
3,3,1987-01-05,U.S. President Ronald Reagan undergoes prostat...,"[president ronald reagan, ronald reagan underg...","[(258093, 59.275208, 1989-06-12), (386160, 58...."
4,4,1987-01-08,The Dow Jones Industrial Average closes for th...,"[dow jones industrial, jones industrial averag...","[(159199, 112.70204, 1988-07-06), (57629, 109...."


In [5]:
# Store the wiki_event_list; the context manager makes sure the file is flushed
# and closed once the dump is complete.
with open("data/wikievent_docinfo_list.pickle", "wb") as out_file:
    pickle.dump(wiki_event_list, out_file)