# Installation
Install required libraries

In [None]:
# Install the Wikipedia client libraries and a pinned PyPDF2 release used by this notebook.
!pip install wikipedia-api wikipedia PyPDF2==2.2.0
# Fetch the medium English spaCy model that the import cell loads as `en_core_web_md`.
!spacy download en_core_web_md

In [None]:
import wikipediaapi, calendar, spacy, nltk, os, re, en_core_web_md
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

nltk.download(['punkt','stopwords','wordnet','omw-1.4'])

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PyPDF2 import PdfReader
import wikipedia as wiki
wp = wikipediaapi.Wikipedia('en')
nlp = en_core_web_md.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


# Define functions

In [None]:
def calculate_jaccard(text1,text2):
  """Return the Jaccard similarity between the token sets of two strings.

  Both strings are lower-cased and tokenized with ``word_tokenize``; the score
  is the number of distinct shared tokens divided by the number of distinct
  tokens overall.

  Args:
    text1: First string.
    text2: Second string.

  Returns:
    A float in [0, 1]. ``0.0`` is returned when neither string yields a token,
    a case in which the plain ratio would divide by zero.
  """
  tokens1=set(word_tokenize(text1.lower()))
  tokens2=set(word_tokenize(text2.lower()))
  union=tokens1|tokens2
  if not union:
    # nothing to compare: treat as "no overlap" instead of raising ZeroDivisionError
    return 0.0
  return len(tokens1&tokens2)/len(union)

In [None]:
def stemlemma(text):
  """Lower-case ``text``, tokenize it, lemmatize then stem every token, and re-join with single spaces."""
  normalized=[]
  for token in word_tokenize(text.lower()):
    lemma=wordnet_lemmatizer.lemmatize(token)
    normalized.append(stemmer.stem(lemma))
  return ' '.join(normalized)

In [None]:
def openFiles(files,path):
  """Read each file named in ``files`` (``path`` is prepended as a plain string prefix).

  Returns a list with one string per file, in the same order, with every
  newline character removed.
  """
  def _read_flat(name):
    with open(path+name,"r") as handle:
      return handle.read().replace('\n', '')
  return [_read_flat(name) for name in files]

## Extract Keywords

In [None]:
# get all nouns and noun phrases from the input sentence
def getAllNPsFromSent(sent,include_nouns=False):
    """Collect the lower-cased noun phrases of a parsed sentence, without duplicates.

    Args:
        sent: A parsed spaCy ``Doc`` or ``Span`` (it must expose ``noun_chunks``,
            ``.doc`` and token iteration).
        include_nouns: When true, also add every token tagged ``NOUN`` whose
            dependency label contains ``"subj"``.

    Returns:
        A list of lower-cased strings in first-seen order. Two adjacent noun
        chunks separated by exactly one coordinating conjunction (tag ``CC``)
        additionally contribute the merged phrase, e.g. "cats and dogs".
    """
    npstr=[]
    # Chunk start/end offsets are document-level, so always index through the
    # owning document; indexing `sent` itself is only right when it starts at token 0.
    doc=sent.doc
    chunks = list(sent.noun_chunks)
    for i, chunk in enumerate(chunks):
        # single-token chunks are kept only when they are common nouns
        if len(chunk)==1 and chunk[0].pos_!="NOUN":
            continue
        phrase=chunk.text.lower()
        if phrase not in npstr:
            npstr.append(phrase)
        if i < len(chunks)-1:
            next_chunk=chunks[i+1]
            # exactly one token between the two chunks, and it is a conjunction
            if next_chunk.start-chunk.end==1 and doc[chunk.end].tag_=="CC":
                merged=doc[chunk.start:next_chunk.end].text.lower()
                if merged not in npstr:
                    npstr.append(merged)
    if include_nouns:
        for t in sent:
            if "subj" in t.dep_ and t.pos_=="NOUN":
                noun=t.text.lower()
                if noun not in npstr:
                    npstr.append(noun)
    return npstr

In [None]:
def getTopK(di,K=50):
  """Return the keys of ``di`` ordered by descending score, truncated to the first ``K``."""
  scores=pd.DataFrame.from_dict(di,orient='index',columns=["tfidf"])
  ranked=scores.sort_values(by=['tfidf'],ascending=False)
  top_rows=ranked[:K]
  return list(top_rows.index)

def getKeywords(doc,nlp,include_nouns=False,tfidf=[],K=None): # K: free parameter
  """Extract domain-specific keyword candidates from ``doc``.

  Every line of ``doc`` is parsed with ``nlp``; its named entities and the
  phrases returned by ``getAllNPsFromSent`` form the candidate pool. A candidate
  is kept when, after stop-word removal, it has no WordNet synset, consists
  only of letters and spaces, and is not (nearly) all upper-case.

  Args:
    doc: Text to analyse; lines are split on ``'\\n'``.
    nlp: Callable returning a parsed object with ``ents`` (spaCy pipeline).
    include_nouns: Forwarded to ``getAllNPsFromSent``.
    tfidf: Optional scores indexed by stemmed term (used via ``.index`` and
      ``[]``, e.g. one row of the TF-IDF table). When non-empty, only keywords
      whose stemmed form scores above the mean of the positive scores are kept.
    K: When set together with ``tfidf``, return only the K best-scoring keywords.

  Returns:
    A list of lower-cased keywords.
  """
  keywords=[]
  for line in doc.split('\n'):
    parsed=nlp(line)
    keywords.extend(ent.text for ent in parsed.ents)
    keywords.extend(getAllNPsFromSent(parsed,include_nouns))
  keywords=list(set(keywords))

  use_tfidf=len(tfidf)>0
  if use_tfidf:
    positive_scores=[t for t in tfidf if t>0]
    # with no positive score nothing can pass the threshold; +inf expresses that
    # without the NaN (and RuntimeWarning) that np.mean([]) would produce
    tfidf_threshold=np.mean(positive_scores) if positive_scores else float('inf')

  # loop-invariant: build the stop-word set once instead of once per token
  english_stopwords=set(stopwords.words('english'))

  keywords_wn={}
  for candidate in keywords:
    keyword=' '.join(w for w in word_tokenize(candidate) if w.lower() not in english_stopwords)
    if wn.synsets(keyword):
      continue  # known to WordNet, i.e. general vocabulary
    if not keyword.replace(' ','').isalpha():
      continue
    # drop acronyms, including forms such as "GSMs" (upper-case once the last letter is removed)
    if keyword.isupper() or any(w[:-1].isupper() for w in keyword.split()):
      continue
    keyword=keyword.lower()
    if not use_tfidf:
      keywords_wn[keyword]=0
      continue
    stem=stemlemma(keyword)  # computed once; the table is indexed by stemmed terms
    if stem in tfidf.index:
      score=tfidf[stem]
      if score>tfidf_threshold:
        keywords_wn[keyword]=max(keywords_wn.get(keyword,score),score)

  if K and use_tfidf:
    return getTopK(keywords_wn,K=K)
  return list(keywords_wn.keys())

## Build TFIDF matrix:
If you have a set of documents grouped by domain, you can build a TF-IDF matrix to enhance the keyword extraction.

In [None]:
def buildTFIDFvector(docs,use_ngrams=True,ngrams=4):
  """Fit a TF-IDF model on ``docs`` and return the weights as a DataFrame.

  Args:
    docs: Iterable of document strings (one row per document in the result).
    use_ngrams: When true, use word n-grams of length 1..``ngrams``; otherwise
      unigrams only.
    ngrams: Maximum n-gram length when ``use_ngrams`` is true.

  Returns:
    A DataFrame with one row per document and one column per vocabulary term.
  """
  ngram_range=(1,ngrams) if use_ngrams else (1,1)
  # min_df=1 keeps every term, exactly as the former integer 0 did, but stays valid
  # on scikit-learn releases that validate parameters and reject an integer below 1
  vectorizer = TfidfVectorizer(ngram_range=ngram_range,min_df=1,stop_words=stopwords.words('english'))
  vectors = vectorizer.fit_transform(docs)
  return pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
def buildTFIDF(domains,files,use_ngrams=True,ngrams=3):
  """Build one TF-IDF row per domain.

  For each key of ``domains`` (in iteration order) the texts ``files[name]`` of
  its listed documents are joined with spaces and normalized with ``stemlemma``;
  the resulting strings are handed to ``buildTFIDFvector``.
  """
  merged_texts=[
    stemlemma(' '.join(files[name] for name in domains[domain]))
    for domain in domains
  ]
  return buildTFIDFvector(merged_texts,use_ngrams=use_ngrams,ngrams=ngrams)

def getTFIDFscore(q,id,tfidf):
  """Sum the TF-IDF weights of the whitespace-separated terms of ``q`` in one row.

  Args:
    q: Query string; it is split on whitespace and each piece looked up as a column.
    id: Row label in ``tfidf`` (for the table built by ``buildTFIDF`` this is
      the position of the domain).
    tfidf: DataFrame with one column per term and one row per document/domain.

  Returns:
    The summed weight; terms without a column contribute nothing, so the
    result is ``0`` when no term is known.
  """
  score=0
  for term in q.split():
    # look the term up in the table's columns (the table is keyed by term, not by query)
    if term in tfidf.columns:
      score+=tfidf.at[id,term]
  return score

## Get corpus

In [None]:
def getCorpus(list_of_keywords, title_similarity=False,sim_threshold=0.5,filtered_cats=[], auto_suggest = False, depth=1,verbose=False):
  """Collect Wikipedia article texts for a list of keywords.

  Args:
    list_of_keywords: Keywords to look up.
    title_similarity: Only used with ``auto_suggest``: skip search hits whose
      Jaccard similarity to the keyword is below ``sim_threshold``.
    sim_threshold: Minimum title/keyword similarity when ``title_similarity`` is set.
    filtered_cats: Substrings of category titles to ignore (passed through).
    auto_suggest: When true, search Wikipedia for titles matching each keyword;
      otherwise use the keyword itself as the page title.
    depth: Category depth forwarded to ``getCorpusFromTitle``.
    verbose: Forwarded; also disables the keyword progress bar and status line.

  Returns:
    A list of distinct article texts.
  """
  processed_titles=set() # titles already crawled, so that each one is fetched only once
  corpus=[] # extracted article texts
  for keyword in tqdm(list_of_keywords, disable= verbose):
    if not verbose:
      print('=== Processing keyword:',keyword, end='\r')
    if not auto_suggest:
      corpus.extend(getCorpusFromTitle(keyword,filtered_cats,depth=depth,verbose=verbose))
      continue
    # with suggestion=True the search returns a (titles, suggestion) pair, not a plain list
    matching_titles,_suggestion=wiki.search(keyword,suggestion=True)
    for title in matching_titles:
      if title in processed_titles:
        continue
      if title_similarity and calculate_jaccard(title,keyword)<sim_threshold:
        continue
      corpus.extend(getCorpusFromTitle(title,filtered_cats,depth=depth,verbose=verbose))
      processed_titles.add(title)
  return list(set(corpus))

In [None]:
def getCorpusFromTitle(title,filtered_cats,depth=1,verbose=False):
  """Fetch the article for ``title`` and, optionally, the articles of its categories.

  Args:
    title: Wikipedia page title.
    filtered_cats: Substrings; categories whose title contains one are skipped.
    depth: ``0`` fetches the page only; a positive value is passed as
      ``max_level`` to ``get_articles_in_category`` for each category.
    verbose: Print the categories being crawled and forward the flag.

  Returns:
    A list of distinct article texts (possibly empty). Lookups are best effort:
    ordinary errors are skipped, while KeyboardInterrupt/SystemExit propagate
    so that a long crawl can still be interrupted.
  """
  corpus=[] # extracted article texts
  page = wp.page(title) # page object matching the title
  if not page:
    return corpus
  try:
    corpus.append(page.text)
  except Exception:
    # fall back to the first search hit of the `wikipedia` package
    try:
      corpus.append(wiki.page(wiki.search(title)[0]).content)
    except Exception:
      pass # best effort: continue without the page text
  if depth>0:
    # categories are needed only when crawling beyond the page itself
    try:
      cats=page.categories
    except Exception:
      return corpus
    if len(cats)<50: # max number of categories
      for cat_title, category in cats.items():
        # generic categories (e.g. "Category:articles from August 2019") are filtered out
        if match(cat_title, filtered_cats):
          continue
        if verbose:
          print("\t== Extracting articles from the category:",cat_title)
        corpus.extend(get_articles_in_category(category,filtered_cats, max_level=depth,verbose=verbose))
  return list(set(corpus))

def match(title,filters):
  """Tell whether any entry of ``filters`` occurs inside ``title``, ignoring case."""
  return any(pattern.lower() in title.lower() for pattern in filters)

In [None]:
def get_articles_in_category(category,filtered_cats, level=1, max_level=2,maxlimit=200, verbose=False):
  """Collect the texts of the articles in a category, descending into subcategories.

  Args:
    category: Category page object exposing ``categorymembers``.
    filtered_cats: Substrings; subcategories whose title contains one are not entered.
    level: Current recursion level (callers normally leave the default).
    max_level: Subcategories are entered only while ``level < max_level``.
    maxlimit: Categories with more members than this are skipped entirely.
    verbose: Show a progress bar over the category members.

  Returns:
    A list of article texts. Errors are printed and end the crawl of the
    current category; whatever was gathered so far is returned.
  """
  articles=[]
  try:
    categorymembers=category.categorymembers
    if len(categorymembers)>maxlimit:
      return articles
    for member_title, member in tqdm(categorymembers.items(), disable=not verbose):
      if member.ns != wikipediaapi.Namespace.CATEGORY:
        try:
          articles.append(member.text)
        except Exception:
          # fall back to the `wikipedia` package for this title
          articles.append(wiki.page(member_title).content)
      elif level < max_level and not match(member_title, filtered_cats):
        # forward maxlimit/verbose so subcategories follow the caller's settings
        # rather than silently reverting to the defaults
        articles.extend(get_articles_in_category(member,filtered_cats, level=level + 1, max_level=max_level, maxlimit=maxlimit, verbose=verbose))
  except Exception as e:
    print(e)
  return articles

In [None]:
def saveCorpus(docs,parent_dir,folder='Corpus'):
  """Write every document of ``docs`` to ``<parent_dir>/<folder>/doc<i>.txt``.

  Args:
    docs: Sequence of strings.
    parent_dir: Directory in which the corpus folder is created.
    folder: Name of the corpus folder.
  """
  path = os.path.join(parent_dir, folder)
  # create the folder once, up front, and tolerate an existing one; creating it
  # per document fails with FileExistsError from the second document on
  os.makedirs(path, exist_ok=True)
  for i, doc in enumerate(docs):
    filepath = os.path.join(path, 'doc'+str(i)+'.txt')
    with open(filepath, "w") as text_file:
      text_file.write(doc)

In [None]:
def createWordCloud(corpus,image_name='Word Cloud'):
  """Render a word cloud of the space-joined ``corpus``, save it as ``<image_name>.png`` and show it."""
  cloud_options=dict(stopwords=set(nlp.Defaults.stop_words),
                     max_font_size=50, max_words=100,background_color="white")
  cloud_builder=WordCloud(**cloud_options)
  rendered=cloud_builder.generate(' '.join(corpus))

  plt.figure(figsize=(15,8))
  plt.imshow(rendered, interpolation="bilinear")
  plt.axis("off")
  # save before show(), then release the figure
  plt.savefig(image_name+".png", bbox_inches='tight')
  plt.show()
  plt.close()

In [None]:
def getTotal_nbr_words(corpus):
  """Count the ``word_tokenize`` tokens over all articles, print the total and return it."""
  total_nbr_words=sum(len(word_tokenize(article)) for article in corpus)
  print("total number of words:", total_nbr_words)
  return total_nbr_words

In [None]:
def readPDF(file):
  """Extract the text of every page of a PDF and return it as one string.

  Per page, newlines become spaces, whitespace runs collapse to one space and
  the ends are stripped; the pages are then joined with single spaces.
  """
  cleaned_pages=[]
  for page in PdfReader(file).pages:
    flattened=page.extract_text().replace('\n',' ')
    cleaned_pages.append(re.sub(r"\s+", " ",flattened).strip())
  return ' '.join(cleaned_pages)

## Similarity check

In [None]:
def simCheck(doc,corpus,nlp=nlp):
  """Return (min, average, max) of the positive similarities between ``doc`` and each corpus article.

  Both the document and every article are parsed with ``nlp``; similarities
  that are not strictly positive are left out.
  """
  reference = nlp(doc)
  all_scores=(reference.similarity(nlp(article)) for article in corpus)
  positive=[score for score in all_scores if score>0]
  return min(positive),np.average(positive),max(positive)

In [None]:
def simCheckV2(doc1,doc2,corpus,nlp=nlp):
  """Print min, average and max of the positive similarities of two documents to the corpus.

  Every article is parsed once and compared with both documents; the positive
  scores of ``doc1`` followed by those of ``doc2`` are pooled for the statistics.
  """
  first = nlp(doc1)
  second = nlp(doc2)
  first_scores,second_scores=[],[]
  for article in tqdm(corpus):
    parsed=nlp(article)
    for reference,bucket in ((first,first_scores),(second,second_scores)):
      score=reference.similarity(parsed)
      if score>0:
        bucket.append(score)
  pooled=first_scores+second_scores
  print(min(pooled),np.average(pooled),max(pooled))

In [None]:
def docSimilarity(doc1,doc2,nlp):
  """Parse both texts with ``nlp`` and return the similarity of the first to the second."""
  left=nlp(doc1)
  right=nlp(doc2)
  return left.similarity(right)

# Usage example

Create some rules to exclude generic categories, based on our observations.

In [None]:
# Substrings that mark generic / maintenance Wikipedia categories to skip while crawling.
# Month names and abbreviations use range(1,13) so that December / Dec are included too.
filters=['Articles','Commons','WikiData','Wikipedia','Webarchive','disputes','bot:','CS1','errors','pages','births','deaths','disambiguation','elements']+[calendar.month_name[i] for i in range(1,13)]+[calendar.month_abbr[i] for i in range(1,13)]

Let's download the PURE dataset.

In [None]:
# Download the requirements archive from Zenodo and unpack it (both quietly, via -q).
# NOTE(review): later cells read the PDFs from 'req/', presumably the folder this archive unpacks to - verify.
!wget -q https://zenodo.org/record/1414117/files/requirements.zip
!unzip -q requirements.zip

We experiment with these documents from the **railway** and **transportation** domains (according to the table at http://nlreqdataset.isti.cnr.it/).

In [None]:
# Domain name -> PDF file names used for that domain.
# In the processing loop the first file of each list drives keyword extraction,
# and the second and third are the documents compared against the crawled corpus.
docs={'railway':['2007 - ertms.pdf','2006 - eirene sys 15.pdf','2007 - eirene fun 7.pdf'],
      'transportation':['2001 - ctc network.pdf','2005 - pontis.pdf','2007 - mdot.pdf']}

In [None]:
# File name -> extracted text, for every PDF listed in `docs` (read from the 'req/' folder).
files={doc:readPDF('req/'+doc) for docset in docs.values() for doc in docset}

In [None]:
# TF-IDF table: one row per domain (in the order of `docs`), one column per term or
# n-gram (up to 3 words, the buildTFIDF default) of the stemmed domain text.
tfidf=buildTFIDF(docs,files)

In [None]:
# Domain name -> row position in `tfidf`. buildTFIDF emits one row per domain in the
# iteration order of `docs`, so derive the ids from it instead of hard-coding them.
tfidf_ids={dom:row for row,dom in enumerate(docs)}

In [None]:
import pickle
def saveObj(ob,filename):
    """Pickle ``ob`` into the file ``filename`` (opened in binary write mode).

    Args:
        ob: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    # the context manager closes the handle even when pickle.dump raises
    with open(filename, 'wb') as filehandler:
        pickle.dump(ob, filehandler)

NOTE: This step takes time to run

In [None]:
# For each domain: extract keywords from its first document, crawl a Wikipedia corpus
# for them, pickle the corpus, and report similarity / size statistics.
# keywords_set: document file name -> extracted keywords
# corpora:      document file name -> list of article texts
keywords_set={}
corpora={}
for dom,v in docs.items():
  print('#### domain:'+dom)
  # the first file of the domain drives keyword extraction
  docname=v[0]
  print('# doc:'+docname)
  doc=files[docname]
  # score candidates with this domain's TF-IDF row and keep at most 50 of them
  keywords = getKeywords(doc,nlp,include_nouns=True,tfidf=tfidf.loc[tfidf_ids[dom]],K=50) # extract keywords
  keywords_set[docname]=keywords
  print(len(keywords),' keywords extracted ####')
  # auto_suggest=False: each keyword is used directly as a page title, so
  # title_similarity (only checked in the auto_suggest branch of getCorpus) has no effect here;
  # depth=1 also pulls the articles of the page's categories
  corpus = getCorpus(keywords, title_similarity=True, filtered_cats=filters, auto_suggest = False, depth=1,verbose=True)
  # pickle the corpus to '<domain>.corpus'
  saveObj(corpus,dom+'.corpus')
  corpora[docname]=corpus
  print('### number of articles:',len(corpus))
  # compare the corpus against the domain's two other documents
  simCheckV2(files[v[1]],files[v[2]],corpus)
  getTotal_nbr_words(corpus)
  # saves the figure as '<domain>WC.png' and displays it
  createWordCloud(corpus,dom+'WC')

In [None]:
# Persist all corpora (keyed by document file name) in a single pickle file.
saveObj(corpora,"corpora.obj")