In [2]:
import sys
!{sys.executable} -m nltk.downloader stopwords
!{sys.executable} -m nltk.downloader universal_tagset
!{sys.executable} -m spacy download en

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 14.0 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.1.0-py3-none-any.whl size=11074433 sha256=39432439877201c835fca8ac573ae071ac583acb588f9b19f6cf72dbb45a2fb1
  Stored in directory: /tmp/pip-ephem-whee

In [3]:
import os
import re
import pandas as pd
import torch
import nltk
from tqdm import tqdm
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration

> Some commands you can try if you meet import issues of transformers      
1. conda install -c conda-forge ipywidgets    
2. conda install -n base -c conda-forge jupyterlab_widgets     
3. jupyter nbextension enable --py widgetsnbextension    
4. pip install --upgrade jupyter_client
5. from ipywidgets import IntProgress

In [4]:
# Load DocT5Query model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
model = T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')
model.to(device)

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [5]:
def cleantext(doc):
    doc = re.sub('<a href[^<]+>', "",doc)
    doc = re.sub('</[^<]+>',"",doc)
    doc = re.sub('</[^<]+>',"",doc)
    doc = re.sub("\\[","",doc)
    doc = re.sub("\\]","",doc)
    doc = re.sub("\\(.*?\\)","",doc)
    doc = re.sub("\n+"," ",doc)
    return doc

In [6]:
def predict_query(chunk,top_k=10):
    
    num = len(chunk)
    topk = top_k // num
    queries = []
    for doc in chunk:
        input_ids = tokenizer.encode(doc, return_tensors='pt').to(device)
        outputs = model.generate(
            input_ids=input_ids,
            max_length=512,
            do_sample=True,
            top_k=topk,
            num_return_sequences=topk)
        #print(outputs.shape)
        for i in range(topk):
            queries.append(tokenizer.decode(outputs[i], skip_special_tokens=True))         
    return queries[:top_k]

In [9]:
import string
import pke
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
import spacy
from spacy.lang.en import English

def MultiExtractor(doc, n_best=10):
    # 1. create a MultipartiteRank extractor.
    extractor2 = pke.unsupervised.MultipartiteRank()
    # 2. load the content of the document.
    extractor2.load_document(input=doc)
    # 3. select the longest sequences of nouns and adjectives, that do
    #    not contain punctuation marks or stopwords as candidates.
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    extractor2.candidate_selection(pos=pos, stoplist=stoplist)
    # 4. build the Multipartite graph and rank candidates using random walk,
    #    alpha controls the weight adjustment mechanism, see TopicRank for
    #    threshold/method parameters.
    extractor2.candidate_weighting(alpha=1.1,
                                  threshold=0.74,
                                  method='average')
    # 5. get the 10-highest scored candidates as keyphrases
    keyphrases = extractor2.get_n_best(n=n_best)
    
    return keyphrases

## test
#a = MultiExtractor(doc)

In [10]:
# Split documents to several paragraphs 
# Each paragraph's are round 512 tokens
# This step is for later transformer-based models
def get_chunks(doc, maxlength):
    
    def turnstring(tokens):
        doc=''
        for tok in tokens:
            doc+=" "+tok
        return doc.strip()
    
    corpus = cleantext(doc)
    tokens = nltk.tokenize.word_tokenize(corpus)
    length = len(tokens)
    if length <= maxlength:
        return [corpus]
    else:
        res = []
        start = 0
        end = maxlength
        while end<=length:
            res.append(turnstring(tokens[start:end]))
            start += maxlength
            end += maxlength
        # res.append(turnstring(tokens[start:length]))
        return res

In [16]:
datapath = '../Data/WapoV4_irdataset/'
expanded_path = '../Data/WapoV4_expanded/'
def loadfiles(datapath,expanded_path):
    files = os.listdir(datapath)
    try:
        files.remove('.ipynb_checkpoints')
    except:
        pass
    print("We have files %s to be expanded."%len(files))

    done = os.listdir(expanded_path)
    try:
        done.remove('.ipynb_checkpoints')
    except:
        pass
    print("Done: ",len(done))

    print("Total: ",len(files),"\n","already done:",len(done))
    for file in done:
        try:
            files.remove(file)
        except:
            print(file)
    print("Still have %s files to be expanded... "%len(files))
    
    return files

FilesToBeExpanded = loadfiles(datapath,expanded_path)

We have files 3218 to be expanded.
Done:  0
Total:  3218 
 already done: 0
Still have 3218 files to be expanded... 


In [20]:
# the process can be interupted if your gpu memories are not enough or because of other reasons
# just repeat the above step to reload files
# this ensure you will not overlap the expanded files 
def ExpandCorpus(FilesToBeExpanded,datapath,expanded_path):
    for file in tqdm(FilesToBeExpanded):
        try:
            df = pd.read_csv(datapath+file)
        except:
            continue
        df['Query'] = ''
        df['Key_Words'] = ''
        total = 0
        for i in range(len(df)):
            try:
                chunks =get_chunks(cleantext(df['body'][i]), 512)
                df.at[i,'Query'] = predict_query(chunks,10)
                df.at[i,'Key_Words'] = [phrase for phrase,score in MultiExtractor(cleantext(df['body'][i]))]
                total += 1
            except:
                pass

        df.to_csv(expanded_path+file,index=False)
    print("%s files have been expanded."%total)

In [23]:
#test
#ExpandCorpus(FilesToBeExpanded[:1],datapath,expanded_path)
ExpandCorpus(FilesToBeExpanded,datapath,expanded_path)

  0%|          | 0/1 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1/1 [45:58<00:00, 2758.15s/it]

198 files have been expanded.



