# Arxiv Title Generation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from tqdm import tqdm

import collections
import copy
import random
import math
import time
import gc

def init_random_seed(value=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU plus all CUDA devices), and asks cuDNN for deterministic behaviour.

    Parameters:
        value (int) : seed shared by all generators
    """
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # manual_seed_all seeds every CUDA device, not only the current one;
    # it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(value)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms and may select
    # different ones between runs, which defeats the deterministic flag.
    torch.backends.cudnn.benchmark = False

init_random_seed(1234)

In [42]:
# Hugging Face auto-classes used to load the pretrained tokenizer and seq2seq model.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Checkpoint identifier; the same name is used further down to load the model weights.
MODEL_NAME = "Callidior/bert2bert-base-arxiv-titlegen"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [3]:
import os
# Disable the tokenizers library's own parallelism.
# NOTE(review): presumably set because worker processes are used later in the notebook — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Load Data

In [5]:
tokenizer

BertTokenizerFast(name_or_path='Callidior/bert2bert-base-arxiv-titlegen', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
# Training set: abstract/title pairs.
data = pd.read_csv('data/train.csv')
print(data.shape)
data.head()

(135000, 2)


Unnamed: 0,abstract,title
0,we consider the problem of utility maximizatio...,on optimal investment with processes of long o...
1,in this paper we provide an explicit formula f...,boolean complexes for ferrers graphs
2,"kinesin-5, also known as eg5 in vertebrates is...",relative velocity of sliding of microtubules b...
3,we discuss the transition paths in a coupled b...,bifurcation of transition paths induced by cou...
4,two types of room temperature detectors of ter...,all-electric detectors of the polarization sta...


In [21]:
# Remove fully identical rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

In [8]:
# Test set: abstracts for which titles have to be produced.
submission_data = pd.read_csv('data/test.csv')

In [25]:
submission_data

Unnamed: 0,abstract
0,Most sequence transformation models use recurr...
1,The doc2vec approach was introduced as an exte...
2,LSTM models can vary greatly depending on sequ...
3,A joint learning process of alignment and tran...
4,Current unsupervised image-to-image translatio...
...,...
995,subsystem codes are the most versatile class o...
996,we study dirac-harmonic maps from degenerating...
997,in this note we study kloosterman sums twisted...
998,we obtain the rate of growth of long strange s...


In [29]:
submission_data

Unnamed: 0,abstract
0,Most sequence transformation models use recurr...
1,The doc2vec approach was introduced as an exte...
2,LSTM models can vary greatly depending on sequ...
3,A joint learning process of alignment and tran...
4,Current unsupervised image-to-image translatio...
...,...
995,subsystem codes are the most versatile class o...
996,we study dirac-harmonic maps from degenerating...
997,in this note we study kloosterman sums twisted...
998,we obtain the rate of growth of long strange s...


In [64]:
# Copy ground-truth titles for submission abstracts that also occur verbatim
# (case-insensitively) in the training set.
train_abstracts = data['abstract'].str.lower()
# Object dtype: the column will hold strings. A float NaN column would need an
# implicit dtype change on the first string assignment, which pandas deprecates.
submission_data['title'] = pd.Series(np.nan, index=submission_data.index, dtype=object)

# Index the training set once (lower-cased abstract -> row positions) instead
# of scanning every training row for each submission row.
train_positions = collections.defaultdict(list)
for position, train_abstract in enumerate(train_abstracts):
    train_positions[train_abstract].append(position)

j = 0  # number of submission abstracts found in the training set
for i, abstract in submission_data['abstract'].str.lower().items():
    # Missing abstracts stay NaN after .str.lower(); they cannot match anything.
    if not isinstance(abstract, str):
        continue
    positions = train_positions.get(abstract)
    if not positions:
        continue
    j += 1

    leak = data.iloc[positions]
    if len(leak) > 1:
        # Several training rows share this abstract: show them for inspection.
        print(leak)
    # When there are several candidates the last training row wins.
    submission_data.loc[i, 'title'] = leak['title'].values[-1]
print(j)

                                                 abstract  \
13758   discussion of "instrumental variables: an econ...   
24872   discussion of "instrumental variables: an econ...   
101295  discussion of "instrumental variables: an econ...   

                                                    title  
13758        ace bounds; sems with equilibrium conditions  
24872   think globally, act globally: an epidemiologis...  
101295  causal graphs: addressing the confounding prob...  
                                                abstract  \
11430  to appear to mcmc handbook, s. p. brooks, a. g...   
90696  to appear to mcmc handbook, s. p. brooks, a. g...   

                                          title  
11430  reversible jump markov chain monte carlo  
90696  likelihood-free markov chain monte carlo  
431


In [34]:
submission_data[submission_data['title'].notnull()]

Unnamed: 0,abstract,title
32,"discussion of ""instrumental variables: an econ...",causal graphs: addressing the confounding prob...
33,current risk mapping models for pooled data fo...,spatial risk mapping for rare disease with hid...
37,the identity of the famous place of la mancha ...,an\'alisis de distancias temporales y espacial...
38,there is a fundamental disconnect between what...,the adequate bootstrap
40,calculation of the log-normalizer is a major c...,on the accuracy of self-normalized log-linear ...
...,...,...
987,the hidden markov model (hmm) is a generative ...,tech report a variational hem algorithm for cl...
988,we study the robustness of active learning (al...,robustness of bayesian pool-based active learn...
991,bayesian models offer great flexibility for cl...,revisiting k-means: new algorithms via bayesia...
992,markov chain monte carlo (mcmc) is one of the ...,measuring the reliability of mcmc inference wi...


# Modeling

In [43]:
class ArxivModel(torch.nn.Module):
    """Thin `nn.Module` wrapper around a pretrained seq2seq language model."""

    def __init__(self, model_name):
        """Load the pretrained seq2seq model called `model_name` (with return_dict=True)."""
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return its `(logits, loss)` pair.

        All arguments are forwarded unchanged to the wrapped model; `labels`
        defaults to None.
        """
        keyword_inputs = {
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels,
        }
        outputs = self.model(input_ids, **keyword_inputs)
        return outputs.logits, outputs.loss

In [44]:
# Build the wrapper around the pretrained checkpoint and move it to the selected device.
model = ArxivModel(MODEL_NAME).to(device)

In [45]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 247,363,386 trainable parameters


In [46]:
# for cpu usage
# model.load_state_dict(torch.load('models/bert2bert.pt', map_location=device))

# Generating

In [47]:
def translate_sentence(text, tokenizer, model):
    """Generate a title for a single abstract using beam search.

    Parameters:
        text (str) : abstract to summarise into a title
        tokenizer : callable tokenizer that returns "input_ids" and
                    "attention_mask" tensors and offers `decode`
        model : wrapper module whose `.model` attribute provides `generate`

    Returns:
        str : the decoded generated sequences joined with single spaces
    """
    model.eval()

    # Send inputs to the device that holds the model's weights rather than
    # relying on a notebook-level `device` variable, so the function also
    # works when the model was loaded onto a different device.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # NOTE(review): a single text is padded to the full 512 tokens here;
    # this looks unnecessary for one sequence — confirm before changing.
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    generated_ids = model.model.generate(
        input_ids=text_encoding["input_ids"].to(target_device),
        attention_mask=text_encoding["attention_mask"].to(target_device),
        max_length=45,
        num_beams=5,
        repetition_penalty=2.5,
        length_penalty=1,
        early_stopping=True
    )

    preds = [tokenizer.decode(gen_id,
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=True) for gen_id in generated_ids]

    return " ".join(preds)

In [48]:
# Smoke test: generate a title for the first submission abstract.
pred = translate_sentence(submission_data.loc[0, 'abstract'], tokenizer, model)
print(f'pred = {pred}')

pred = a simple neural architecture for sequence transformation


# Submission

In [49]:
submission_data

Unnamed: 0,abstract,title
0,Most sequence transformation models use recurr...,
1,The doc2vec approach was introduced as an exte...,
2,LSTM models can vary greatly depending on sequ...,
3,A joint learning process of alignment and tran...,
4,Current unsupervised image-to-image translatio...,
...,...,...
995,subsystem codes are the most versatile class o...,
996,we study dirac-harmonic maps from degenerating...,
997,in this note we study kloosterman sums twisted...,
998,we obtain the rate of growth of long strange s...,"long strange segments, ruin probabilities and ..."


In [52]:
# Only abstracts that still have no title are sent through the model.
abstracts = submission_data.loc[submission_data['title'].isna(), 'abstract'].values
abstracts[0]

'Most sequence transformation models use recurrent and convolutional neural networks in a joint architecture consisting of encoder and decoder. Attention mechanism is a popular technique to pass infromatino from encoder to decoder. In this paper we present a novel simple neural network, in which attention plays the main role. Our architecture does not use recurrence or convolutions at all. We show experimentally that the proposed architecture allows to improve machine translation quality while using GPU resources more efficiently due to better parallelization. Our model beats the best known models by 5 BLEU, achieving 31.4 BLEU on the WMT 2014 English-to-Russian translation task. To achieve these results, our model needs to be trained only for 3.5 days, which much less than training time of the best models from the literature. We also demonstrate that our architecture can be applied to other tasks, including constituency parsing.'

In [53]:
from joblib import Parallel, delayed

# Shared joblib executor with 6 workers.
# NOTE(review): if joblib runs these jobs in separate processes, `translate`
# and the notebook-level `model`/`tokenizer` it uses must be shipped to the
# workers — confirm this is intended, or compare against the sequential loop
# kept in comments below.
_parallel = Parallel(n_jobs=6, max_nbytes=None, timeout=99999)

def translate(text):
    """Generate a title for `text` with the notebook-level `tokenizer` and `model`."""
    return translate_sentence(text, tokenizer=tokenizer, model=model)

def parallel_function(function, data):
    """Apply `function` to each item of `data` through `_parallel`.

    The tqdm bar wraps the iteration over `data`, so it advances as items are
    pulled to create jobs.
    """
    return _parallel(delayed(function)(sample) for sample in tqdm(data))

result = parallel_function(translate, abstracts)

# titles = []
# for abstract in tqdm(abstracts):
#     title = translate_sentence(
#         model=model,
#         text=abstract,
#         tokenizer=tokenizer)
#     titles.append(title)

100%|████████████████████████████████████████████████████████████████████████████████| 569/569 [26:53<00:00,  2.84s/it]


In [54]:
result[0]

'a fast and simple neural network for sequence transformation'

In [56]:
# Fill the missing titles with the generated ones.
# Assumes `result` is ordered like `abstracts`, which was selected with the same mask.
submission_data.loc[submission_data['title'].isna(), 'title'] = result

In [58]:
# Persist abstracts with their titles; this file is the default input of generate_csv below.
submission_data.to_csv('predicted_titles.csv', index=False)

In [59]:
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='data/vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle

    For every row of the input file the title is stripped of punctuation,
    lower-cased and split into words; bigrams and trigrams joined with '_'
    are added. One `Id,Predict` line is written per vocabulary entry of that
    row, with Predict = 1 when the entry occurs among the title's tokens.

    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to voc.pkl file; the unpickled object is
                         indexed by the row index of the input file and each
                         item maps a token to its position in that row's vector
    '''
    data = pd.read_csv(input_file)
    # NOTE: pickle.load can execute arbitrary code; only use a trusted vocs file.
    # A separate handle name keeps the `voc_file` argument from being shadowed.
    with open(voc_file, 'rb') as voc_handle:
        vocs = pickle.load(voc_handle)

    # Build the punctuation-stripping table once instead of once per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    output_idx = 0
    # Open the output once rather than re-opening it in append mode per row.
    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')

        for row_idx, row in data.iterrows():
            title = row['title']
            if not isinstance(title, str):
                # Missing titles are read back as NaN: treat them as empty so
                # the row still emits its (all-zero) vector and Ids stay aligned.
                title = '' if pd.isna(title) else str(title)

            trg = title.translate(strip_punctuation).lower().split()
            trg.extend(['_'.join(ngram) for ngram in list(ngrams(trg, 2)) + list(ngrams(trg, 3))])

            VOCAB_stoi = vocs[row_idx]
            trg_vec = np.zeros(len(VOCAB_stoi))
            for word in set(VOCAB_stoi.keys()).intersection(trg):
                trg_vec[VOCAB_stoi[word]] = 1

            for is_word in trg_vec:
                res_file.write('{0},{1}\n'.format(output_idx, int(is_word)))
                output_idx += 1


# Build submission.csv from predicted_titles.csv and data/vocs.pkl (the default paths).
generate_csv()

In [60]:
pd.read_csv('submission.csv')

Unnamed: 0,Id,Predict
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
343437,343437,0
343438,343438,0
343439,343439,0
343440,343440,0


In [61]:
pd.read_csv('predicted_titles.csv')['title'] # 0.60459 public

0      a fast and simple neural network for sequence ...
1      an empirical evaluation of doc2vec for documen...
2             sequence modeling with bidirectional lstms
3            coverage - based neural machine translation
4      unsupervised image - to - image translation by...
                             ...                        
995                     constructions of subsystem codes
996    on dirac - harmonic maps of degenerate spin su...
997    on kloosterman sums twisted by characters modu...
998    long strange segments, ruin probabilities and ...
999    reduced dynamics of anisotropic spin - 1 / 2 p...
Name: title, Length: 1000, dtype: object