### Neural language models or how to write scientific papers

We shall train our language model on a corpus of [ArXiv](http://arxiv.org/) articles and see if we can generate a new one!

_data by neelshah18 from [here](https://www.kaggle.com/neelshah18/arxivdataset/)_

_Disclaimer: this has nothing to do with actual science. But it's fun, so who cares?!_

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import pandas as pd
from collections import Counter

In [None]:
# Download the arXiv dataset archive from Dropbox and unpack it into the
# working directory (presumably yielding ./arxivData.json, which the next
# cell reads).
!wget "https://www.dropbox.com/s/99az9n1b57qkd9j/arxivData.json.tar.gz?dl=1" -O arxivData.json.tar.gz
!tar -xvzf arxivData.json.tar.gz

--2021-06-24 13:38:49--  https://www.dropbox.com/s/99az9n1b57qkd9j/arxivData.json.tar.gz?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.18, 2620:100:6032:18::a27d:5212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/99az9n1b57qkd9j/arxivData.json.tar.gz [following]
--2021-06-24 13:38:49--  https://www.dropbox.com/s/dl/99az9n1b57qkd9j/arxivData.json.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc234a80f5a562a37606e4056af0.dl.dropboxusercontent.com/cd/0/get/BRAm3AKwRq4uCnhWBFnh9lOvpGwr_z70awGGo3_3aGeq5fYitNfEol54mOFOEa0b8k-XyZV2fYxQXMIjRta46sSJOraFZNeTAXrYktxngaTxcabgTPl-0_4hklTxw6s2GOiTTWwA7Tcgar_urKy5Mjxl/file?dl=1# [following]
--2021-06-24 13:38:50--  https://uc234a80f5a562a37606e4056af0.dl.dropboxusercontent.com/cd/0/get/BRAm3AKwRq4uCnhWBFnh9lOvpGwr_z70awGGo3_3aGeq5fYitNfEol54mO

In [None]:
# Read the extracted JSON dump into a DataFrame and preview five random rows.
data = pd.read_json("./arxivData.json")
data.sample(5)

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
37415,"[{'name': 'Rafael Peñaloza'}, {'name': 'Nico P...",10,1706.03207v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We present a probabilistic extension of the de...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Towards Statistical Reasoning in Description L...,2017
20675,"[{'name': 'Sixue Liu'}, {'name': 'Yulong Ceng'...",3,1610.00442v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",10,Many real-world problems involving constraints...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",A Probability Distribution Strategy with Effic...,2016
3153,"[{'name': 'Liping Wang'}, {'name': 'Songcan Ch...",16,1303.3987v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,"Recently, $l_{2,1}$ matrix norm has been widel...","[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...","$l_{2,p}$ Matrix Norm and Its Application in F...",2013
39406,"[{'name': 'Zhenhao Ge'}, {'name': 'Sudhendu R....",25,1602.08132v1,"[{'rel': 'related', 'href': 'http://dx.doi.org...",2,Systems based on automatic speech recognition ...,"[{'term': 'cs.SD', 'scheme': 'http://arxiv.org...",Adaptive Frequency Cepstral Coefficients for W...,2016
13432,"[{'name': 'Jorge Gomes'}, {'name': 'Paulo Urba...",11,1304.3362v1,"[{'rel': 'related', 'href': 'http://dx.doi.org...",4,Novelty search is a recent artificial evolutio...,"[{'term': 'cs.NE', 'scheme': 'http://arxiv.org...",Evolution of Swarm Robotics Systems with Novel...,2013


In [None]:
# assemble lines: concatenate title and description
# (vectorised column concatenation instead of a Python-level call per row)
lines = (data['title'] + ' ; ' + data['summary']).tolist()

# sanity check: show the three shortest documents
sorted(lines, key=len)[:3]

['Differential Contrastive Divergence ; This paper has been retracted.',
 'What Does Artificial Life Tell Us About Death? ; Short philosophical essay',
 'P=NP ; We claim to resolve the P=?NP problem via a formal argument for P=NP.']

In [None]:
SEQ_LEN = 4       # tokens per input/target window; passed to Dataset as seq_length
BATCH_SIZE = 128  # number of windows per DataLoader batch in train()
EPOCH = 2         # number of passes over the dataset in train()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset built from a list of text lines.

    All lines are tokenised on whitespace and treated as one continuous word
    stream. Item ``i`` is a pair of index tensors of length ``seq_length``:
    the window starting at word ``i`` and the same window shifted one word to
    the right (the prediction targets).

    Args:
        lines: iterable of strings; non-string entries are skipped.
        seq_length: number of words in each input/target window.

    Attributes set on construction:
        words: the flat list of tokens.
        uniq_words: vocabulary, most frequent word first.
        index_to_word / word_to_index: vocabulary lookup tables.
        words_indexes: ``words`` encoded as vocabulary indices.
    """

    def __init__(
        self,
        lines,
        seq_length
    ):
        self.lines = lines
        self.seq_length = seq_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {word: index for index, word in self.index_to_word.items()}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Return every token of every line as one flat list.

        Splitting on arbitrary whitespace (rather than on a single space)
        keeps newline-glued pairs such as ``"the\\nmodel"`` and empty-string
        tokens from repeated spaces out of the vocabulary.
        """
        words = []
        for line in self.lines:
            if isinstance(line, str):
                words.extend(line.split())
        return words

    def get_uniq_words(self):
        """Return the distinct words ordered by descending frequency."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        # The last valid window needs seq_length + 1 words (inputs plus the
        # final target); never report a negative length for tiny corpora.
        return max(len(self.words_indexes) - self.seq_length, 0)

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` index tensors for window ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f"Dataset index out of range (size {size})")

        # One slice of seq_length + 1 tokens yields both the inputs and the
        # one-step-shifted targets.
        window = self.words_indexes[index:index + self.seq_length + 1]
        return (
            torch.tensor(window[:-1]),
            torch.tensor(window[1:]),
        )

In [None]:
# Tokenise the corpus and expose it as sliding windows of SEQ_LEN words.
dataset = Dataset(lines=lines, seq_length=SEQ_LEN)

In [None]:
# Decode the input and target windows of item 1 back into words to check
# that the targets are the inputs shifted by one token.
sample_inputs, sample_targets = dataset[1]
(
    [dataset.index_to_word[token.item()] for token in sample_inputs],
    [dataset.index_to_word[token.item()] for token in sample_targets],
)

(['Recurrent', 'Attention', 'Units', 'for'],
 ['Attention', 'Units', 'for', 'Visual'])

In [None]:
import torch
from torch import nn

class Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> vocabulary logits.

    Args:
        dataset: object exposing ``uniq_words``; its length fixes the
            vocabulary size of the embedding and the output layer.
        embedding_dim: width of the word embeddings (default 128).
        lstm_size: hidden size of every LSTM layer (default 128).
        num_layers: number of stacked LSTM layers (default 3).
        dropout: dropout applied between LSTM layers (default 0.2).

    Note:
        The LSTM is created with PyTorch's default ``batch_first=False``, so
        it reads dim 0 of its input as the time axis and dim 1 as the batch
        axis. The state returned by ``init_state`` must therefore be sized by
        dim 1 of the tensor passed to ``forward``.
    """

    def __init__(self, dataset, embedding_dim=128, lstm_size=128, num_layers=3, dropout=0.2):
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            # The LSTM consumes the embeddings, so its input width is the
            # embedding width (not the hidden size; they merely default to
            # the same value).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=dropout,
        )

        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Run one forward pass.

        Args:
            x: integer tensor of vocabulary indices, 2-D.
            prev_state: ``(h, c)`` tuple as returned by ``init_state`` or by a
                previous call.

        Returns:
            ``(logits, state)`` where ``logits`` has the shape of ``x`` plus a
            trailing vocabulary dimension, and ``state`` is the new ``(h, c)``.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` state of shape (num_layers, sequence_length, lstm_size).

        The tensors are allocated with the device and dtype of the model's
        parameters, so they can be fed to ``forward`` without a transfer.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

In [None]:
# Build the language model for this vocabulary and place it on the chosen device.
model = Model(dataset).to(device)

In [None]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader

def train(dataset, model):
    """Train ``model`` on ``dataset`` for ``EPOCH`` passes with Adam.

    The LSTM state is created once per epoch and carried, detached, from one
    batch to the next. Every 100 batches the mean loss since the previous
    report is printed and fed to a ``ReduceLROnPlateau`` scheduler.

    Args:
        dataset: map-style dataset yielding ``(inputs, targets)`` index tensors.
        model: network exposing ``init_state`` and returning ``(logits, state)``.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # `verbose=True` is deprecated and rejected by newer PyTorch releases;
    # learning-rate changes are reported manually below instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=10)

    # The state is sized by the window length; take it from the dataset rather
    # than hard-coding it (4 is the previous hard-coded value).
    state_size = getattr(dataset, 'seq_length', 4)

    for epoch in range(EPOCH):
        state_h, state_c = model.init_state(state_size)

        state_h = state_h.to(device)
        state_c = state_c.to(device)
        batch_loss = []
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss expects the class dimension second.
            loss = criterion(y_pred.transpose(1, 2), y)

            # Cut the graph so the next batch does not backpropagate through
            # this one.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            batch_loss.append(loss.item())
            if batch % 100 == 0:
                avg_loss = np.mean(batch_loss)
                batch_loss = []

                lr_before = optimizer.param_groups[0]['lr']
                scheduler.step(avg_loss)
                lr_after = optimizer.param_groups[0]['lr']
                if lr_after != lr_before:
                    print(f"Reducing learning rate from {lr_before:.4e} to {lr_after:.4e}.")

                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item(), 'average loss:': avg_loss })

In [None]:
def predict(dataset, model, text, next_words=100):
    """Sample a continuation of ``text`` from ``model``, one word at a time.

    Args:
        dataset: object exposing ``word_to_index`` and ``index_to_word``.
        model: network exposing ``init_state`` and returning ``(logits, state)``.
        text: space-separated prompt; every word must be in the vocabulary.
        next_words: number of words to generate (default 100).

    Returns:
        The prompt followed by the generated words, joined with single spaces.

    Raises:
        KeyError: if the prompt contains a word missing from the vocabulary.
    """
    model.eval()

    words = text.split(' ')
    unknown = [w for w in words if w not in dataset.word_to_index]
    if unknown:
        raise KeyError(f"prompt contains words missing from the vocabulary: {unknown}")

    # Follow the model's own device instead of relying on a notebook global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    state_h, state_c = model.init_state(len(words))
    state_h = state_h.to(model_device)
    state_c = state_c.to(model_device)

    with torch.no_grad():
        for i in range(next_words):
            # The window always holds the most recent len(prompt) words.
            x = torch.tensor(
                [[dataset.word_to_index[w] for w in words[i:]]],
                device=model_device,
            )

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            last_word_logits = y_pred[0][-1]

            # Softmax in float64 and renormalise: float32 rounding over a
            # large vocabulary can trip np.random.choice's sum-to-1 check.
            p = torch.softmax(last_word_logits.double(), dim=0).cpu().numpy()
            p = p / p.sum()
            word_index = np.random.choice(len(p), p=p)
            words.append(dataset.index_to_word[word_index])

    return ' '.join(words)

In [None]:
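# Training is switched off here; uncomment the call below to fit the model
# before sampling (while it stays commented out, a fresh run samples from the
# randomly initialised model).
# train(dataset, model)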

In [None]:
# Generate a continuation of the prompt 'AI' and show it.
generated_text = predict(dataset, model, text='AI')
print(generated_text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          -1.8309e-02, -1.6587e-02, -1.8167e-02, -3.1310e-02, -8.7749e-02,
          -4.0919e-02,  7.0193e-02, -7.9428e-02, -6.0534e-02,  1.8743e-02,
           4.7856e-02, -2.5651e-02, -5.9431e-02, -5.6141e-02, -6.5909e-02,
           3.6551e-02, -1.3061e-02,  2.0424e-02, -5.0337e-02, -8.4183e-02,
           2.4710e-02, -1.0087e-02, -3.1076e-02]]], device='cuda:0'), tensor([[[ 6.0229e-02,  2.8228e-01,  1.3399e-02, -5.2058e-01, -1.7733e-01,
          -8.0862e-01,  4.8193e-01, -6.5580e-02,  8.8645e-01, -2.9801e-01,
          -9.2851e-02, -2.9384e-01,  6.9332e-02, -2.0814e-01, -3.4839e-01,
          -6.9147e-02, -1.9187e-01,  4.0973e-01,  3.2989e-01,  4.0652e-02,
          -3.0707e-01, -3.1824e-01,  8.9644e-02,  7.2881e-02, -4.6674e-02,
          -5.3505e-02,  4.3830e-01, -1.1939e-01,  6.8103e-01,  2.7154e-01,
          -5.1538e-01,  1.3743e-01, -4.0200e-01,  3.7656e-01,  2.3027e-01,
          -1.8714e-01, -1.6122e-01,  5.3