In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
# Single Porter stemmer instance shared by every call to stem().
stemmer = PorterStemmer()

In [2]:
def tokenize(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize``.

    Punctuation and symbols become their own tokens, e.g. ``'$100'`` yields
    ``['$', '100']``.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens

In [3]:
# Sanity check: the tokenizer splits the currency symbol from the number.
tokenize('$100')

['$', '100']

In [4]:
def stem(word):
    """Lower-case ``word`` and return its stem from the shared ``stemmer``.

    Example: ``'ORGANIC'`` becomes ``'organ'``.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
# Sanity check: the input is lower-cased and reduced to its stem.
stem('ORGANIC')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    """Encode a token list as a multi-hot vector over a vocabulary.

    Args:
        tokenized_words: iterable of raw tokens; each one is passed through
            ``stem`` before being compared with the vocabulary.
        all_words: sequence of vocabulary entries; position ``i`` of the
            result corresponds to ``all_words[i]``.

    Returns:
        A float NumPy array of length ``len(all_words)`` holding 1.0 where the
        vocabulary entry equals the stem of some input token and 0.0 elsewhere.
    """
    # A set makes each membership test O(1); the original list lookup made the
    # whole call O(len(all_words) * len(tokenized_words)).
    stemmed = {stem(w) for w in tokenized_words}
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in stemmed:
            bag[idx] = 1.0
    return bag

In [7]:
# Sanity check: only the slot for 'hi' (index 1 in the vocabulary) is set.
bag_of_words(['hi'],['how','hi'])

array([0., 1.])

In [8]:
# Load the dataset; its 'Abstract' and 'Title' columns are used below.
data = pd.read_csv('./data.csv')

In [9]:
# Inputs are the abstracts, targets are the corresponding titles.
X = data['Abstract']
y = data['Title']

In [10]:
# Inspect the first abstract.
X[0]

'  Discourse structures are beneficial for various NLP tasks such as dialogue\nunderstanding, question answering, sentiment analysis, and so on. This paper\npresents a deep sequential model for parsing discourse dependency structures of\nmulti-party dialogues. The proposed model aims to construct a discourse\ndependency tree by predicting dependency relations and constructing the\ndiscourse structure jointly and alternately. It makes a sequential scan of the\nElementary Discourse Units (EDUs) in a dialogue. For each EDU, the model\ndecides to which previous EDU the current one should link and what the\ncorresponding relation type is. The predicted link and relation type are then\nused to build the discourse structure incrementally with a structured encoder.\nDuring link prediction and relation classification, the model utilizes not only\nlocal information that represents the concerned EDUs, but also global\ninformation that encodes the EDU sequence and the discourse structure that is\n

In [11]:
# Inspect the first title.
y[0]

'A Deep Sequential Model for Discourse Parsing on Multi-Party Dialogues'

In [12]:
# Accumulators filled by the preprocessing loop: all stemmed tokens seen in the
# abstracts, all stemmed tokens seen in the titles, and one
# [abstract_stems, title_stems] pair per row.
# NOTE: `data` held the DataFrame from pd.read_csv until now; it is rebound to
# a plain list here, so the DataFrame is no longer reachable through this name.
all_words_X = []
all_words_y = []
data = []

In [13]:
# Tokenize and stem every (abstract, title) pair, collecting the stems both
# into the running vocabularies and into `data` as per-row pairs.
for X_batch,y_batch in tqdm(zip(X,y)):
    new_X = [stem(token) for token in tokenize(X_batch)]
    new_y = [stem(token) for token in tokenize(y_batch)]
    all_words_X += new_X
    all_words_y += new_y
    data.append([new_X,new_y])

In [14]:
# Rebind X and y (previously the 'Abstract' / 'Title' columns) to empty lists
# that will receive the bag-of-words vectors.
X = []
y = []

In [15]:
# Collapse the collected stems into sorted lists of unique entries; a word's
# position in the list is its index in the bag-of-words vector.
all_words_X = sorted(set(all_words_X))
all_words_y = sorted(set(all_words_y))

In [16]:
# Encode every row as a pair of multi-hot vectors (abstract, title).
#
# The tokens stored in `data` are already stemmed, so they are looked up in the
# vocabulary directly. Passing them through bag_of_words would stem them a
# second time, and because Porter stemming is not idempotent a re-stemmed token
# can differ from the vocabulary entry it came from and miss its slot.
_x_index = {word: i for i, word in enumerate(all_words_X)}
_y_index = {word: i for i, word in enumerate(all_words_y)}


def _multi_hot(stems, index, size):
    """Return a float vector of length ``size`` with 1.0 at ``index[s]`` for each stem ``s`` present in ``index``."""
    bag = np.zeros(size)
    for s in set(stems):
        slot = index.get(s)
        if slot is not None:
            bag[slot] = 1.0
    return bag


for Xb,yb in tqdm(data):
    X.append(_multi_hot(Xb,_x_index,len(all_words_X)))
    y.append(_multi_hot(yb,_y_index,len(all_words_y)))

In [17]:
# Repeats the sort/de-duplication already applied to both vocabularies; on
# lists that are already sorted and unique this leaves them unchanged.
all_words_X = sorted(set(all_words_X))
all_words_y = sorted(set(all_words_y))

In [18]:
def accuracy(model,X,y):
    """Exact-match accuracy of ``model`` on ``(X, y)``, as a percentage.

    A sample counts as correct when every output of the model, rounded to the
    nearest integer, equals the corresponding target value.

    Args:
        model: callable mapping a batch tensor ``X`` to a tensor of predictions.
        X: input tensor, one sample per row.
        y: target tensor with one row per sample, matching the prediction shape.

    Returns:
        Percentage of correct samples rounded to one decimal place, or 0.0 when
        there are no samples.
    """
    # Evaluation only: no autograd graph is needed for the full-dataset pass.
    with torch.no_grad():
        preds = model(X)
    total = min(len(preds), len(y))
    if total == 0:
        return 0.0
    # Compare whole rows at once. Using `pred == y_batch` in an `if` raises for
    # tensors with more than one element, and raw regression outputs would
    # essentially never equal the 0/1 targets without rounding.
    matches = (torch.round(preds[:total]) == y[:total]).reshape(total, -1).all(dim=1)
    correct = int(matches.sum().item())
    return round(100 * correct / total, 1)

In [19]:
class Model(Module):
    """Fully connected network mapping an abstract vector to a title vector.

    The input is projected to ``hidden_size`` units, the same hidden layer
    (``linear2``, weights shared) is applied ``iters`` times with ReLU, and a
    final linear layer produces the raw outputs.

    Args:
        in_features: input width; defaults to ``len(all_words_X)``.
        out_features: output width; defaults to ``len(all_words_y)``.
        hidden_size: width of the hidden representation.
        iters: number of times the shared hidden layer is applied.
    """

    def __init__(self, in_features=None, out_features=None, hidden_size=1024, iters=10):
        super().__init__()
        # Fall back to the notebook's vocabularies so `Model()` keeps working.
        if in_features is None:
            in_features = len(all_words_X)
        if out_features is None:
            out_features = len(all_words_y)
        self.activation = ReLU()
        self.iters = iters
        self.linear1 = Linear(in_features, hidden_size)
        self.linear2 = Linear(hidden_size, hidden_size)
        self.output = Linear(hidden_size, out_features)

    def forward(self,X):
        """Return the raw (un-activated) outputs for the batch ``X``."""
        preds = self.activation(self.linear1(X))
        # Re-apply the single shared hidden layer `iters` times.
        for _ in range(self.iters):
            preds = self.activation(self.linear2(preds))
        preds = self.output(preds)
        return preds

In [20]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [21]:
# Instantiate the network and move its parameters to `device`.
model = Model().to(device)

In [22]:
# Mean-squared-error between the raw network outputs and the target vectors.
criterion = MSELoss()

In [23]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = Adam(model.parameters(),lr=0.001)

In [24]:
# Training schedule: number of passes over the data and rows per mini-batch.
epochs = 1000
batch_size = 32

In [25]:
# Weights & Biases project name passed to wandb.init.
PROJECT_NAME = 'Arxiv-Papers-Abstract-to-Title'

In [26]:
# Train the baseline and log per-epoch metrics to Weights & Biases.
import wandb  # imported here: this cell runs before the notebook's own `import wandb` cell

# X / y may still be Python lists of NumPy vectors at this point (lists have no
# `.to`), or may already be tensors; accept both without rebinding the globals.
X_train = X if torch.is_tensor(X) else torch.from_numpy(np.array(X))
y_train = y if torch.is_tensor(y) else torch.from_numpy(np.array(y))
X_train = X_train.to(device).float()
y_train = y_train.to(device).float()

wandb.init(project=PROJECT_NAME,name='baseline')
try:
    for _ in tqdm(range(epochs)):
        for idx in range(0,len(X_train),batch_size):
            X_batch = X_train[idx:idx+batch_size]
            y_batch = y_train[idx:idx+batch_size]
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One log call per epoch keeps both metrics on the same W&B step;
        # 'Loss' is the loss of the epoch's last mini-batch.
        wandb.log({'Loss':loss.item(),'Accuracy':accuracy(model,X_train,y_train)})
finally:
    # Close the run even if training is interrupted or raises.
    wandb.finish()

In [27]:
import wandb

In [28]:
# Train the baseline and log per-epoch metrics to Weights & Biases.
import wandb  # local import keeps this cell runnable on its own

# X / y may still be Python lists of NumPy vectors at this point (lists have no
# `.to`), or may already be tensors; accept both without rebinding the globals.
X_train = X if torch.is_tensor(X) else torch.from_numpy(np.array(X))
y_train = y if torch.is_tensor(y) else torch.from_numpy(np.array(y))
X_train = X_train.to(device).float()
y_train = y_train.to(device).float()

wandb.init(project=PROJECT_NAME,name='baseline')
try:
    for _ in tqdm(range(epochs)):
        for idx in range(0,len(X_train),batch_size):
            X_batch = X_train[idx:idx+batch_size]
            y_batch = y_train[idx:idx+batch_size]
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One log call per epoch keeps both metrics on the same W&B step;
        # 'Loss' is the loss of the epoch's last mini-batch.
        wandb.log({'Loss':loss.item(),'Accuracy':accuracy(model,X_train,y_train)})
finally:
    # Close the run even if training is interrupted or raises.
    wandb.finish()

In [29]:
# Convert the bag-of-words vectors to float tensors on `device`.
# The is_tensor guard makes the cell safe to re-run: once X / y live on the
# GPU, np.array() can no longer convert them and the original form raised.
X = (X if torch.is_tensor(X) else torch.from_numpy(np.array(X))).to(device).float()
y = (y if torch.is_tensor(y) else torch.from_numpy(np.array(y))).to(device).float()

In [30]:
def accuracy(model,X,y):
    """Exact-match accuracy of ``model`` on ``(X, y)``, as a percentage.

    A sample counts as correct when every output of the model, rounded to the
    nearest integer, equals the corresponding target value.

    Args:
        model: callable mapping a batch tensor ``X`` to a tensor of predictions.
        X: input tensor, one sample per row.
        y: target tensor with one row per sample, matching the prediction shape.

    Returns:
        Percentage of correct samples rounded to one decimal place, or 0.0 when
        there are no samples.
    """
    # Evaluation only: no autograd graph is needed for the full-dataset pass.
    with torch.no_grad():
        preds = model(X)
    total = min(len(preds), len(y))
    if total == 0:
        return 0.0
    # Compare whole rows at once. Using `pred == y_batch` in an `if` raises for
    # tensors with more than one element, and raw regression outputs would
    # essentially never equal the 0/1 targets without rounding.
    matches = (torch.round(preds[:total]) == y[:total]).reshape(total, -1).all(dim=1)
    correct = int(matches.sum().item())
    return round(100 * correct / total, 1)

In [31]:
class Model(Module):
    """Fully connected network mapping an abstract vector to a title vector.

    The input is projected to ``hidden_size`` units, the same hidden layer
    (``linear2``, weights shared) is applied ``iters`` times with ReLU, and a
    final linear layer produces the raw outputs.

    Args:
        in_features: input width; defaults to ``len(all_words_X)``.
        out_features: output width; defaults to ``len(all_words_y)``.
        hidden_size: width of the hidden representation.
        iters: number of times the shared hidden layer is applied.
    """

    def __init__(self, in_features=None, out_features=None, hidden_size=1024, iters=10):
        super().__init__()
        # Fall back to the notebook's vocabularies so `Model()` keeps working.
        if in_features is None:
            in_features = len(all_words_X)
        if out_features is None:
            out_features = len(all_words_y)
        self.activation = ReLU()
        self.iters = iters
        self.linear1 = Linear(in_features, hidden_size)
        self.linear2 = Linear(hidden_size, hidden_size)
        self.output = Linear(hidden_size, out_features)

    def forward(self,X):
        """Return the raw (un-activated) outputs for the batch ``X``."""
        preds = self.activation(self.linear1(X))
        # Re-apply the single shared hidden layer `iters` times.
        for _ in range(self.iters):
            preds = self.activation(self.linear2(preds))
        preds = self.output(preds)
        return preds

In [32]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [33]:
# Create a fresh network (replacing any earlier `model`) and move it to `device`.
model = Model().to(device)

In [34]:
# Mean-squared-error between the raw network outputs and the target vectors.
criterion = MSELoss()

In [35]:
# Adam over the parameters of the current `model`, learning rate 1e-3.
optimizer = Adam(model.parameters(),lr=0.001)

In [36]:
# Training schedule: number of passes over the data and rows per mini-batch.
epochs = 1000
batch_size = 32

In [37]:
# Weights & Biases project name passed to wandb.init.
PROJECT_NAME = 'Arxiv-Papers-Abstract-to-Title'

In [38]:
import wandb

In [39]:
# Train the baseline and log per-epoch metrics to Weights & Biases.
import wandb  # local import keeps this cell runnable on its own

# Accept X / y either as tensors or as lists of NumPy vectors, and move them to
# `device` once instead of converting every mini-batch.
X_train = X if torch.is_tensor(X) else torch.from_numpy(np.array(X))
y_train = y if torch.is_tensor(y) else torch.from_numpy(np.array(y))
X_train = X_train.to(device).float()
y_train = y_train.to(device).float()

wandb.init(project=PROJECT_NAME,name='baseline')
try:
    for _ in tqdm(range(epochs)):
        for idx in range(0,len(X_train),batch_size):
            X_batch = X_train[idx:idx+batch_size]
            y_batch = y_train[idx:idx+batch_size]
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One log call per epoch keeps both metrics on the same W&B step;
        # 'Loss' is the loss of the epoch's last mini-batch.
        wandb.log({'Loss':loss.item(),'Accuracy':accuracy(model,X_train,y_train)})
finally:
    # Close the run even if training is interrupted or raises.
    wandb.finish()