In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
# Shared Porter stemmer instance; stem() calls it for every token.
stemmer = PorterStemmer()

In [2]:
def tokenize(sentence):
    """Split ``sentence`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(sentence)
    return tokens

In [3]:
# Sanity check: the currency symbol and the number come back as separate tokens.
tokenize('$100')

['$', '100']

In [4]:
def stem(word):
    """Lower-case ``word`` and return its stem from the shared ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
# Sanity check: 'organic' is reduced to the stem 'organ'.
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    """Encode a token list as a binary bag-of-words vector.

    Parameters
    ----------
    tokenized_words : iterable of str
        Tokens of one sentence; each is passed through ``stem`` before lookup.
    all_words : sequence of str
        Vocabulary; position ``i`` of the result corresponds to ``all_words[i]``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(all_words)`` holding 1.0 where the
        vocabulary entry occurs among the stemmed tokens and 0.0 elsewhere.
    """
    # A set makes each membership test a hash lookup instead of a scan over
    # the whole token list for every vocabulary word.
    stemmed_tokens = {stem(w) for w in tokenized_words}
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in stemmed_tokens:
            bag[idx] = 1.0
    return bag

In [7]:
# Sanity check: only the vocabulary slot for 'hi' is set.
bag_of_words(['hi'],['how','hi'])

array([0., 1.])

In [8]:
# Load the dataset; the 'Text' and 'Summary' columns are read from it further down.
data = pd.read_csv('./data.csv')

In [9]:
# Keep only the first 1000 rows.
data = data[:1000]

In [10]:
# Inputs come from the 'Text' column, targets from the 'Summary' column.
X = data['Text']
y = data['Summary']

In [11]:
# Accumulators filled by the preprocessing loop that follows.
# NOTE(review): `data` is rebound here from the DataFrame to a list; X and y
# still reference the columns selected from the DataFrame.
X_words = []  # every stemmed token seen in the texts
data = []  # one [stemmed text tokens, stemmed summary tokens] pair per row
y_words = []  # every stemmed token seen in the summaries

In [12]:
# Tokenize and stem every (text, summary) pair. The stemmed tokens are added to
# the running vocabularies and the pair itself is stored in `data`.
# zip() has no length, so the total is passed explicitly to let tqdm show
# progress and an ETA.
for X_batch,y_batch in tqdm(zip(X,y),total=len(X)):
    new_X = [stem(Xb) for Xb in tokenize(X_batch)]
    new_y = [stem(yb) for yb in tokenize(y_batch)]
    X_words.extend(new_X)
    y_words.extend(new_y)
    data.append([new_X,new_y])

In [13]:
# Collapse the token lists into sorted vocabularies of unique stems; the
# position of a stem in these lists is its index in the bag-of-words vectors.
X_words = sorted(set(X_words))
y_words = sorted(set(y_words))

In [14]:
# Shuffle the pairs in place with a fixed seed so the later order-preserving
# train/test split is the same after every kernel restart. A local generator
# is used so the global NumPy random state is left untouched.
np.random.default_rng(42).shuffle(data)

In [15]:
# X and y are rebound from the text columns to empty lists that will collect
# the bag-of-words vectors.
X = []
y = []

In [16]:
# Encode each pair as bag-of-words vectors over the text and summary vocabularies.
# NOTE(review): this appends to X and y, so re-running the cell without
# resetting them first duplicates every row.
for X_batch,y_batch in tqdm(data):
    X.append(bag_of_words(X_batch,X_words))
    y.append(bag_of_words(y_batch,y_words))

In [17]:
from sklearn.model_selection import * 

In [18]:
# Hold out 25% of the samples for testing; shuffle=False keeps the order
# produced by the earlier shuffle of `data`.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)

In [19]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
def _to_device_tensor(samples):
    """Return ``samples`` as a float32 tensor on ``device``.

    Accepts either a list/array of bag-of-words vectors or a tensor produced
    by an earlier run of this cell, so re-running the cell does not fail.
    """
    if isinstance(samples,torch.Tensor):
        return samples.to(device=device,dtype=torch.float32)
    # Cast to float32 on the host first so only half as many bytes are copied
    # to the device as with the float64 arrays.
    return torch.from_numpy(np.asarray(samples,dtype=np.float32)).to(device)


X_train = _to_device_tensor(X_train)
y_train = _to_device_tensor(y_train)
X_test = _to_device_tensor(X_test)
y_test = _to_device_tensor(y_test)

In [21]:
# torch.save(X_train,'X_train.pt')
# torch.save(X_test,'X_test.pth')
# torch.save(y_train,'y_train.pt')
# torch.save(y_test,'y_test.pth')
# torch.save(X,'X.pt')
# torch.save(X,'X.pth')
# torch.save(y,'y.pt')
# torch.save(y,'y.pth')

In [22]:
# torch.save(X_words,'X_words.pt')
# torch.save(X_words,'X_words.pth')
# torch.save(data,'data.pt')
# torch.save(data,'data.pth')
# torch.save(y_words,'y_words.pt')
# torch.save(y_words,'y_words.pth')

In [23]:
def get_accuracy(model,X,y,threshold=0.5):
    """Return the element-wise accuracy of ``model`` on ``(X, y)`` in percent.

    Predictions and targets are both binarised at ``threshold``; the score is
    the percentage of vector positions on which they agree.

    The previous implementation called ``torch.argmax`` on individual scalar
    elements, which always yields 0, so every comparison matched and the
    function reported 100% regardless of the model.

    Parameters
    ----------
    model : callable
        Maps a batch of inputs to a tensor with the same shape as ``y``.
    X : torch.Tensor
        Input batch.
    y : torch.Tensor
        Target tensor (0/1 bag-of-words vectors in this notebook).
    threshold : float, optional
        Cut-off above or at which a value counts as "word present".

    Returns
    -------
    float
        Accuracy in the range [0, 100]; 0.0 when ``X`` is empty.
    """
    if len(X) == 0:
        return 0.0
    # Evaluation only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        preds = model(X)
    predicted_present = preds >= threshold
    actually_present = y >= threshold
    agreement = (predicted_present == actually_present).float().mean().item()
    return agreement * 100.0

In [24]:
def get_loss(model,X,y,criterion):
    """Evaluate ``criterion`` on the model's predictions for ``X`` against ``y``.

    The forward pass runs under ``torch.no_grad()`` because the result is only
    reported, never back-propagated; this avoids building an autograd graph
    over the whole dataset.

    Returns
    -------
    float
        The loss as a plain Python number.
    """
    with torch.no_grad():
        preds = model(X)
        loss = criterion(preds,y)
    return loss.item()

In [25]:
class Model(Module):
    """Feed-forward network mapping a text bag-of-words to a summary bag-of-words.

    The input is projected to ``hidden_size`` features, then one shared
    Linear -> BatchNorm1d -> ReLU block is applied ``iters`` times (the same
    weights on every pass), and a final linear layer produces the output.

    Parameters
    ----------
    in_features : int, optional
        Input width; defaults to ``len(X_words)``.
    out_features : int, optional
        Output width; defaults to ``len(y_words)``.
    hidden_size : int, optional
        Width of the hidden representation (default 256).
    iters : int, optional
        How many times the shared hidden block is applied (default 10).
    """

    def __init__(self,in_features=None,out_features=None,hidden_size=256,iters=10):
        super().__init__()
        # Fall back to the notebook-level vocabularies so ``Model()`` keeps working.
        if in_features is None:
            in_features = len(X_words)
        if out_features is None:
            out_features = len(y_words)
        self.activation = ReLU()
        self.iters = iters
        self.linear1 = Linear(in_features,hidden_size)
        self.linear2 = Linear(hidden_size,hidden_size)
        self.linear2bn = BatchNorm1d(hidden_size)
        self.output = Linear(hidden_size,out_features)

    def forward(self,X):
        """Return the raw (un-activated) output scores for the batch ``X``."""
        preds = self.linear1(X)
        # The same linear2/linear2bn weights are reused on every pass.
        for _ in range(self.iters):
            preds = self.activation(self.linear2bn(self.linear2(preds)))
        preds = self.output(preds)
        return preds

In [26]:
# Training setup: model, loss, optimizer and the hyperparameters read by the training loop.
model = Model().to(device)
criterion = MSELoss()  # NOTE(review): targets are 0/1 bag-of-words vectors; confirm MSE is the intended loss
optimizer = Adam(model.parameters(),lr=0.001)
batch_size = 32  # samples per optimizer step
epochs = 1  # full passes over X_train

In [27]:
import wandb
# Weights & Biases project under which the training run is logged.
PROJECT_NAME = 'Summarize-Text-Review'

In [28]:
# Train with mini-batches and, after every epoch, log train/validation loss
# and accuracy to Weights & Biases.
wandb.init(project=PROJECT_NAME,name='baseline')
try:
    for _ in tqdm(range(epochs)):
        for i in range(0,len(X_train),batch_size):
            X_batch = X_train[i:i+batch_size].to(device)
            y_batch = y_train[i:i+batch_size].to(device)
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Evaluate in eval mode (BatchNorm uses running statistics) and without
        # autograd bookkeeping, since nothing here is back-propagated.
        model.eval()
        torch.cuda.empty_cache()
        with torch.no_grad():
            epoch_metrics = {
                'Loss':get_loss(model,X_train,y_train,criterion),
                'Val Loss':get_loss(model,X_test,y_test,criterion),
                'Acc':get_accuracy(model,X_train,y_train),
                'Val Acc':get_accuracy(model,X_test,y_test),
            }
        # One log call per epoch keeps all four metrics on the same step;
        # separate calls would each advance the step counter.
        wandb.log(epoch_metrics)
        torch.cuda.empty_cache()
        model.train()
finally:
    # Close the run even if training is interrupted or raises.
    wandb.finish()