In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
# Single module-level Porter stemmer instance; stem() below calls its .stem() method.
stemmer = PorterStemmer()

In [2]:
def tokenize(sentence):
    """Lower-case ``sentence`` and split it into tokens with ``nltk.word_tokenize``."""
    lowered = sentence.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens

In [3]:
# Quick check: the currency symbol and the number come out as separate tokens.
tokenize('hi how $1000')

['hi', 'how', '$', '1000']

In [4]:
def stem(word):
    """Return the stem of ``word`` (lower-cased first) using the shared ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
# Quick check of the stemmer on a single word.
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    """Encode one tokenised sentence as a binary bag-of-words vector.

    Parameters
    ----------
    tokenized_words : iterable of str
        Tokens of a single sentence; each one is passed through ``stem`` before lookup.
    all_words : sequence of str
        Vocabulary. Position ``i`` of the result corresponds to ``all_words[i]``.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(all_words)`` holding 1.0 where the vocabulary entry
        occurs among the stemmed tokens and 0.0 everywhere else.
    """
    # A set gives constant-time membership tests; the previous list lookup rescanned
    # every token of the sentence for every vocabulary word.
    stemmed_tokens = {stem(w) for w in tokenized_words}
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in stemmed_tokens:
            bag[idx] = 1.0
    return bag

In [7]:
# Quick check: only the first vocabulary entry occurs in the sentence.
bag_of_words(['hi'],['hi','how'])

array([1., 0.])

In [8]:
# Load the dataset from a CSV file located relative to the working directory.
data = pd.read_csv('./cleaned-data.csv')

In [9]:
# Keep only the two columns used below: the text and its score.
data = data[['Text','Score']]

In [10]:
# data.to_csv('./cleaned-data.csv',index=False)

In [11]:
# X is the text column (model input), y the score column (used as the class label below).
X = data['Text']
y = data['Score']

In [12]:
# Accumulators filled by the corpus loop below.
all_words = []  # stemmed tokens of the whole corpus (deduplicated later)
all_data = []  # one [token list, score] pair per row
tags = []  # score of every row (deduplicated later)

In [None]:
# Build the vocabulary (stemmed tokens) and the per-row [tokens, score] pairs.
# all_data keeps the *unstemmed* tokens: bag_of_words() stems its input itself, and the
# Porter stemmer is not idempotent, so storing stems here would stem every token twice
# and let some of them miss their (once-stemmed) vocabulary entry.
for X_batch,y_batch in tqdm(zip(X,y)):
    tokens = tokenize(X_batch)
    all_words.extend(stem(token) for token in tokens)
    all_data.append([tokens,y_batch])
    tags.append(y_batch)

1469it [00:01, 813.98it/s]

In [None]:
# NOTE(review): the sorted(set(all_words)) step further down discards this ordering,
# so this shuffle has no lasting effect on all_words.
np.random.shuffle(all_words)

In [None]:
# Shuffle the [tokens, score] rows in place.
# NOTE(review): no seed is set in the visible cells, so the order differs between runs.
np.random.shuffle(all_data)

In [None]:
# Reduce both lists to their unique entries in ascending order:
# all_words becomes the vocabulary, tags the list of distinct labels.
all_words = list(set(all_words))
all_words.sort()
tags = list(set(tags))
tags.sort()

In [None]:
# Randomise the vocabulary order; this order decides which feature index each word gets.
# NOTE(review): the order is not persisted in the visible cells, so a saved model can only
# be reused together with this exact all_words list.
np.random.shuffle(all_words)

In [None]:
# Second in-place shuffle of the rows (all_data was already shuffled once above).
np.random.shuffle(all_data)

In [None]:
# Re-bind X and y (until now the pandas columns) to empty lists that will receive
# the encoded feature vectors and label vectors.
X = []
y = []

In [None]:
# Number of distinct labels; also the length of each label vector built below.
len(tags)

In [None]:
# Scratch check of np.eye: row 3 of a 4x5 matrix with ones on the main diagonal,
# i.e. a length-5 vector whose only 1 is at index 3.
np.eye(4,5)[3]

In [None]:
# np.eye(tags.index(tag),len(tags))

In [None]:
# Encode every sample as (bag-of-words features, one-hot label).
# The label is a row of the square identity matrix, so class i maps to index i.
# The earlier np.eye(i+1, n)[i-1] form gave classes 0 and 1 the same vector and never
# produced a 1 at the last index.
tag_to_index = {tag_value: position for position,tag_value in enumerate(tags)}
one_hot_rows = np.eye(len(tags))
for sentence,tag in tqdm(all_data):
    X.append(bag_of_words(sentence,all_words))
    # copy() so each label owns its data instead of being a view into one_hot_rows
    y.append(one_hot_rows[tag_to_index[tag]].copy())

In [None]:
# Inspect the first encoded feature vector.
X[0]

In [None]:
# Inspect the first encoded label vector.
y[0]

In [None]:
# One-hot vector for the last label seen by the encoding loop (`tag` is left over from it).
# Indexing the square identity matrix puts the 1 at the label's own index.
np.eye(len(tags))[tags.index(tag)]

In [None]:
# Position of the last-processed label in tags (`tag` is the leftover loop variable).
tags.index(tag)

In [None]:
# Display the sorted list of distinct labels.
tags

In [None]:
from sklearn.model_selection import *

In [None]:
# Hold out 25% of the samples for evaluation.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def _to_float_tensor(rows):
    """Return ``rows`` as a float32 tensor on ``device``.

    ``rows`` is either a sequence of equally sized numpy arrays (first run of this cell)
    or an existing tensor (re-run of this cell), which makes the cell safe to execute twice.
    """
    if isinstance(rows,torch.Tensor):
        return rows.float().to(device)
    # Cast on the host first so the float32 data, not the wider numpy array, is copied over.
    return torch.from_numpy(np.array(rows)).float().to(device)

# Convert the four splits; each device transfer happens exactly once.
X_train = _to_float_tensor(X_train)
y_train = _to_float_tensor(y_train)
X_test = _to_float_tensor(X_test)
y_test = _to_float_tensor(y_test)

In [None]:
def get_loss(model,X,y,criterion):
    """Return ``criterion(model(X), y)`` as a Python float, computed in evaluation mode.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate. Its train/eval mode is restored before returning.
    X, y : torch.Tensor
        Inputs and targets, passed to ``model`` and ``criterion`` unchanged.
    criterion : callable
        Loss function called as ``criterion(preds, y)``; must return a one-element tensor.

    Returns
    -------
    float
        The loss value obtained via ``.item()``.
    """
    was_training = model.training
    # eval(): BatchNorm layers use their running statistics and do not update them
    # from the data being measured.
    model.eval()
    try:
        # no_grad(): this is a measurement only, so no autograd graph is needed.
        with torch.no_grad():
            preds = model(X)
            loss = criterion(preds,y)
    finally:
        model.train(was_training)
    return loss.item()

In [None]:
def get_accuracy(model,X,y):
    """Return the percentage of rows whose predicted class equals the target class.

    The class of a row is the index of its largest value, both for the model output
    and for the target ``y``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate. Its train/eval mode is restored before returning.
    X : torch.Tensor
        Inputs passed to ``model`` unchanged.
    y : torch.Tensor
        Targets with one row per prediction.

    Returns
    -------
    float
        ``round(correct/total, 3) * 100``, or 0.0 when there are no rows to compare.
    """
    was_training = model.training
    # Evaluate without touching BatchNorm running statistics or building an autograd graph.
    model.eval()
    try:
        with torch.no_grad():
            preds = model(X)
    finally:
        model.train(was_training)
    # Compare only as many rows as both tensors provide (what zip() did in the loop version).
    total = min(len(preds),len(y))
    if total == 0:
        return 0.0
    # One vectorised argmax per tensor instead of one Python-level argmax per row.
    pred_classes = preds[:total].reshape(total,-1).argmax(dim=1)
    true_classes = y[:total].reshape(total,-1).argmax(dim=1)
    correct = int((pred_classes == true_classes).sum())
    acc = round(correct/total,3)*100
    return acc

In [None]:
class Model(Module):
    """Fully connected classifier: input layer, a repeated hidden block, output layer.

    The hidden block (``linaer2`` -> ``bc`` -> activation) is a single set of weights
    applied ``iters`` times in a row.

    Parameters
    ----------
    input_size : int, optional
        Number of input features. Defaults to ``len(all_words)`` (notebook global).
    num_classes : int, optional
        Number of output units. Defaults to ``len(tags)`` (notebook global).
    hidden_size : int, optional
        Width of the hidden layers (default 512).
    iters : int, optional
        How many times the hidden block is applied in ``forward`` (default 12).
    """
    def __init__(self,input_size=None,num_classes=None,hidden_size=512,iters=12):
        super().__init__()
        # Fall back to the notebook globals so the existing ``Model()`` call keeps working.
        if input_size is None:
            input_size = len(all_words)
        if num_classes is None:
            num_classes = len(tags)
        self.iters = iters
        self.activation = ReLU()
        self.linear1 = Linear(input_size,hidden_size)
        # "linaer2" is a misspelling of "linear2"; the name is kept because it is part of
        # the state_dict keys, and renaming it would break loading of saved weights.
        self.linaer2 = Linear(hidden_size,hidden_size)
        self.bc = BatchNorm1d(hidden_size)
        self.output = Linear(hidden_size,num_classes)

    def forward(self,X):
        """Return the raw (un-normalised) output scores for the batch ``X``."""
        hidden = self.activation(self.linear1(X))
        for _ in range(self.iters):
            hidden = self.activation(self.bc(self.linaer2(hidden)))
        return self.output(hidden)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Training setup: model instance on the target device, loss, optimiser and loop sizes.
model = Model().to(device)
# NOTE(review): MSELoss on one-hot targets treats the classification task as regression;
# CrossEntropyLoss is the usual choice — confirm this is intentional.
criterion = MSELoss()
optimizer = Adam(model.parameters(),lr=0.001)
batch_size = 32  # rows per optimisation step in the training loop
epochs = 100  # full passes over X_train in the training loop

In [None]:
# Project name passed to wandb.init() in the training cell.
PROJECT_NAME = 'NLP-Topic-Modelling'

In [None]:
import wandb

In [None]:
# NOTE(review): this training loop is commented out, so nothing in the visible cells
# trains `model` before it is saved below. The 'Loss' line used to compute `a + b/2`;
# the parentheses now average the two terms, matching the 'Acc' line.
# wandb.init(project=PROJECT_NAME,name='baseline')
# for _ in tqdm(range(epochs)):
#     for idx in range(0,len(X_train),batch_size):
#         X_batch = X_train[idx:idx+batch_size].to(device).float()
#         y_batch = y_train[idx:idx+batch_size].to(device).float()
#         preds = model(X_batch)
#         loss = criterion(preds,y_batch)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#     torch.cuda.empty_cache()
#     wandb.log({'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion))/2})
#     wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
#     wandb.log({'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2})
#     wandb.log({'Val Acc':get_accuracy(model,X_test,y_test)})
# wandb.finish()

In [None]:
# Save the whole model object twice, under two file extensions.
# NOTE(review): the training loop above is commented out, so when the notebook is run
# top to bottom these files hold freshly initialised weights.
torch.save(model,'model.pt')
torch.save(model,'model.pth')

In [None]:
# Save only the parameters and buffers (state_dict) twice, under two file extensions.
# Loading them requires building a Model with the same layer sizes first.
torch.save(model.state_dict(),'model-sd.pt')
torch.save(model.state_dict(),'model-sd.pth')