In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
stemmer = PorterStemmer()

In [2]:
def tokenize(word):
    return nltk.word_tokenize(word)

In [3]:
tokenize('how are you ? bro ')

['how', 'are', 'you', '?', 'bro']

In [4]:
def stem(word):
    return stemmer.stem(word.lower())

In [5]:
stem('how'),stem('are'),stem('you'),stem('?'),stem('bro')

('how', 'are', 'you', '?', 'bro')

In [6]:
def bag_of_words(tokenized_word,all_words):
    tokenized_words = []
    for tokenized_w in tokenized_word:
        tokenized_words.append(stem(tokenized_w))
    bag = np.zeros(len(all_words),dtype=np.float32)
    for idx,w in enumerate(tokenized_words):
        if w in all_words:
            bag[idx] = 1.0
    return bag

In [7]:
bag_of_words(['bro'],tokenize('how are you ? bro '))

array([1., 0., 0., 0., 0.], dtype=float32)

In [8]:
batch_size = 32
epochs = 250
device = 'cuda'
new_data = []
all_words = []
tags = []
ignore_words = ["?", "!", ".", "@", "#", "$", "%", "^", "&", "*"]
data = pd.read_csv('./data/train_data_cleaning.csv')
X = data['text'].tolist()
y = data['target'].tolist()

In [9]:
for X_batch,y_batch in tqdm(zip(X,y)):
    X_batch = tokenize(X_batch)
    new_X = []
    for x_batch in X_batch:
        if x_batch in ignore_words:
            pass
        else:
            new_X.append(stem(x_batch))
        all_words.extend(new_X)
    tags.append(y_batch)
    new_data.append((new_X,y_batch))

7613it [00:02, 3191.32it/s]


In [10]:
tags = sorted(set(tags))

In [11]:
tags

[0, 1]

In [12]:
all_words = sorted(set(all_words))

In [13]:
np.random.shuffle(new_data)

In [14]:
X = []
y = []

In [None]:
for sent,tag in tqdm(new_data):
    bag = bag_of_words(sent,all_words)
    X.append(bag)
    label = tags.index(tag)
    y.append(label)

 40%|████      | 3061/7613 [00:04<00:07, 615.74it/s]

In [None]:
X[:5]

In [None]:
y[:5]

In [None]:
len(all_words)

In [None]:
from sklearn.model_selection import *

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125,shuffle=False)

In [None]:
X_train = torch.from_numpy(np.array(X_train)).to(device).float()
X_test = torch.from_numpy(np.array(X_test)).to(device).float()
y_train = torch.from_numpy(np.array(y_train)).to(device).float()
y_test = torch.from_numpy(np.array(y_test)).to(device).float()

In [None]:
def get_loss(model,X,y,criterion):
    preds = model(X)
    loss = criterion(preds.view(-1,1),y.view(-1,1))
    return loss.item()

In [None]:
def get_accuracy(model,X,y):
    preds = model(X)
    correct = -1
    total = -1
    for pred,y_batch in zip(preds,y):
        pred = int(torch.round(pred))
        y_batch = int(torch.round(y_batch))
        if pred == y_batch:
            correct += 1
        total += 1
    acc = round(correct/total,3)
    acc = acc*100
    return acc

In [None]:
class Model(Module):
    def __init__(self):
        super().__init__()
        self.activation = ReLU()
        self.linear1 = Linear(len(all_words),128)
        self.linear2 = Linear(128,256)
        self.linear3 = Linear(256,512)
        self.linear4 = Linear(512,1024)
        self.linear5 = Linear(1024,1)
        self.output_activation = Sigmoid()
    
    def forward(self,X):
        preds = self.activation(self.linear1(X))
        preds = self.activation(self.linear2(preds))
        preds = self.activation(self.linear3(preds))
        preds = self.activation(self.linear4(preds))
        preds = self.output_activation(self.linear5(preds))
        return preds

In [None]:
model = Model().to(device)

In [None]:
criterion = MSELoss()

In [None]:
optimizer = Adam(model.parameters(),lr=0.001)

In [None]:
PROJECT_NAME = 'NLP-with-Disaster-Tweets-Clf'

In [None]:
import wandb

In [None]:
wandb.init(project=PROJECT_NAME,name=f'baseline-{criterion}-no-backward-pass')
wandb.watch(model)
for _ in tqdm(range(epochs)):
    for idx in range(0,len(X_train),batch_size):
        X_batch = X_train[idx:idx+batch_size].to(device).float()
        y_batch = y_train[idx:idx+batch_size].to(device).float()
        preds = model(X_batch)
        preds = preds.to(device)
        loss = criterion(preds.view(-1,1),y_batch.view(-1,1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    wandb.log({'Loss':get_loss(model,X_train,y_train,criterion)})
    wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
    wandb.log({'Acc':get_accuracy(model,X_train,y_train)})
    wandb.log({'Val Acc':get_accuracy(model,X_test,y_test)})
wandb.watch(model)
wandb.finish()

In [None]:
torch.round(preds)

In [None]:
y_batch

In [None]:
X_batch