In [1]:
import wandb
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
# Shared Porter stemmer instance used by `stem`.
stemmer = PorterStemmer()
# Weights & Biases project that training runs are logged under.
PROJECT_NAME = 'NLP-with-Disaster-Tweets-V2'
# Seed the Python, NumPy and PyTorch RNGs so shuffling and weight
# initialisation are repeatable across "Restart & Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing on the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
def tokenize(sentence):
    """Lower-case ``sentence`` and split it into tokens.

    Args:
        sentence: Text to tokenize (must support ``.lower()``).

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the lower-cased text.
    """
    lowered = sentence.lower()
    return nltk.word_tokenize(lowered)

In [3]:
# Quick check of the tokenizer: the currency symbol and the number come out as separate tokens.
tokenize('$100')

['$', '100']

In [4]:
def stem(word):
    """Return the stem of ``word`` after lower-casing it.

    Args:
        word: A single token (must support ``.lower()``).

    Returns:
        The result of the module-level ``stemmer.stem`` on the lower-cased token.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
# Quick check of the stemmer on a single word.
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,words):
    """Encode a token list as a binary presence vector over a vocabulary.

    Every token is stemmed with ``stem`` before matching, so ``words`` is
    expected to contain stems.

    Args:
        tokenized_words: Iterable of raw tokens for one sentence.
        words: Sequence of vocabulary entries; position ``i`` of the result
            corresponds to ``words[i]``.

    Returns:
        A float64 NumPy array of length ``len(words)`` holding 1.0 where the
        vocabulary entry equals the stem of at least one token, else 0.0.
    """
    # A set gives O(1) membership tests; scanning a list for every vocabulary
    # entry made each call O(len(words) * len(tokenized_words)).
    present = {stem(w) for w in tokenized_words}
    return np.array([1.0 if w in present else 0.0 for w in words],dtype=float)

In [7]:
# Quick check: only the vocabulary position matching the token is set to 1.
bag_of_words(['hi'],['how','hi','how'])

array([0., 1., 0.])

In [8]:
# Load the tweets and shuffle the rows (frac=1 keeps every row, in random order).
data = (
    pd.read_csv('./data.csv')
    .sample(frac=1)
)

In [9]:
# Display the shuffled DataFrame.
data

Unnamed: 0,id,keyword,location,text,target
514,740,attacked,"SÌ£o Paulo SP, Brasil",Christian Attacked by Muslims at the Temple Mo...,1
250,355,annihilation,,U . S National Park Services Tonto National Fo...,0
4460,6343,hostages,NYC metro,Holmgren describing 96 World Cup : we were Lo...,0
1199,1725,buildings%20burning,NJ,@ themagickidraps not upset with a rally upse...,1
5073,7231,natural%20disaster,,Top insurer blasts lack of Australian Govt act...,1
...,...,...,...,...,...
5066,7223,natural%20disaster,America of Founding Fathers,This is the natural and unavoidable consequenc...,1
1615,2333,collapse,In the clouds...,@ BehindAShield @ Wars_Goddess Sweet Lord . ...,0
7548,10789,wrecked,Pennsylvania,Four hundred wrecked cars ( costing $ 100 ap...,0
2436,3499,derailed,Toronto,So derailed_benchmark is cool for paths . i w...,0


In [10]:
# Pull the raw text and target columns out of the DataFrame first...
X, y = data['text'], data['target']
# ...because `data` is rebound here from the DataFrame to an empty list that
# will collect the processed samples.
data = []
# Stem accumulator plus the label <-> index lookup tables.
words, labels, labels_r = [], {}, {}
# Running counter of how many distinct labels have been assigned an index.
idx = 0

In [15]:
# Give every distinct target value a 1-based index, in order of first
# appearance, and record the reverse mapping as well.
for label in tqdm(y):
    # Dict membership is O(1); the previous `list(labels.keys())` rebuilt a
    # list on every iteration.
    if label not in labels:
        idx += 1
        labels[label] = idx
        labels_r[idx] = label

100%|██████████| 7613/7613 [00:00<00:00, 1005834.95it/s]


In [16]:
# Number of samples per target value.
y.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [17]:
# Mapping from each target value to its 1-based class index.
labels

{1: 1, 0: 2}

In [18]:
# Reverse mapping: class index back to the original target value.
labels_r

{1: 1, 2: 0}

In [19]:
# One-hot rows built once: row k-1 encodes the class whose index in `labels`
# is k. The previous code allocated a fresh np.eye per sample and passed the
# class index as the row count, which gave the right row but hid the intent.
one_hot = np.eye(len(labels))
for text,target in zip(tqdm(X),y):
    # Tokenize the tweet and reduce every token to its stem.
    stems = [stem(token) for token in tokenize(text)]
    # Collect stems for the vocabulary (de-duplicated in a later cell).
    words.extend(stems)
    data.append([
        stems,
        # Copy so each sample owns its target vector instead of sharing a view.
        one_hot[labels[target]-1].copy()
    ])

100%|██████████| 7613/7613 [00:02<00:00, 3248.75it/s]


In [20]:
# Collapse the collected stems into a sorted vocabulary of unique entries.
words = list(set(words))
words.sort()
# Shuffle the processed samples in place.
np.random.shuffle(data)

In [21]:
# Rebind X and y to empty lists; they will hold the feature vectors and targets.
X, y = [], []

In [22]:
# Position of every vocabulary entry, so encoding a sample costs
# O(len(tokens)) instead of a scan over the whole vocabulary.
word_index = {w: i for i, w in enumerate(words)}
for d in tqdm(data):
    # d[0] already holds stems, and `words` was built from those same stems,
    # so they are matched directly. Calling bag_of_words here would stem them
    # a second time; Porter stemming is not idempotent, so some tokens would
    # stop matching their own vocabulary entry.
    bag = np.zeros(len(words))
    for token in d[0]:
        position = word_index.get(token)
        if position is not None:
            bag[position] = 1.0
    X.append(bag)
    y.append(d[1])

100%|██████████| 7613/7613 [00:21<00:00, 352.17it/s]


In [23]:
# Inspect the last sample visited by the preceding loop (`d` is its leftover loop variable).
d[0],d[1]

(['fire',
  'truck',
  'and',
  'ambul',
  'in',
  'k3',
  'phase',
  '3',
  '.',
  'hope',
  'everyon',
  "'",
  's',
  'okay',
  '.',
  '#',
  'prayforsaipan'],
 array([1., 0.]))

In [24]:
from sklearn.model_selection import *
# Hold out the last 12.5% of samples for validation; shuffle=False keeps the
# existing order of X and y.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125,shuffle=False)

def _to_float_tensor(rows):
    """Stack ``rows`` into a float32 tensor on ``device``.

    The cast to float32 happens on the host, so only half as many bytes are
    transferred and no float64 copy is ever allocated on the device.
    """
    return torch.from_numpy(np.asarray(rows,dtype=np.float32)).to(device)

X_train = _to_float_tensor(X_train)
y_train = _to_float_tensor(y_train)
X_test = _to_float_tensor(X_test)
y_test = _to_float_tensor(y_test)

In [25]:
def get_loss(model,X,y,criterion):
    """Evaluate ``criterion`` for ``model`` on ``(X, y)`` and return a Python float.

    Args:
        model: Callable mapping ``X`` to predictions.
        X: Input tensor passed to ``model``.
        y: Target tensor passed to ``criterion``.
        criterion: Callable ``criterion(preds, y)`` returning a scalar tensor.

    Returns:
        The loss value as a float (``loss.item()``).
    """
    # Evaluation only: disabling autograd avoids building a graph for the
    # whole input, which costs memory for no benefit.
    with torch.no_grad():
        preds = model(X)
        loss = criterion(preds,y)
    return loss.item()

In [26]:
def get_accuracy(model,X,y):
    """Return the percentage of samples whose predicted class matches the target.

    The class of a sample is the argmax over its (flattened) prediction or
    target vector.

    Args:
        model: Callable mapping ``X`` to per-sample score tensors.
        X: Input tensor passed to ``model``.
        y: Target tensor with one row per sample (e.g. one-hot rows).

    Returns:
        Accuracy in percent, rounded to one decimal place; 0.0 when there are
        no samples.
    """
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        preds = model(X)
        total = len(preds)
        if total == 0:
            # Avoid a ZeroDivisionError on an empty batch.
            return 0.0
        # Compare all samples at once instead of one Python iteration each.
        pred_cls = preds.reshape(total,-1).argmax(dim=1)
        true_cls = y.reshape(len(y),-1).argmax(dim=1)
        correct = int((pred_cls == true_cls).sum())
    # Round after scaling so the result has no float artefacts such as
    # 56.699999999999996.
    return round(100*correct/total,1)

In [27]:
class Model(Module):
    """Fully connected classifier over bag-of-words vectors.

    Architecture: input layer -> five hidden layers -> output layer, with a
    ReLU after every layer except the output, which returns raw scores.

    Args:
        input_size: Number of input features. Defaults to ``len(words)``
            (the notebook's vocabulary) when omitted.
        hidden: Width of every hidden layer.
        num_classes: Number of output scores. Defaults to ``len(labels)``
            when omitted.
    """
    def __init__(self,input_size=None,hidden=256,num_classes=None):
        super().__init__()
        # Fall back to the notebook globals so `Model()` keeps working.
        if input_size is None:
            input_size = len(words)
        if num_classes is None:
            num_classes = len(labels)
        self.hidden = hidden
        self.activation = ReLU()
        self.input = Linear(input_size,self.hidden)
        self.l1 = Linear(self.hidden,self.hidden)
        self.l2 = Linear(self.hidden,self.hidden)
        self.l3 = Linear(self.hidden,self.hidden)
        self.l4 = Linear(self.hidden,self.hidden)
        self.l5 = Linear(self.hidden,self.hidden)
        self.output = Linear(self.hidden,num_classes)
    
    def forward(self,X):
        """Return the raw class scores for the input batch ``X``."""
        # The input layer needs its own non-linearity: without it, `input`
        # and `l1` compose into a single linear map and one layer is wasted.
        preds = self.activation(self.input(X))
        preds = self.activation(self.l1(preds))
        preds = self.activation(self.l2(preds))
        preds = self.activation(self.l3(preds))
        preds = self.activation(self.l4(preds))
        preds = self.activation(self.l5(preds))
        preds = self.output(preds)
        return preds

In [28]:
# Training schedule.
epochs = 100
batch_size = 32
# Network on the target device, mean-squared-error loss against the target
# vectors, and an Adam optimizer over all model parameters.
model = Model().to(device)
criterion = MSELoss()
optimizer = Adam(model.parameters(),lr=1e-3)

In [None]:
wandb.init(project=PROJECT_NAME,name='baseline')
try:
    for _ in tqdm(range(epochs)):
        # One pass over the training set in fixed-order mini-batches.
        for i in range(0,len(X_train),batch_size):
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # End-of-epoch evaluation; autograd is disabled so no graph is kept
        # for the full-dataset forward passes.
        model.eval()
        with torch.no_grad():
            # Mean of the full-train metric and the last mini-batch metric.
            # The loss previously divided only the second term by 2
            # (`a + b/2`), unlike the accuracy, which used `(a + b)/2`.
            train_loss = (get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion))/2
            val_loss = get_loss(model,X_test,y_test,criterion)
            train_acc = (get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2
            val_acc = get_accuracy(model,X_test,y_test)
        # Log all metrics in one call so they share a step; separate calls
        # each advance the wandb step counter.
        wandb.log({
            'Loss':train_loss,
            'Val Loss':val_loss,
            'Acc':train_acc,
            'Val Acc':val_acc,
        })
        torch.cuda.empty_cache()
        model.train()
finally:
    # Close the run and release cached GPU memory even if training is
    # interrupted or raises.
    wandb.finish()
    torch.cuda.empty_cache()

[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m (use `wandb login --relogin` to force relogin)


  9%|▉         | 9/100 [00:07<01:08,  1.33it/s]

In [None]:
# Everything worth persisting, keyed by file stem. Saving the whole `model`
# object pickles it, so loading 'model.*' later needs the `Model` class to be
# importable; the 'model-sd.*' files hold only the state dict.
artifacts = {
    'model':model,
    'model-sd':model.state_dict(),
    'words':words,
    'data':data,
    'labels':labels,
    'idx':idx,
    'y_train':y_train,
    'y_test':y_test,
    'y':y,
}
# Write every artifact under both extensions. Previously y_train was saved
# only as '.pt' and y_test only as '.pth', unlike all the other artifacts.
for name,obj in artifacts.items():
    for ext in ('.pt','.pth'):
        torch.save(obj,name+ext)