In [1]:
import wandb
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
stemmer = PorterStemmer()
PROJECT_NAME = 'NLP-with-Disaster-Tweets-cleaning-data-V3'
device = 'cuda'

In [2]:
def tokenize(sentence):
    return nltk.word_tokenize(sentence.lower())

In [3]:
tokenize("@100")

['@', '100']

In [4]:
def stem(word):
    return stemmer.stem(word.lower())

In [5]:
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,words):
    tokenized_words = [stem(w) for w in tokenized_words]
    bag = np.zeros(len(words))
    for idx,w in enumerate(words):
        if w in tokenized_words:
            bag[idx] = 1.0
    return bag

In [7]:
bag_of_words(['hi'],['hi','how','hi'])

array([1., 0., 1.])

In [8]:
data = pd.read_csv('./data.csv').sample(frac=1)

In [9]:
X = data['text']

In [10]:
data

Unnamed: 0,id,keyword,location,text,target
2464,3532,derailment,Mumbai,Mumbai24x7 Helping Hand : In Mumbai 2 TTEs ta...,1
3726,5297,fear,New York. NY,my biggest fear is that eventually you will se...,0
4575,6505,injuries,,@ nalathekoala As a health care professional ...,0
2661,3822,detonate,Brasil,. @ no_periferico Apollo Brown - ' Detona...,0
3713,5281,fear,Stanford University,Help me win $ by having the most shares on m...,0
...,...,...,...,...,...
1144,1648,bombing,,Japan Marks 70th Anniversary of Hiroshima Atom...,1
4640,6597,inundated,The Main,I presume my timeline will be inundated with ...,0
6132,8750,siren,,Lol ' @ j2bone : * cousin ' @ Foxy_Sire...,0
347,498,army,,One Direction Is my pick for Fan Army # Dire...,0


In [11]:
y = data['target']

In [12]:
words = []
data = []
labels = {}
labels_r = {}
idx = 0

In [13]:
for label in y.tolist():
    if label not in list(labels.keys()):
        idx += 1
        labels[label] = idx
        labels_r[idx] = label

In [14]:
labels

{1: 1, 0: 2}

In [15]:
for X_batch,y_batch in tqdm(zip(X,y)):
    X_batch = tokenize(X_batch)
    new_X = []
    for Xb in X_batch:
        new_X.append(stem(Xb))
    words.extend(new_X)
    data.append([
        new_X,
        np.eye(labels[y_batch],len(labels))[labels[y_batch]-1]
    ])

7613it [00:02, 3291.53it/s]


In [16]:
words = sorted(set(words))

In [17]:
np.random.shuffle(data)

In [18]:
X = []
y = []

In [19]:
for sentence,tag in tqdm(data):
    X.append(bag_of_words(sentence,words))
    y.append(tag)

100%|██████████| 7613/7613 [00:20<00:00, 368.21it/s]


In [20]:
X[590]

array([0., 0., 0., ..., 0., 0., 0.])

In [21]:
y[0]

array([0., 1.])

In [22]:
from sklearn.model_selection import *
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125,shuffle=False)
X_train = torch.from_numpy(np.array(X_train)).to(device).float()
y_train = torch.from_numpy(np.array(y_train)).to(device).float()
X_test = torch.from_numpy(np.array(X_test)).to(device).float()
y_test = torch.from_numpy(np.array(y_test)).to(device).float()

In [23]:
def get_loss(model,X,y,criterion):
    preds = model(X)
    loss = criterion(preds,y)
    return loss.item()

In [24]:
def get_accuracy(model,X,y):
    preds = model(X)
    correct = 0
    total = 0
    for pred,yb in zip(preds,y):
        pred = int(torch.argmax(pred))
        yb = int(torch.argmax(yb))
        if pred == yb:
            correct += 1
        total += 1
    acc = round(correct/total,3)*100
    return acc

In [33]:
class Model(Module):
    def __init__(self):
        super().__init__()
        self.hidden = 1024
        self.activation = ReLU()
        self.linear1 = Linear(len(words),self.hidden)
        self.linear2 = Linear(self.hidden,self.hidden)
        self.linear3 = Linear(self.hidden,len(labels))
    
    def forward(self,X):
        preds = self.linear1(X)
        preds = self.linear2(preds)
        print(preds[0:5])
        preds = self.activation(preds)
        print(preds[0:5])
        preds = self.linear3(preds)
        return preds

In [34]:
model = Model().to(device)
criterion = MSELoss()
optimizer = Adam(model.parameters(),lr=0.001)
epochs = 100
batch_size = 32

In [34]:
wandb.init(project=PROJECT_NAME,name='baseline')
for _ in tqdm(range(epochs)):
    for i in range(0,len(X_train),batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        preds = model(X_batch)
        loss = criterion(preds,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    torch.cuda.empty_cache()
    wandb.log({'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion)/2)})
    torch.cuda.empty_cache()
    wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
    torch.cuda.empty_cache()
    wandb.log({'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2})
    torch.cuda.empty_cache()
    wandb.log({'Val Acc':get_accuracy(model,X_test,y_test)})
    torch.cuda.empty_cache()
    model.train()
wandb.finish()

[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/100 [00:00<?, ?it/s]

tensor([[ 0.0186, -0.0202,  0.0169,  ..., -0.0113,  0.0170,  0.0334],
        [ 0.0148, -0.0333, -0.0196,  ..., -0.0159,  0.0110,  0.0250],
        [ 0.0142, -0.0241,  0.0073,  ...,  0.0012,  0.0065,  0.0637],
        [ 0.0391, -0.0097, -0.0113,  ..., -0.0065,  0.0039,  0.0221],
        [ 0.0180, -0.0127, -0.0013,  ...,  0.0013,  0.0140,  0.0418]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.0186, 0.0000, 0.0169,  ..., 0.0000, 0.0170, 0.0334],
        [0.0148, 0.0000, 0.0000,  ..., 0.0000, 0.0110, 0.0250],
        [0.0142, 0.0000, 0.0073,  ..., 0.0012, 0.0065, 0.0637],
        [0.0391, 0.0000, 0.0000,  ..., 0.0000, 0.0039, 0.0221],
        [0.0180, 0.0000, 0.0000,  ..., 0.0013, 0.0140, 0.0418]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.0214, -0.0144, -0.0032,  ...,  0.0054,  0.0150,  0.0289],
        [ 0.0537, -0.0286,  0.0175,  ..., -0.0243,  0.0167,  0.0104],
        [ 0.0539, -0.0240,  0.0234,  ..., -0.0025,  0.0223,  0.0340],
        [ 0.0443, -0

  1%|          | 1/100 [00:02<04:13,  2.56s/it]

tensor([[ 2.4820e-01,  1.1769e-01,  8.8290e-02,  ..., -3.9027e-01,
          2.1660e-02, -2.3690e-01],
        [ 1.5009e-01, -1.6044e-01,  4.4350e-04,  ..., -6.1833e-01,
          3.4595e-03, -3.7076e-01],
        [ 2.0999e-01, -3.4820e-02, -2.4582e-02,  ..., -4.1214e-01,
          2.4402e-02, -2.5770e-01],
        [ 1.8222e-01, -1.1182e-01,  9.4255e-03,  ..., -3.0962e-01,
          8.8947e-02, -1.7762e-01],
        [ 1.0455e-01, -1.7640e-01,  5.5050e-03,  ..., -3.2783e-01,
          9.0197e-02, -1.6249e-01]], device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.2482, 0.1177, 0.0883,  ..., 0.0000, 0.0217, 0.0000],
        [0.1501, 0.0000, 0.0004,  ..., 0.0000, 0.0035, 0.0000],
        [0.2100, 0.0000, 0.0000,  ..., 0.0000, 0.0244, 0.0000],
        [0.1822, 0.0000, 0.0094,  ..., 0.0000, 0.0889, 0.0000],
        [0.1045, 0.0000, 0.0055,  ..., 0.0000, 0.0902, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.3254,  0.0388,  0.0220,  ..., -0.5042,  0.0713, -0.3411],
  

  2%|▏         | 2/100 [00:05<04:09,  2.55s/it]

tensor([[ 0.3939, -1.2553, -0.5103,  ..., -0.7217,  0.0370, -0.0833],
        [ 0.2596, -1.7337,  0.1044,  ..., -1.4164, -0.5026, -0.1877],
        [ 0.3677, -1.5951, -0.5446,  ..., -0.9677, -0.0821, -0.3244],
        [ 0.1538, -0.0938,  0.0490,  ..., -0.0942,  0.2876, -0.3570],
        [ 0.1597, -0.8044,  0.6007,  ..., -0.5302,  0.0974, -0.4772]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.3939, 0.0000, 0.0000,  ..., 0.0000, 0.0370, 0.0000],
        [0.2596, 0.0000, 0.1044,  ..., 0.0000, 0.0000, 0.0000],
        [0.3677, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1538, 0.0000, 0.0490,  ..., 0.0000, 0.2876, 0.0000],
        [0.1597, 0.0000, 0.6007,  ..., 0.0000, 0.0974, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.4275, -1.5061, -0.3267,  ..., -0.9611, -0.0211, -0.2793],
        [ 0.2671, -1.3980, -0.1355,  ..., -1.0389, -0.0746, -0.2899],
        [ 0.2282, -0.6124,  0.0522,  ..., -0.6195,  0.0595, -0.2570],
        [ 0.0762, -0

  3%|▎         | 3/100 [00:07<04:04,  2.52s/it]

tensor([[ 0.0790, -1.2724, -1.7570,  ..., -0.9011, -0.3282, -0.8802],
        [ 0.4549, -1.2814, -0.6999,  ..., -1.2892, -0.2336, -0.6268],
        [ 0.3358, -1.1201, -1.3190,  ..., -0.8913,  0.1026, -0.5003],
        [ 0.0868, -0.3177, -0.3953,  ..., -0.3371, -0.1887, -0.0978],
        [ 0.2594, -1.3852, -0.8787,  ..., -0.9661, -0.0698, -0.2928]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.0790, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.4549, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3358, 0.0000, 0.0000,  ..., 0.0000, 0.1026, 0.0000],
        [0.0868, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2594, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.3632, -1.0179, -1.6292,  ..., -1.0820, -0.2638, -0.6648],
        [ 0.2632, -0.6798, -1.0448,  ..., -1.1761, -0.1828, -0.5572],
        [ 0.3011, -0.5715, -0.7829,  ..., -0.9112,  0.0407, -0.3774],
        [ 0.2151, -0

  4%|▍         | 4/100 [00:09<03:57,  2.47s/it]

tensor([[-0.7676, -0.7163, -1.6095,  ..., -1.1363, -1.3689, -1.8315],
        [ 0.0574, -2.1768, -1.2748,  ..., -0.9963, -0.8770, -1.1814],
        [ 0.1129, -1.0795, -1.0889,  ..., -0.8245, -0.3309, -0.9333],
        [ 0.1222, -0.4475, -0.7540,  ..., -0.9027, -0.6403, -1.5805],
        [ 0.2304, -1.5212, -1.2179,  ..., -1.4009, -0.8859, -1.6864]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0574, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1129, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1222, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2304, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[-0.1894, -1.6752, -2.1909,  ..., -1.3988, -1.1073, -2.1320],
        [ 0.0492, -0.8714, -1.4257,  ..., -1.0765, -0.4671, -1.6432],
        [ 0.2479, -1.0313, -1.0707,  ..., -1.1015, -0.4063, -1.0217],
        [ 0.3849, -0

  5%|▌         | 5/100 [00:12<03:57,  2.50s/it]

tensor([[-1.0346, -0.4737, -0.6918,  ..., -1.2739, -1.3539, -1.6163],
        [-0.1289, -1.2583, -0.2027,  ..., -0.6161, -0.2789, -0.2433],
        [-0.0314, -1.2595, -1.1129,  ..., -1.1741, -0.0842, -0.9742],
        [-0.3887, -0.3142, -0.7899,  ..., -1.0635, -0.4772, -2.1486],
        [-0.2023, -0.7255, -1.2147,  ..., -1.3717, -0.9647, -2.6062]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<SliceBackward>)
tensor([[-0.5527, -1.6780, -2.2852,  ..., -1.7519, -0.8708, -2.6625],
        [-0.4230, -0.9219, -1.6436,  ..., -1.4040, -0.2954, -1.9340],
        [ 0.1560, -0.8558, -1.0233,  ..., -1.0854, -0.2753, -1.3161],
        [ 0.4322, -0.5571, -0.7326,  ..., -0.8094, -0.0605, -1.1222],
        [ 0.0216, -1.2765, -1.0931,  ..., -1.3227, -1.2833, -2.4737]],

  5%|▌         | 5/100 [00:14<04:34,  2.89s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



tensor([[-1.0264, -2.4408, -1.8271,  ..., -1.4514, -1.7969, -2.7245],
        [ 0.3592, -1.7590, -1.8547,  ..., -1.5874, -0.5175, -1.9418],
        [-0.5528, -1.1953, -0.5369,  ..., -0.5805, -0.8463, -2.0689],
        [ 0.6167, -0.6244, -0.3255,  ..., -0.4856,  0.0379, -0.2564],
        [ 0.6028, -1.7608, -1.1353,  ..., -1.3753, -0.3071, -1.4173]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3592, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.6167, 0.0000, 0.0000,  ..., 0.0000, 0.0379, 0.0000],
        [0.6028, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.5244, -0.7417, -0.3291,  ..., -0.7971, -0.1471, -0.9170],
        [-0.1233, -1.1444, -1.1123,  ..., -1.1598, -0.8700, -1.6022],
        [-0.1303, -1.4499, -1.1065,  ..., -1.0746, -0.9518, -2.0422],
        [-1.0272, -2

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-35-15c1e5c8ae99>", line 6, in <module>
    preds = model(X_batch)
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "<ipython-input-33-d6e8baad6a6a>", line 13, in forward
    print(preds[0:5])
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/torch/_tensor.py", line 203, in __repr__
    return torch._tensor_str._str(self)
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/torch/_tensor_str.py", line 406, in _str
    return _str_intern(self)
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/torch/_tensor_str.py", line 381, in _str_intern
    tensor_str = _tensor_str(self, indent)
  File "/media/in

TypeError: object of type 'NoneType' has no len()

Exception in thread Thread-14:
Traceback (most recent call last):
  File "/media/indika/Sync/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/media/indika/Sync/anaconda3/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 167, in check_status
Exception in thread     Thread-15:
Traceback (most recent call last):
  File "/media/indika/Sync/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
status_response = self._interface.communicate_stop_status()
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 114, in communicate_stop_status
    resp = self._communicate_stop_status(status)
  File "/media/indika/Sync/anaconda3/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 968, in _communicate_stop_status
    self.run()
  File "/medi

In [28]:
torch.save(model,'model.pt')
torch.save(model,'model.pth')
torch.save(model.state_dict(),'model-sd.pt')
torch.save(model.state_dict(),'model-sd.pth')
torch.save(X,'X.pt')
torch.save(X,'X.pth')
torch.save(y,'y.pt')
torch.save(y,'y.pth')

In [29]:
torch.save(X_train,'X_train.pt')
torch.save(X_test,'X_test.pth')
torch.save(y_train,'y_train.pt')
torch.save(y_test,'y_test.pth')

In [30]:
torch.save(words,'words.pt')
torch.save(words,'words.pth')
torch.save(data,'data.pt')
torch.save(data,'data.pth')
torch.save(labels,'labels.pt')
torch.save(labels,'labels.pth')

In [31]:
torch.save(idx,'idx.pt')
torch.save(idx,'idx.pth')