In [210]:
import pandas as pd
import numpy as np
import os
import torch
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
import spacy
from spacy.tokenizer import Tokenizer
from torchtext.data import Field,TabularDataset,BucketIterator
import time
from torch.nn.utils.rnn import pack_padded_sequence
from sklearn.model_selection import train_test_split

In [95]:
df=pd.read_csv("nlp-getting-started/real_train.csv")
df.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [6]:
# df.drop(["Unnamed: 0","id"],axis=1,inplace=True)

In [96]:
# df.to_csv("nlp-getting-started/real_train.csv",index=False)

In [97]:
# df.drop(["Unnamed: 0"],inplace=True,axis=1)

In [98]:
df.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [99]:
nlp=spacy.load("en_core_web_sm")

tokenizer=Tokenizer(nlp.vocab)

In [100]:
def spacy_tokenizer(x):
    return [tok.text for tok in tokenizer(x)]

In [101]:
TEXT=Field(sequential=False,use_vocab=False,batch_first=True,dtype=torch.float)
LABEL=Field(tokenize=spacy_tokenizer,lower=True,include_lengths=True,batch_first=True)

In [102]:
fields=[("index",LABEL),("text",TEXT),("targets",LABEL)]

{'text': '3', 'targets': ['13000', 'people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california']}


In [103]:
train_df,valid_df=train_test_split(df)

In [70]:
TEXT2 = data.Field(tokenize = spacy_tokenizer, include_lengths = True)
LABEL2 = data.LabelField(dtype = torch.float)

In [91]:
fields2 = [('text',TEXT2), ('label',LABEL2)]



In [104]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)
# We will first create

In [109]:
train_ds, val_ds = DataFrameDataset.splits(fields2, train_df=train_df, val_df=valid_df)

In [116]:
print(vars(train_ds[13]))

print(type(train_ds[15]))

{'text': ['join', 'charity', '10k', 'run', 'event', 'doningtondash', '11am', 'start', 'sun', '20', 'sept', '2015', 'castle', 'donington', 'community', 'first', 'responders'], 'label': 0}
<class 'torchtext.data.example.Example'>


In [121]:
training_data=TabularDataset(path="nlp-getting-started/real_train.csv",format="csv",fields=fields2,skip_header=False)

print(vars(training_data.examples[3]))

{'text': ['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected'], 'label': '1'}


In [123]:
for i in range(3):
    print(vars(training_data.examples[i+1]))

{'text': ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all'], 'label': '1'}
{'text': ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada'], 'label': '1'}
{'text': ['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected'], 'label': '1'}


In [124]:
df2=pd.read_csv("nlp-getting-started/real_train.csv")
df2.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [125]:
TEXT3=Field(sequential=spacy_tokenizer,include_lengths=True)
LABEL3=data.LabelField(dtype=torch.float)

In [127]:
fields3=[("text",TEXT3),("label",LABEL3)]

In [132]:
training_data3=TabularDataset(path="nlp-getting-started/real_train.csv",format="csv",fields=fields3,skip_header=False)

print(vars(training_data3.examples[4]))

{'text': ['13000', 'people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california'], 'label': '1'}


In [153]:
MAX_VOCAB_SIZE = 25000

TEXT2.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)

In [154]:
LABEL2.build_vocab(train_ds)

In [155]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [198]:
train_iterator,valid_iterator=BucketIterator.splits((train_ds,val_ds),batch_size=128,
                                                   sort_within_batch=True,device=device)

In [305]:
len(train_iterator),len(valid_iterator)

(45, 15)

In [199]:
num_epochs=25
learning_rate=0.001

vocab_size=len(TEXT2.vocab)
Embedding_dim=200
hidden_dim=256
output_dim=1
n_layers=3
biderectional=True
dropout=0.2
PAD_IDX = TEXT2.vocab.stoi[TEXT2.pad_token]

In [200]:
class LSTM_net(nn.Module):
    def __init__(self,vocab_size,Embedding_dim,hidden_dim,output_dim,n_layers,biderectional,dropout,PAD_IDX):
        super(LSTM_net,self).__init__()
        self.embedding=nn.Embedding(vocab_size,Embedding_dim,padding_idx=PAD_IDX)
        self.lstm=nn.LSTM(Embedding_dim,hidden_dim,n_layers,bidirectional=biderectional,dropout=dropout)
        self.fc1=nn.Linear(hidden_dim*2,hidden_dim)
        self.fc2=nn.Linear(hidden_dim,1)
        self.dropout=nn.Dropout(dropout)
        
    def forward(self,text,text_length):
        # text = [sent len, batch size]
        embedding=self.embedding(text)
        # embedded = [sent len, batch size, emb dim]
        packed_embedded=pack_padded_sequence(embedding,text_length)
        
        packed_output,(hidden,cell)=self.lstm(packed_embedded)
        #unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        
        hidden=self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        output=self.fc1(hidden)
        output=self.dropout(self.fc2(output))
        
        return output
        
        

In [201]:
model=LSTM_net(vocab_size,Embedding_dim,hidden_dim,output_dim,n_layers,biderectional,dropout,PAD_IDX)
model

LSTM_net(
  (embedding): Embedding(15002, 200, padding_idx=1)
  (lstm): LSTM(200, 256, num_layers=3, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [202]:
pretrained_embeddings=TEXT.vocab.vectors

print(pretrained_embedding)
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [203]:
model.embedding.weight.data[PAD_IDX] = torch.zeros(input_dim)

print(model.embedding.weight.data)

RuntimeError: The expanded size of the tensor (200) must match the existing size (15002) at non-singleton dimension 0.  Target sizes: [200].  Tensor sizes: [15002]

In [217]:
criterion=nn.BCEWithLogitsLoss()

optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [218]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [219]:
# training function 
def train(model, iterator):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        text, text_lengths = batch.text
        
        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [220]:
def evaluate(model, iterator):
    
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)

In [221]:
t = time.time()
loss=[]
acc=[]
val_acc=[]

for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

	Train Loss: 0.474 | Train Acc: 75.90%
	 Val. Acc: 79.41%
	Train Loss: 0.329 | Train Acc: 84.61%
	 Val. Acc: 77.14%
	Train Loss: 0.254 | Train Acc: 87.47%
	 Val. Acc: 76.64%
	Train Loss: 0.234 | Train Acc: 88.57%
	 Val. Acc: 76.20%
	Train Loss: 0.210 | Train Acc: 88.73%
	 Val. Acc: 75.55%
	Train Loss: 0.186 | Train Acc: 90.62%
	 Val. Acc: 74.98%
	Train Loss: 0.191 | Train Acc: 89.70%
	 Val. Acc: 75.33%
	Train Loss: 0.179 | Train Acc: 90.16%
	 Val. Acc: 74.99%
	Train Loss: 0.178 | Train Acc: 90.54%
	 Val. Acc: 74.35%
	Train Loss: 0.167 | Train Acc: 90.65%
	 Val. Acc: 73.14%


KeyboardInterrupt: 

In [240]:
def accuracy(pred,y):
    ran=torch.rand(4)
    print("random ",ran)
    predction=torch.round(ran)
    print(predction)
    

In [241]:
print(accuracy(2,4))

random  tensor([0.4145, 0.0182, 0.8509, 0.4250])
tensor([0., 0., 1., 0.])
None


In [262]:
def accuracy(pred,y):
    prediction=torch.round(torch.sigmoid(pred))
    correct=(prediction==y).float()
    acc=correct.sum() / len(correct)
    return acc

In [271]:
def train(iteration):
    epoch_loss=0
    epoch_acc=0
    
    model.eval()
    
    for batch in iteration:
        text,text_length=batch.text
        
        optimizer.zero_grad()
        
        prediction=model(text,text_length).squeeze(1)
        
        loss=criterion(prediction,batch.label)
        
        acc=accuracy(prediction,batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss+=loss.item()
        
        epoch_acc+=acc.item()
        
    return epoch_loss / len(iteration) , epoch_acc / len(iteration)

In [272]:
def evalution(iteration):
    
    epoch_acc=0
    model.eval()
    
    with torch.no_grad():
        for batch in iteration:
            text,text_length=batch.text
            
            predcition=model(text,text_length).squeeze(1)
            
            acc=accuracy(predcition,batch.label)
            
            epoch_acc+=acc.item()
            
        return epoch_acc / len(iteration)

In [273]:
loss=[]
acc=[]
val_acc=[]

for i in range(20):
    train_loss,train_acc=train(train_iterator)
    valid_loss=evalution(valid_iterator)
    
    print(f"Train Loss {train_loss:.3f} | Train Acc {train_acc*100:.2f}%")
    print(f"Valid Loss {valid_loss:.2f}%")
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)

Train Loss 0.046 | Train Acc 98.39%
Valid Loss 0.76%
Train Loss 0.047 | Train Acc 98.24%
Valid Loss 0.75%
Train Loss 0.041 | Train Acc 98.49%
Valid Loss 0.75%
Train Loss 0.032 | Train Acc 98.54%
Valid Loss 0.73%
Train Loss 0.030 | Train Acc 98.75%
Valid Loss 0.74%
Train Loss 0.026 | Train Acc 98.63%
Valid Loss 0.75%
Train Loss 0.022 | Train Acc 98.75%
Valid Loss 0.75%
Train Loss 0.022 | Train Acc 98.80%
Valid Loss 0.74%
Train Loss 0.023 | Train Acc 98.66%
Valid Loss 0.74%
Train Loss 0.024 | Train Acc 98.68%
Valid Loss 0.72%
Train Loss 0.022 | Train Acc 98.59%
Valid Loss 0.73%
Train Loss 0.021 | Train Acc 98.80%
Valid Loss 0.72%
Train Loss 0.021 | Train Acc 98.84%
Valid Loss 0.71%
Train Loss 0.028 | Train Acc 98.72%
Valid Loss 0.73%
Train Loss 0.033 | Train Acc 98.49%
Valid Loss 0.75%
Train Loss 0.027 | Train Acc 98.59%
Valid Loss 0.73%
Train Loss 0.022 | Train Acc 98.82%
Valid Loss 0.74%
Train Loss 0.027 | Train Acc 98.54%
Valid Loss 0.72%
Train Loss 0.050 | Train Acc 98.17%
Valid Loss

In [274]:
torch.save(model.state_dict(), "nlp-getting-started/saved_weights.pt")

In [276]:
path="nlp-getting-started/saved_weights.pt"
model.load_state_dict(torch.load(path));
model.eval();

In [290]:
def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT2.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    
    prediction = model(tensor, length_tensor).squeeze(1)       #prediction 

    rounded_preds = torch.round(torch.sigmoid(prediction))
    predict_class = rounded_preds.tolist()[0]
    return predict_class        

In [291]:
df["text"][0]

'our deeds are the reason of this earthquake may allah forgive us all'

In [292]:
print(predict(model,'our deeds are the reason of this earthquake may allah forgive us all'))

RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 1 (batch_size=13)

In [300]:
def train(model,
          optimizer,
          criterion = nn.BCEWithLogitsLoss(),
          train_loader = train_iterator,
          valid_loader = valid_iterator,
          num_epochs = 5,
          eval_every = len(train_iterator) // 2,
          file_path = "nlp-getting-started/",
          best_valid_loss = float("Inf")):
    
    
    # initialize running values
    running_loss=0.0
    valid_running_loss=0.0
    global_step=0
    train_loss_list=[]
    valid_loss_list=[]
    global_loss_list=[]
    
    
    #training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_iterator:
#             labels=labels
#             titletext=titletext
#             titletext_len=titletext_len
            titletext,titletext_len=batch.text
            output=model(titletext,titletext_len).squeeze(1)
            
            loss=criterion(output,batch.label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # update running values
            running_loss+=loss.item()
            global_step+=1
            
            
            # evaluation step
            
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    # validation loop
                    for batch in valid_loader:
#                             labels=labels
#                             titletext=titletext
#                             titletext_len=titletext_len
                            titletext,titletext_len=batch.text
                            output=model(titletext,titletext_len).squeeze(1)
                            
                            loss=criterion(output,batch.label)
                            valid_running_loss+=loss.item()
                            
                            
                average_train_loss=running_loss / eval_every
                average_valid_loss=valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_loss_list.append(global_step)
                
                
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                torch.save(model.state_dict(), 'nlp-getting-started/saved_weights2.pt')
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                            
                            
                            
                
            
            
            
            
    
    
train(model=model, optimizer=optimizer, num_epochs=10)

Epoch [1/10], Step [22/450], Train Loss: 0.1687, Valid Loss: 1.2051
Epoch [1/10], Step [44/450], Train Loss: 0.1618, Valid Loss: 1.2193
Epoch [2/10], Step [66/450], Train Loss: 0.1545, Valid Loss: 1.5192
Epoch [2/10], Step [88/450], Train Loss: 0.1653, Valid Loss: 1.2656
Epoch [3/10], Step [110/450], Train Loss: 0.1583, Valid Loss: 1.3968
Epoch [3/10], Step [132/450], Train Loss: 0.1536, Valid Loss: 1.5290
Epoch [4/10], Step [154/450], Train Loss: 0.1429, Valid Loss: 1.5995
Epoch [4/10], Step [176/450], Train Loss: 0.1555, Valid Loss: 1.5634
Epoch [5/10], Step [198/450], Train Loss: 0.1536, Valid Loss: 1.5412
Epoch [5/10], Step [220/450], Train Loss: 0.1661, Valid Loss: 1.4665
Epoch [6/10], Step [242/450], Train Loss: 0.1451, Valid Loss: 1.3817
Epoch [6/10], Step [264/450], Train Loss: 0.1577, Valid Loss: 1.6631
Epoch [7/10], Step [286/450], Train Loss: 0.1572, Valid Loss: 1.8010
Epoch [7/10], Step [308/450], Train Loss: 0.1654, Valid Loss: 1.7273
Epoch [8/10], Step [330/450], Train Lo

In [302]:
path='nlp-getting-started/saved_weights2.pt'
model.load_state_dict(torch.load(path));
model.eval();

#inference 
import spacy
# nlp = spacy.load('e')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT2.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()             

In [304]:
print(predict(model,'our deeds are the reason of this earthquake may allah forgive us all'))

RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 1 (batch_size=13)

In [303]:
df["text"][0]

'our deeds are the reason of this earthquake may allah forgive us all'

In [296]:
for batch in train_iterator:
    text,text_length=batch.text
    print(text)
    break

tensor([[   17, 14266,    43,  ...,  6591,   118,   118],
        [   51,    22,  1320,  ..., 13722,   779,   779],
        [ 2501,  1170,  9049,  ...,    22,  1402,  1402],
        ...,
        [  518,   404,  1462,  ...,  8847,    72,    72],
        [   35,     3,  1892,  ...,  9004,   606,   606],
        [  198, 14296,   184,  ...,   473,   195,   195]])
