# Imports

In [1]:
!pip install torchtext
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import torchtext
import torch
from torch import nn,optim
from time import time

Defaulting to user installation because normal site-packages is not writeable


In [2]:
data = pd.read_csv("dataframe.csv")
print(data)



      Unnamed: 0                                               text  encoded
0              0  The soil I’m walking over comes    from deeper...    False
1              1  the close air of the earth whence she derived ...     True
2              2  Lyric night of the lingering Indian Summer,\nS...    False
3              3  “Percussus sum sicut foenum, et aruit cor meum...    False
4              4  I should be happy with my lot: A wife and moth...    False
...          ...                                                ...      ...
4789        4789  office has never really is its oldest daughter...     True
4790        4790  soot from her car with suitcases and hugged he...     True
4791        4791  Seeing in crowded restaurants the one you love...    False
4792        4792  flower sweet as I can never forsake And Fortun...     True
4793        4793  Echo that loved hid within a wood Would to her...    False

[4794 rows x 3 columns]


Creating tokenizer and vocabulary

In [3]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.nn.utils import rnn
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2)


tokenizer = get_tokenizer("basic_english")

def build_vocabulary(datasets):
    for dataset in datasets:
        #print(dataset)
        for text in dataset['text']:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocabulary([data,data]), min_freq=3, specials=["<UNK>"])

vocab.set_default_index(vocab["<UNK>"])

for l in data['text']:
  tokens = tokenizer(l)
  indexes = vocab(tokens)


Creating Dataloader

In [4]:

class Text(Dataset):
  def __init__(self,df):
    self.df = df
    #print(df)

  def pad(self,seq):
    if len(seq)>140:
      return seq[:140]
    else:
      for i in range(140-len(seq)):
        seq.insert(0,0)
      return seq

  def __len__(self):
    return len(self.df['encoded'])
  def __getitem__(self,idx):
    
    return torch.tensor(self.pad(vocab(tokenizer(self.df['text'][idx])))) , int(self.df['encoded'][idx])

test_loader  = DataLoader(Text(test.reset_index()) , batch_size=4)
train_loader = DataLoader(Text(train.reset_index()) , batch_size=128)

for txt , tgt in test_loader:
  _
  #print(txt)
  #print(txt)
voc = len(vocab)
#print(tokens)

print(vocab(["<UNK>"]))

[0]


In [5]:
print(test)
print(train)

      Unnamed: 0                                               text  encoded
257          257  She took the words for a stroll and the words ...    False
3516        3516  death. The dreadful sound of a bat s wing. Kil...     True
4772        4772  All those ;Liquid love affairs, Blind swimmers...    False
2330        2330  A rung’s come broken in the ladder to the mowa...    False
1185        1185  its secret recording of life. The window. And ...     True
...          ...                                                ...      ...
2808        2808  Across the bridge, where in the morning blow T...    False
395          395  ranting in the hot Highland fling their shotgu...     True
2335        2335  Know Celia, since thou art so proud, ; 'Twas I...    False
2328        2328  For Ethan Canin    I sat on the dock at dusk a...    False
3790        3790  When our two souls stand up erect and strong, ...    False

[959 rows x 3 columns]
      Unnamed: 0                                    

Creating RNN

In [6]:

class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.embedding_layer = nn.Embedding(len(vocab),64)
        self.rnn = nn.RNN(64,32, 3, batch_first=True)
        self.linear1 = nn.Linear(32,32)
        self.linear2 = nn.Linear(32,2)
        self.act = nn.ReLU()
        self.out = nn.LogSoftmax(dim = 1)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings)
        return self.out(self.linear2(self.act(self.linear1(self.act(output[:,-1])))))

model = RNN()

print(model)


RNN(
  (embedding_layer): Embedding(18658, 64)
  (rnn): RNN(64, 32, num_layers=3, batch_first=True)
  (linear1): Linear(in_features=32, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=2, bias=True)
  (act): ReLU()
  (out): LogSoftmax(dim=1)
)


In [7]:
from word2ket import EmbeddingKet, EmbeddingKetXS , ketify,summary
summary(model)
ketify(model,order = 8,rank = 4, use_EmbeddingKetXS= False)


Module Name                                                                           Total Parameters  Trainable Parameters # Elements in Trainable Parametrs       
Embedding(18658, 64)                                                                  1                 1                    1,194,112                               
RNN(64, 32, num_layers=3, batch_first=True)                                           12                12                   7,360                                   
Linear(in_features=32, out_features=32, bias=True)                                    2                 2                    1,056                                   
Linear(in_features=32, out_features=2, bias=True)                                     2                 2                    66                                      
ReLU()                                                                                0                 0                    0                                       
LogS

RNN(
  (embedding_layer): EmbeddingKet(18658, 64)
  (rnn): RNN(64, 32, num_layers=3, batch_first=True)
  (linear1): Linear(in_features=32, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=2, bias=True)
  (act): ReLU()
  (out): LogSoftmax(dim=1)
)

In [8]:
summary(model)

Module Name                                                                           Total Parameters  Trainable Parameters # Elements in Trainable Parametrs       
EmbeddingKet(18658, 64)                                                               1                 1                    1,194,112                               
RNN(64, 32, num_layers=3, batch_first=True)                                           12                12                   7,360                                   
Linear(in_features=32, out_features=32, bias=True)                                    2                 2                    1,056                                   
Linear(in_features=32, out_features=2, bias=True)                                     2                 2                    66                                      
ReLU()                                                                                0                 0                    0                                       
LogS

1202594

In [9]:
accuracy = []
optimizer = optim.SGD(model.parameters(), lr=0.017, momentum=0.9)
criterion = nn.NLLLoss()
time0 = time()

Training

In [10]:
epochs = 20
for e in range(epochs):
    running_loss = 0
    for text, tgt in train_loader:
        # Training pass
        optimizer.zero_grad()

        output = model(text)

        loss = criterion(output, tgt)

        #This is where the model learns by backpropagating
        loss.backward()
        #And optimizes its weights here
        optimizer.step()
        #print(loss.item())
        running_loss += loss.item()

    else:
        #print("\nTraining Time (in minutes) =",(time()-time0)/60)
        #print(output)
        print("\nEpoch {} - Training loss: {}".format(e+1, running_loss/len(test_loader)))
    correct_count, all_count = 0, 0
    for images,labels in test_loader:
      for i in range(len(labels)):
        img = images[i].view(1,-1)
        with torch.no_grad():
          logps = model(img)


        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        #print(pred_label)
        if(true_label == pred_label):
          correct_count += 1
        all_count += 1

  #print("Number Of Images Tested =", all_count)
    print("Model Accuracy =", (correct_count/all_count))



Epoch 1 - Training loss: 0.08669995839397113
Model Accuracy = 0.529718456725756

Epoch 2 - Training loss: 0.08665662904580435
Model Accuracy = 0.470281543274244

Epoch 3 - Training loss: 0.08664869219064712
Model Accuracy = 0.470281543274244

Epoch 4 - Training loss: 0.08664957011739413
Model Accuracy = 0.470281543274244

Epoch 5 - Training loss: 0.0866493433713913
Model Accuracy = 0.470281543274244

Epoch 6 - Training loss: 0.08664935901761055
Model Accuracy = 0.470281543274244

Epoch 7 - Training loss: 0.08664933070540429
Model Accuracy = 0.470281543274244

Epoch 8 - Training loss: 0.08664931133389472
Model Accuracy = 0.470281543274244

Epoch 9 - Training loss: 0.08664928625027339
Model Accuracy = 0.470281543274244

Epoch 10 - Training loss: 0.08664926290512084
Model Accuracy = 0.470281543274244

Epoch 11 - Training loss: 0.08664923310279846
Model Accuracy = 0.470281543274244

Epoch 12 - Training loss: 0.08664920727411905
Model Accuracy = 0.470281543274244

Epoch 13 - Training loss:

#Texts for testing:

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Even my brother is not like to speak with me. They treat me like aids patent.

