In [None]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook,notebook

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device 


device(type='cpu')

In [None]:
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id = '1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path = DATA_PATH 

    )

In [None]:
class Sequences(Dataset):
    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len
        df = pd.read_csv(path)
        vectorizer = CountVectorizer(stop_words='english', min_df = 0.015)
        #self.dfs=df.review.tolist()
        vectorizer.fit(df.review.tolist())
        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) +1
        tokenizer = vectorizer.build_analyzer()
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x) if token in self.token2idx]
        self.pad = lambda x : x +(max_seq_len - len(x)) * [self.token2idx['<PAD>']]
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df.review.tolist()]
        sequences , self.labels = zip(*[(sequence,label) for sequence, label in zip(sequences,df.label.tolist()) if sequence])
        self.sequences =[self.pad(sequence) for  sequence in sequences]

    def __getitem__(self, i ):
            assert len(self.sequences[i]) == self.max_seq_len
            return self.sequences[i] , self.labels[i]


    def __len__(self):
            return len(self.sequences)

In [None]:
dataset = Sequences(DATA_PATH,max_seq_len=128)



In [None]:
len(dataset.token2idx)

1104

In [None]:
def collate(batch ):

    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs , target

batch_size = 2048
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)


In [54]:
class RNN(nn.Module):
    def __init__(
        self, 
        vocab_size , 
        batch_size,
        embedding_dimension =100,
        hidden_size = 128,
        n_layers = 1,
        device='cpu'
    ):

        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        self.batch_size = batch_size

        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension, 
            hidden_size,
            num_layers = n_layers, 
            batch_first= True,
        )
        self.decoder = nn.Linear(hidden_size ,1)

    def init_hidden(self):
        return torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device)
    
    def forward(self, inputs):
        batch_size = inputs.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size
        encoded  = self.encoder(inputs)
        print("enc")
        print(encoded.shape)
        output, hidden = self.rnn(encoded, self.init_hidden())
        print("output  ")
        print(output.shape)
        output = self.decoder(output[ : , : ,-1]).squeeze()
        
        return output

In [55]:
model = RNN(
    hidden_size=128,
    vocab_size= len(dataset.token2idx),
    device=device,
    batch_size = batch_size,
)

model = model.to(device)
model


RNN(
  (encoder): Embedding(1104, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=1, bias=True)
)

In [56]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr = 0.001)



In [57]:
model.train()
train_losses = []
for epoch in range(10):
    progress_bar = notebook.tqdm(train_loader, leave=False)
    losses = []
    total = 0
    #print("soop")
    for inputs, target in progress_bar:
        inputs, target = inputs.to(device), target.to(device
                                                     )
        model.zero_grad()
        #print("loop")
        output = model(inputs)
    
        loss = criterion(output, target)
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

  0%|          | 0/31 [00:00<?, ?it/s]

enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
torch.Size([2048, 128, 100])
output  
torch.Size([2048, 128, 128])
enc
to

KeyboardInterrupt: 

In [None]:
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        test_vector=torch.LongTensor([dataset.pad(dataset.encode(text))]).to(device)
        output =model(test_vector)
        prediction = torch.sigmoid(output).item()
        if prediction > 0.5:
            print("positive")
        else:
            print("negative")

        


In [None]:
test_text = """
This poor excuse for a movie is terrible. It has been 'so good it's bad' for a
while, and the high ratings are a good form of sarcasm, I have to admit. But
now it has to stop. Technically inept, spoon-feeding mundane messages with the
artistic weight of an eighties' commercial, hypocritical to say the least, it
deserves to fall into oblivion. Mr. Derek, I hope you realize you are like that
weird friend that everybody know is lame, but out of kindness and Christian
duty is treated like he's cool or something. That works if you are a good
decent human being, not if you are a horrible arrogant bully like you are. Yes,
Mr. 'Daddy' Derek will end on the history books of the internet for being a
delusional sour old man who thinks to be a good example for kids, but actually
has a poster of Kim Jong-Un in his closet. Destroy this movie if you all have a
conscience, as I hope IHE and all other youtube channel force-closed by Derek
out of SPITE would destroy him in the courts.This poor excuse for a movie is
terrible. It has been 'so good it's bad' for a while, and the high ratings are
a good form of sarcasm, I have to admit. But now it has to stop. Technically
inept, spoon-feeding mundane messages with the artistic weight of an eighties'
commercial, hypocritical to say the least, it deserves to fall into oblivion.
Mr. Derek, I hope you realize you are like that weird friend that everybody
know is lame, but out of kindness and Christian duty is treated like he's cool
or something. That works if you are a good decent human being, not if you are a
horrible arrogant bully like you are. Yes, Mr. 'Daddy' Derek will end on the
history books of the internet for being a delusional sour old man who thinks to
be a good example for kids, but actually has a poster of Kim Jong-Un in his
closet. Destroy this movie if you all have a conscience, as I hope IHE and all
other youtube channel force-closed by Derek out of SPITE would destroy him in
the courts.
"""
predict_sentiment(test_text)

negative
