In [35]:
import torch.nn as nn
import torch
import pandas as pd

import torch.optim as optim  
import torch.nn.functional as F  
from torch.utils.data import DataLoader, Dataset  

import nltk
import string
from torch.utils.data import TensorDataset, DataLoader

# Load data

In [2]:
mr_all = pd.read_csv('data/mr_all.csv')

# Preprocess

In [9]:
nltk.data.path.append("./data")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))


In [10]:
def preprocess_text(text, remove_stopwords=True):
    text = text.lower()  # Lowercase
    text = "".join([ch for ch in text if ch not in string.punctuation])  # Remove punctuation
    tokens = text.split()  # Tokenize

    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]

    return tokens  # return as list of words

In [13]:
mr_all["tokens"] = mr_all["text"].apply(preprocess_text)

In [16]:
all_words = [word for tokens in mr_all["tokens"] for word in tokens]
print(f"Total tokens: {len(all_words)}, Unique words: {len(set(all_words))}")

Total tokens: 113569, Unique words: 20359


In [17]:
mr_all
all_words

['rock',
 'destined',
 '21st',
 'centurys',
 'new',
 'conan',
 'hes',
 'going',
 'make',
 'splash',
 'even',
 'greater',
 'arnold',
 'schwarzenegger',
 'jeanclaud',
 'van',
 'damme',
 'steven',
 'segal',
 'gorgeously',
 'elaborate',
 'continuation',
 'lord',
 'rings',
 'trilogy',
 'huge',
 'column',
 'words',
 'cannot',
 'adequately',
 'describe',
 'cowriterdirector',
 'peter',
 'jacksons',
 'expanded',
 'vision',
 'j',
 'r',
 'r',
 'tolkiens',
 'middleearth',
 'effective',
 'tootepid',
 'biopic',
 'sometimes',
 'like',
 'go',
 'movies',
 'fun',
 'wasabi',
 'good',
 'place',
 'start',
 'emerges',
 'something',
 'rare',
 'issue',
 'movie',
 'thats',
 'honest',
 'keenly',
 'observed',
 'doesnt',
 'feel',
 'like',
 'one',
 'film',
 'provides',
 'great',
 'insight',
 'neurotic',
 'mindset',
 'comics',
 'even',
 'reached',
 'absolute',
 'top',
 'game',
 'offers',
 'rare',
 'combination',
 'entertainment',
 'education',
 'perhaps',
 'picture',
 'ever',
 'made',
 'literally',
 'showed',
 'roa

# Label Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
mr_all["label"] = le.fit_transform(mr_all["label"])  # Now 0 or 1

In [19]:
from collections import Counter

word_counts = Counter(all_words)
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.most_common())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1


In [24]:
mr_all

Unnamed: 0,text,label,tokens,token_ids
0,the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, centurys, new, conan, h...","[533, 2417, 3118, 10163, 34, 7008, 215, 148, 1..."
1,"the gorgeously elaborate continuation of "" the...",1,"[gorgeously, elaborate, continuation, lord, ri...","[3119, 1951, 7010, 4374, 2724, 4375, 948, 7011..."
2,effective but too-tepid biopic,1,"[effective, tootepid, biopic]","[562, 10167, 1785]"
3,if you sometimes like to go to the movies to h...,1,"[sometimes, like, go, movies, fun, wasabi, goo...","[152, 5, 107, 18, 56, 7014, 9, 212, 535]"
4,"emerges as something rare , an issue movie tha...",1,"[emerges, something, rare, issue, movie, thats...","[1205, 36, 276, 1641, 3, 41, 399, 5349, 3663, ..."
...,...,...,...,...
10657,"this picture is murder by numbers , and as eas...",0,"[picture, murder, numbers, easy, bored, abcs, ...","[84, 707, 1201, 277, 1546, 20356, 113, 20357, ..."
10658,hilarious musical comedy though stymied by acc...,0,"[hilarious, musical, comedy, though, stymied, ...","[370, 1231, 10, 59, 20358, 2789, 2134, 9050]"
10659,"if you are into splatter movies , then you wil...",0,"[splatter, movies, probably, reasonably, good,...","[20359, 18, 227, 1456, 9, 11, 4260, 1042]"
10660,"a dull , simple-minded and stereotypical tale ...",0,"[dull, simpleminded, stereotypical, tale, drug...","[172, 4341, 3626, 87, 1792, 343, 20360, 6979, ..."


In [23]:
def tokens_to_ids(tokens, vocab):
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens]

mr_all["token_ids"] = mr_all["tokens"].apply(lambda tokens: tokens_to_ids(tokens, vocab))


In [25]:
mr_all

Unnamed: 0,text,label,tokens,token_ids
0,the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, centurys, new, conan, h...","[533, 2417, 3118, 10163, 34, 7008, 215, 148, 1..."
1,"the gorgeously elaborate continuation of "" the...",1,"[gorgeously, elaborate, continuation, lord, ri...","[3119, 1951, 7010, 4374, 2724, 4375, 948, 7011..."
2,effective but too-tepid biopic,1,"[effective, tootepid, biopic]","[562, 10167, 1785]"
3,if you sometimes like to go to the movies to h...,1,"[sometimes, like, go, movies, fun, wasabi, goo...","[152, 5, 107, 18, 56, 7014, 9, 212, 535]"
4,"emerges as something rare , an issue movie tha...",1,"[emerges, something, rare, issue, movie, thats...","[1205, 36, 276, 1641, 3, 41, 399, 5349, 3663, ..."
...,...,...,...,...
10657,"this picture is murder by numbers , and as eas...",0,"[picture, murder, numbers, easy, bored, abcs, ...","[84, 707, 1201, 277, 1546, 20356, 113, 20357, ..."
10658,hilarious musical comedy though stymied by acc...,0,"[hilarious, musical, comedy, though, stymied, ...","[370, 1231, 10, 59, 20358, 2789, 2134, 9050]"
10659,"if you are into splatter movies , then you wil...",0,"[splatter, movies, probably, reasonably, good,...","[20359, 18, 227, 1456, 9, 11, 4260, 1042]"
10660,"a dull , simple-minded and stereotypical tale ...",0,"[dull, simpleminded, stereotypical, tale, drug...","[172, 4341, 3626, 87, 1792, 343, 20360, 6979, ..."


In [26]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Convert to list of tensors
sequence_tensors = [torch.tensor(seq) for seq in mr_all["token_ids"]]

# Pad
padded_seqs = pad_sequence(sequence_tensors, batch_first=True, padding_value=vocab["<PAD>"])
label_tensor = torch.tensor(mr_all["label"].values)


In [29]:
padded_seqs

tensor([[  533,  2417,  3118,  ...,     0,     0,     0],
        [ 3119,  1951,  7010,  ...,     0,     0,     0],
        [  562, 10167,  1785,  ...,     0,     0,     0],
        ...,
        [20359,    18,   227,  ...,     0,     0,     0],
        [  172,  4341,  3626,  ...,     0,     0,     0],
        [ 2154,  6673,  3450,  ...,     0,     0,     0]])

# dataset defintion

In [30]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, padded_seqs, labels):
        self.seqs = padded_seqs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.seqs[idx], self.labels[idx]

In [31]:
dataset = ReviewDataset(padded_seqs, label_tensor)

In [34]:
from sklearn.model_selection import train_test_split

# Assume `padded_seqs` and `label_tensor` are torch.Tensors
X_train, X_test, y_train, y_test = train_test_split(
    padded_seqs, label_tensor, test_size=0.2, random_state=42
)


In [58]:
batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
test_dataset  = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_loader  = DataLoader(test_dataset, batch_size=32)

## Model


In [50]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [61]:
vocab_to_int = {word:idx+1 for idx, word in enumerate(vocab)}

In [62]:
n_vocab = len(vocab_to_int)
n_embed = 400
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [63]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(device)


SentimentLSTM(
  (embedding): Embedding(20361, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [64]:
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)


In [44]:
def binary_accuracy(preds, y):
    rounded = torch.round(torch.sigmoid(preds))  # sigmoid to get 0–1
    correct = (rounded == y).float()
    return correct.sum() / len(correct)


In [65]:
def train(model, dataloader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device).float()

        optimizer.zero_grad()
        preds = model(x).squeeze(1)  # [batch]
        loss = criterion(preds, y)
        acc = binary_accuracy(preds, y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device).float()
            preds = model(x).squeeze(1)
            loss = criterion(preds, y)
            acc = binary_accuracy(preds, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


In [69]:
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()

  nn.utils.clip_grad_norm(net.parameters(), clip)


RuntimeError: shape '[32, -1]' is invalid for input of size 663

In [None]:
net.eval()

In [67]:
model = net

In [68]:
EPOCHS = 5

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    #val_loss, val_acc = evaluate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%")



AttributeError: 'tuple' object has no attribute 'squeeze'