In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
import pandas as pd
import re
import nltk
# Fetch the tokenizer resource requested below before any tokenisation happens.
nltk.download('punkt')

# Read the IMDB reviews CSV and expose the raw texts and labels as arrays.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
reviews, sentiments = (df[column].values for column in ('review', 'sentiment'))

def preprocess_text(text):
    """Normalise one review and split it into word tokens.

    The text is lower-cased, HTML line-break tags are replaced by a space,
    every character that is not a lowercase ASCII letter or whitespace is
    removed, and the result is passed to ``word_tokenize``.

    Args:
        text: The raw review as a string.

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned text (a list of
        token strings).
    """
    text = text.lower()
    # Review text may contain HTML line breaks ("<br />"). Without this step
    # the character filter below would keep the letters "br" and, because the
    # surrounding punctuation is deleted, glue them onto the neighbouring word
    # (e.g. "good.<br />the" -> "goodbr the").
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return word_tokenize(text)

# Tokenise every review once up front; the result is reused for the vocabulary and the sequences.
tokenized_reviews = list(map(preprocess_text, reviews))

def load_glove_embeddings(glove_path):
    """Read a whitespace-separated GloVe text file into a dictionary.

    Each useful line has the form ``<word> <v1> <v2> ...``. Lines that are
    blank, or that carry a word but no coefficients, are skipped instead of
    raising ``IndexError`` or storing an empty vector.

    Args:
        glove_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Nothing to store: empty line or a token without a vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Load the 100-dimensional GloVe vectors from the attached dataset.
glove_path = '/kaggle/input/glove2/glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_path)

# Collect every distinct token, then number the tokens in sorted order so the
# word -> id mapping is identical on every run (iteration order of a set of
# strings is not stable between interpreter sessions).
vocab = set(word for review in tokenized_reviews for word in review)
word_to_index = {word: i for i, word in enumerate(sorted(vocab), start=1)}
word_to_index[''] = 0  # id 0 is reserved for padding
vocab_size = len(vocab) + 1  # ids run from 0 to len(vocab) inclusive

embedding_dim = 100
# float32 matches the dtype the models convert to, and halves the memory of
# the default float64. Rows stay all-zero for the padding id and for any word
# that has no GloVe vector.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

max_len = 100
def text_to_sequence(text, word_to_index, max_len):
    """Encode a token list as exactly ``max_len`` vocabulary ids.

    Tokens missing from ``word_to_index`` become 0. Inputs longer than
    ``max_len`` are cut off at the end; shorter ones are right-padded with 0.
    """
    indices = list(map(lambda token: word_to_index.get(token, 0), text))
    clipped = indices[:max_len]
    missing = max_len - len(text)
    if missing > 0:
        clipped.extend([0] * missing)
    return clipped

# Turn each tokenised review into a fixed-length list of vocabulary ids.
sequences = [text_to_sequence(tokens, word_to_index, max_len) for tokens in tokenized_reviews]

# Map the sentiment strings to integer class ids. The fitted encoder is kept
# because its classes_ are used as the label names in the evaluation report.
label_encoder = LabelEncoder().fit(sentiments)
y = label_encoder.transform(sentiments)

# Hold out 20% of the examples for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, y, random_state=42, test_size=0.2
)

def create_data_loader(X, y, batch_size, shuffle=True):
    """Wrap features and labels in a batched ``DataLoader``.

    Args:
        X: Nested sequence of integer token ids; converted to a ``torch.long``
            tensor.
        y: Sequence/array of labels; converted to a ``torch.float32`` tensor
            (the dtype ``BCEWithLogitsLoss`` expects for its targets).
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples on every pass. Defaults to
            ``True`` so existing calls behave as before; pass ``False`` for an
            evaluation loader to get a stable sample order.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of ``(X, y)`` pairs.
    """
    tensor_X = torch.tensor(X, dtype=torch.long)
    tensor_y = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(tensor_X, tensor_y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Batch the training and held-out splits, 128 samples at a time.
train_loader = create_data_loader(X=X_train, y=y_train, batch_size=128)
test_loader = create_data_loader(X=X_test, y=y_test, batch_size=128)

# Single-layer RNN on top of pre-trained embedding vectors
class GloveRNN(nn.Module):
    """Binary classifier: fixed pre-trained embeddings -> RNN -> linear logit.

    ``vocab_size`` is accepted but not used; the size of the embedding table
    is taken from ``embedding_matrix`` itself.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super().__init__()
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        # freeze=True is the library default, spelled out here: the embedding
        # weights are not updated during training.
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one logit per row of ``x`` (a batch of token-id sequences)."""
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        # Classify from the RNN output at the final time step.
        return self.fc(outputs[:, -1])

# Model, loss and optimiser for the GloVe + RNN experiment.
hidden_dim = 64
model = GloveRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    embedding_matrix=embedding_matrix,
)
criterion = nn.BCEWithLogitsLoss()  # takes raw logits; the sigmoid is applied inside the loss
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_model(model, train_loader, epochs, optimizer=None, criterion=None):
    """Train ``model`` for ``epochs`` passes and print the mean loss per epoch.

    Args:
        model: Module whose output has a trailing dimension of size 1
            (one logit per sample).
        train_loader: Iterable of ``(inputs, targets)`` batches.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimiser to step. When omitted, the module-level
            ``optimizer`` variable is used, so three-argument calls keep
            working.
        criterion: Loss function called as ``criterion(outputs, targets)``.
            When omitted, the module-level ``criterion`` variable is used.

    Returns:
        None. Progress is reported through ``print``.
    """
    if optimizer is None:
        optimizer = globals()['optimizer']
    if criterion is None:
        criterion = globals()['criterion']

    # Guard against dividing by zero if the loader yields no batches.
    num_batches = max(len(train_loader), 1)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            # Drop only the trailing singleton dimension. A bare squeeze()
            # would also remove the batch dimension when a batch holds a
            # single sample, and the loss would then reject the shape mismatch.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / num_batches}')

def evaluate_model(model, test_loader):
    """Predict over ``test_loader`` and build a classification report.

    Args:
        model: Module whose output has a trailing dimension of size 1
            (one logit per sample).
        test_loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        The value returned by ``classification_report`` for the collected
        labels and predictions, with the module-level ``label_encoder``'s
        ``classes_`` as the target names.
    """
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            # Drop only the trailing singleton dimension. A bare squeeze()
            # turns a one-sample batch into a 0-d tensor, whose tolist() is a
            # scalar that cannot be passed to list.extend.
            outputs = model(inputs).squeeze(-1)
            # A sigmoid probability rounded to 0/1 is the predicted class id.
            preds = torch.sigmoid(outputs).round().long()
            # Store the labels as integers so both lists hold the same type.
            y_true.extend(targets.long().tolist())
            y_pred.extend(preds.tolist())
    return classification_report(y_true, y_pred, target_names=label_encoder.classes_)

# Fit the GloVe + RNN model for five epochs, then report metrics on the held-out loader.
train_model(model=model, train_loader=train_loader, epochs=5)
print("Classification Report for GloVe + RNN:", evaluate_model(model, test_loader), sep="\n")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1, Loss: 0.6868285711961813
Epoch 2, Loss: 0.6908077214853451
Epoch 3, Loss: 0.6898445776476266
Epoch 4, Loss: 0.6856517462303844
Epoch 5, Loss: 0.6858390838193437
Classification Report for GloVe + RNN:
              precision    recall  f1-score   support

    negative       0.54      0.54      0.54      4961
    positive       0.54      0.54      0.54      5039

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.54      0.54      0.54     10000



In [15]:
# LSTM Model
class GloveLSTM(nn.Module):
    """Binary classifier: pre-trained embeddings -> LSTM -> linear logit.

    Args:
        vocab_size: Accepted but not used; the size of the embedding table is
            taken from ``embedding_matrix``.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        embedding_matrix: Array-like of pre-trained vectors, one row per id.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, embedding_matrix):
        super(GloveLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32))
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer it has no effect and PyTorch emits a UserWarning, so request
        # it only when there is more than one layer.
        inter_layer_dropout = 0.5 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True,
                            dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one logit per row of ``x`` (a batch of token-id sequences)."""
        x = self.embedding(x)
        h, _ = self.lstm(x)
        # Classify from the top layer's output at the final time step.
        h = h[:, -1, :]
        return self.fc(h)

# Model, loss and optimiser for the GloVe + LSTM experiment.
hidden_dim = 64
model = GloveLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=1,
    embedding_matrix=embedding_matrix,
)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Five epochs of training followed by the held-out report.
train_model(model=model, train_loader=train_loader, epochs=5)
print("Classification Report for GloVe + LSTM:", evaluate_model(model, test_loader), sep="\n")




Epoch 1, Loss: 0.6535816914357316
Epoch 2, Loss: 0.603064070399196
Epoch 3, Loss: 0.5707171725960204
Epoch 4, Loss: 0.5125554716244293
Epoch 5, Loss: 0.49410779026750556
Classification Report for GloVe + LSTM:
              precision    recall  f1-score   support

    negative       0.81      0.71      0.76      4961
    positive       0.74      0.84      0.79      5039

    accuracy                           0.77     10000
   macro avg       0.78      0.77      0.77     10000
weighted avg       0.78      0.77      0.77     10000



In [16]:
# RNN with an embedding table learned from scratch
class OnTheFlyRNN(nn.Module):
    """Binary classifier: trainable embeddings -> RNN -> linear logit."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Sub-modules are created in this fixed order: embedding, rnn, fc.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return one logit per row of ``x`` (a batch of token-id sequences)."""
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        # Classify from the RNN output at the final time step.
        return self.fc(outputs[:, -1])

# Model, loss and optimiser for the learned-embedding RNN experiment.
model = OnTheFlyRNN(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Five epochs of training followed by the held-out report.
train_model(model=model, train_loader=train_loader, epochs=5)
print("Classification Report for On-the-Fly Embeddings + RNN:", evaluate_model(model, test_loader), sep="\n")

Epoch 1, Loss: 0.6934552581165545
Epoch 2, Loss: 0.6783685343334088
Epoch 3, Loss: 0.6701521563072936
Epoch 4, Loss: 0.6560366377472496
Epoch 5, Loss: 0.6085136558491582
Classification Report for On-the-Fly Embeddings + RNN:
              precision    recall  f1-score   support

    negative       0.68      0.65      0.66      4961
    positive       0.67      0.70      0.68      5039

    accuracy                           0.67     10000
   macro avg       0.67      0.67      0.67     10000
weighted avg       0.67      0.67      0.67     10000



In [17]:
# OnTheFly LSTM
class OnTheFlyLSTM(nn.Module):
    """Binary classifier: trainable embeddings -> LSTM -> linear logit.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super(OnTheFlyLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer it has no effect and PyTorch emits a UserWarning, so request
        # it only when there is more than one layer.
        inter_layer_dropout = 0.5 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True,
                            dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one logit per row of ``x`` (a batch of token-id sequences)."""
        x = self.embedding(x)
        h, _ = self.lstm(x)
        # Classify from the top layer's output at the final time step.
        h = h[:, -1, :]
        return self.fc(h)

# Model, loss and optimiser for the learned-embedding LSTM experiment.
model = OnTheFlyLSTM(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Five epochs of training followed by the held-out report.
train_model(model=model, train_loader=train_loader, epochs=5)
print("Classification Report for On-the-Fly Embeddings + LSTM:", evaluate_model(model, test_loader), sep="\n")



Epoch 1, Loss: 0.6765606542364858
Epoch 2, Loss: 0.5795277995042527
Epoch 3, Loss: 0.45404979215262414
Epoch 4, Loss: 0.38108627041117454
Epoch 5, Loss: 0.28759989880334835
Classification Report for On-the-Fly Embeddings + LSTM:
              precision    recall  f1-score   support

    negative       0.79      0.86      0.82      4961
    positive       0.85      0.77      0.81      5039

    accuracy                           0.81     10000
   macro avg       0.82      0.81      0.81     10000
weighted avg       0.82      0.81      0.81     10000

