In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')

In [3]:
import re
import string
from nltk.corpus import stopwords

# Ensure you have the stopwords downloaded
import nltk
# 1. Text Cleaning
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = text.strip()
    return text

df['cleaned_text'] = df['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# 4. Frequency-Based Vectors (CountVectorizer)

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_df['cleaned_text'])
X_test_counts = vectorizer.transform(test_df['cleaned_text'])

In [7]:
# 5. Word Vectors (Word2Vec)
tokenized_corpus = [word_tokenize(text) for text in train_df['cleaned_text']]
w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

(46375153, 48637980)

In [8]:
# Function to convert a text to a Word2Vec vector
def text_to_w2v(text, model, vector_size):
    words = word_tokenize(text)
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(word_vecs) == 0:
        return torch.zeros(vector_size)
    return torch.tensor(sum(word_vecs) / len(word_vecs))

train_w2v = torch.stack([text_to_w2v(text, w2v_model, 100) for text in train_df['cleaned_text']])
test_w2v = torch.stack([text_to_w2v(text, w2v_model, 100) for text in test_df['cleaned_text']])

In [9]:
# 6. Custom Dataset for DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = TextDataset(train_w2v, torch.tensor(train_df['label'].values))
test_dataset = TextDataset(test_w2v, torch.tensor(test_df['label'].values))

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [10]:
# 7. RNN Model and Word2vec in PyTorch
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x.unsqueeze(1), h0)
        out = self.fc(out[:, -1, :])
        return out

# Initialize model, criterion, and optimizer
input_size = 100  # Same as Word2Vec vector size
hidden_size = 50
output_size = 2  # Binary classification

model = RNNModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# 8. Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

In [12]:
# 9. Testing Loop
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.float())
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy of the model on the test set: {100 * correct / total:.2f}%')

In [13]:
# Train and test the model
train_model(model, train_loader, criterion, optimizer, epochs=15)
test_model(model, test_loader)

Epoch [1/15], Loss: 0.3326
Epoch [2/15], Loss: 0.3189
Epoch [3/15], Loss: 0.3146
Epoch [4/15], Loss: 0.3100
Epoch [5/15], Loss: 0.3065
Epoch [6/15], Loss: 0.3031
Epoch [7/15], Loss: 0.3008
Epoch [8/15], Loss: 0.2989
Epoch [9/15], Loss: 0.2979
Epoch [10/15], Loss: 0.2958
Epoch [11/15], Loss: 0.2946
Epoch [12/15], Loss: 0.2931
Epoch [13/15], Loss: 0.2911
Epoch [14/15], Loss: 0.2901
Epoch [15/15], Loss: 0.2881
Accuracy of the model on the test set: 87.27%


In [21]:
#LSTM and GLoVE
import numpy as np

def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

In [22]:
glove_file = "/kaggle/input/glove-6b/glove.6B.100d.txt"  # Replace with the correct path to the GloVe file
embeddings_index = load_glove_embeddings(glove_file)

In [23]:
def text_to_glove(text, embeddings_index, vector_size):
    words = word_tokenize(text)
    word_vecs = [embeddings_index[word] for word in words if word in embeddings_index]
    if len(word_vecs) == 0:
        return torch.zeros(vector_size)
    return torch.tensor(sum(word_vecs) / len(word_vecs))

train_glove = torch.stack([text_to_glove(text, embeddings_index, 100) for text in train_df['cleaned_text']])
test_glove = torch.stack([text_to_glove(text, embeddings_index, 100) for text in test_df['cleaned_text']])

In [24]:
# 6. Custom Dataset for DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = TextDataset(train_glove, torch.tensor(train_df['label'].values))
test_dataset = TextDataset(test_glove, torch.tensor(test_df['label'].values))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [25]:
# 7. LSTM Model in PyTorch
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [26]:
# Initialize model, criterion, and optimizer
input_size = 100  # GloVe vector size
hidden_size = 50
output_size = 2  # Binary classification

model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [27]:
# 8. Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')


In [29]:
# 9. Testing Loop
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.float())
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy of the model on the test set: {100 * correct / total:.2f}%')

# Train and test the model
train_model(model, train_loader, criterion, optimizer, epochs=20
           )
test_model(model, test_loader)

Epoch [1/20], Loss: 0.4285
Epoch [2/20], Loss: 0.4269
Epoch [3/20], Loss: 0.4255
Epoch [4/20], Loss: 0.4246
Epoch [5/20], Loss: 0.4227
Epoch [6/20], Loss: 0.4206
Epoch [7/20], Loss: 0.4188
Epoch [8/20], Loss: 0.4181
Epoch [9/20], Loss: 0.4166
Epoch [10/20], Loss: 0.4160
Epoch [11/20], Loss: 0.4144
Epoch [12/20], Loss: 0.4132
Epoch [13/20], Loss: 0.4124
Epoch [14/20], Loss: 0.4114
Epoch [15/20], Loss: 0.4111
Epoch [16/20], Loss: 0.4091
Epoch [17/20], Loss: 0.4083
Epoch [18/20], Loss: 0.4070
Epoch [19/20], Loss: 0.4062
Epoch [20/20], Loss: 0.4056
Accuracy of the model on the test set: 80.76%
