In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from tqdm import tqdm


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")


In [8]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [9]:
df = df.sample(n=5000, random_state=42)


In [10]:
import spacy
nlp = spacy.load("en_core_web_sm")
def tokenize_review(review):
    tokens = [token.text for token in nlp(review.lower()) if not token.is_stop and not token.is_punct]
    return tokens

In [11]:
df["tokenize_review"] = df["review"].apply(tokenize_review)

In [12]:
word_counts = Counter([token for review in df['tokenize_review'] for token in review])
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: idx+1 for idx, word in enumerate(vocab)}  # Add 1 to reserve index 0 for padding



In [15]:
vocab_file_path='vocab_data'
# Open the file in write mode
with open(vocab_file_path, "w") as file:
    # Write each word in the vocab list to the file
    for word in vocab:
        file.write(word + "\n")

print("Vocabulary saved to:", vocab_file_path)

Vocabulary saved to: vocab_data


In [16]:
word_to_index_file_path = "word_to_index.txt"

# Open the file in write mode
with open(word_to_index_file_path, "w") as file:
    # Write each key-value pair in the word_to_index dictionary to the file
    for word, index in word_to_index.items():
        file.write(f"{word}: {index}\n")

print("Word to Index mapping saved to:", word_to_index_file_path)

Word to Index mapping saved to: word_to_index.txt


In [17]:
def numericalize_review(review_tokens):
    return [word_to_index[token] for token in review_tokens]

df["numericalized_review"] = df["tokenize_review"].apply(numericalize_review)


In [18]:
df.to_csv("review.csv", index=False)

In [19]:
max_len = max(df["numericalized_review"].apply(len))
def pad_sequence(sequence, max_len):
    return sequence + [0]*(max_len - len(sequence))

df["padded_review"] = df["numericalized_review"].apply(lambda x: pad_sequence(x, max_len))




In [20]:
X_train, X_test, y_train, y_test = train_test_split(df["padded_review"].values, df["sentiment"].values, test_size=0.2, random_state=42)



In [21]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])


In [22]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        out = self.fc(lstm_out)
        return out


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
batch_size = 64
num_workers = 2

In [25]:
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)



In [26]:
test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)



In [27]:
vocab_size = len(vocab) + 1  # Add 1 for padding token
embedding_dim = 128
hidden_dim = 256
output_dim = 1
n_layers = 2
dropout = 0.5
batch_size = 64
learning_rate = 0.001
num_epochs = 3


In [28]:
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout).to(device)


In [29]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())


In [30]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for X_batch, y_batch in tqdm(iterator, desc='Training'):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        predictions = model(X_batch).squeeze(1)
        loss = criterion(predictions, y_batch.float())
        acc = accuracy_score(torch.round(torch.sigmoid(predictions)).cpu().detach().numpy(), y_batch.cpu().detach().numpy())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [31]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for X_batch, y_batch in tqdm(iterator, desc='Evaluating'):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            predictions = model(X_batch).squeeze(1)
            loss = criterion(predictions, y_batch.float())
            acc = accuracy_score(torch.round(torch.sigmoid(predictions)).cpu().detach().numpy(), y_batch.cpu().detach().numpy())

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [32]:
N_EPOCHS = 3

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  self.pid = os.fork()
Training: 100%|██████████| 63/63 [46:33<00:00, 44.34s/it]
  self.pid = os.fork()
Evaluating: 100%|██████████| 16/16 [00:26<00:00,  1.65s/it]


Epoch: 01
	Train Loss: 0.695 | Train Acc: 50.25%
	 Val. Loss: 0.693 |  Val. Acc: 50.47%


  self.pid = os.fork()
Training: 100%|██████████| 63/63 [41:56<00:00, 39.95s/it]
  self.pid = os.fork()
Evaluating: 100%|██████████| 16/16 [00:26<00:00,  1.63s/it]


Epoch: 02
	Train Loss: 0.694 | Train Acc: 49.21%
	 Val. Loss: 0.693 |  Val. Acc: 49.53%


  self.pid = os.fork()
Training: 100%|██████████| 63/63 [40:02<00:00, 38.13s/it]
  self.pid = os.fork()
Evaluating: 100%|██████████| 16/16 [00:26<00:00,  1.63s/it]

Epoch: 03
	Train Loss: 0.694 | Train Acc: 48.56%
	 Val. Loss: 0.694 |  Val. Acc: 49.53%





In [33]:
model.load_state_dict(torch.load('best-model.pt'))
model.eval()

LSTMModel(
  (embedding): Embedding(47041, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [34]:
# Load necessary libraries
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the new dataset
new_df = pd.read_csv("/content/drive/MyDrive/new file")

In [35]:

# Define a function for making predictions
def predict(model, data_loader):
    predictions = []
    targets = []

    model.eval()

    with torch.no_grad():
        for X_batch, y_batch in tqdm(data_loader, desc='Predicting'):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch).squeeze(1)
            preds = torch.round(torch.sigmoid(outputs))

            predictions.extend(preds.cpu().detach().numpy())
            targets.extend(y_batch.cpu().detach().numpy())

    return predictions, targets

# Make predictions on the test dataset
test_predictions, test_targets = predict(model, test_loader)

# Calculate evaluation metrics
test_accuracy = accuracy_score(test_targets, test_predictions)
print("Test Accuracy:", test_accuracy)
print("Classification Report:")
print(classification_report(test_targets, test_predictions))
print("Confusion Matrix:")
print(confusion_matrix(test_targets, test_predictions))


  self.pid = os.fork()
  self.pid = os.fork()
Predicting: 100%|██████████| 16/16 [00:27<00:00,  1.73s/it]

Test Accuracy: 0.506
Classification Report:
              precision    recall  f1-score   support

           0       0.51      1.00      0.67       506
           1       0.00      0.00      0.00       494

    accuracy                           0.51      1000
   macro avg       0.25      0.50      0.34      1000
weighted avg       0.26      0.51      0.34      1000

Confusion Matrix:
[[506   0]
 [494   0]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
