In [1]:
import csv
import gc
import pandas as pd
import nltk
import torch
import torch.nn as nn
import zipfile
import re
nltk.data.path.append('/usr/local/share/nltk_data') 
nltk.download('wordnet', download_dir='/usr/local/share/nltk_data') 
nltk.download('stopwords')  
nltk.download('punkt')
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup
from torch.optim import Adam
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, random_split

[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')  

texts = " ".join(df['review'].tolist()).lower()  

texts = BeautifulSoup(texts, 'html.parser').get_text()  
cleaned_texts = re.sub(r'[^a-zA-Z\s]', ' ', texts).lower()

words = word_tokenize(cleaned_texts)


with zipfile.ZipFile('/usr/local/share/nltk_data/corpora/wordnet.zip', 'r') as zip_ref:
    zip_ref.extractall('/usr/local/share/nltk_data/corpora/')
    
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))  
filtered_lemmatized_words = [  
    lemmatizer.lemmatize(word) for word in words if word not in stop_words  
] 

vocab = sorted(set(filtered_lemmatized_words))  # Unique words only  
vocab.insert(0, '<PAD>')  # Add padding token  

word_to_idx = {word: idx for idx, word in enumerate(vocab)}  
idx_to_word = {idx: word for idx, word in enumerate(vocab)}  

print("Vocabulary Size:", len(vocab))  
print("Sample Vocabulary:", vocab[1310:1320])

Vocabulary Size: 91725
Sample Vocabulary: ['agar', 'agashe', 'agate', 'agatha', 'agathaclosing', 'agbayani', 'age', 'aged', 'agee', 'ageing']


In [3]:
inputs = []
labels = []

for review, sentiment in zip(df['review'], df['sentiment']):
    review = review.lower()

    review = BeautifulSoup(review, 'html.parser').get_text()
    review = re.sub(r'[^a-zA-Z\s]', ' ', review)

    tokens = word_tokenize(review)
    filtered_lemmatized_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]
    
    input_indices = [word_to_idx[word] for word in filtered_lemmatized_tokens if word in word_to_idx]
    
    label = 1 if sentiment == 'positive' else 0

    inputs.append(input_indices)
    labels.append(label)


  review = BeautifulSoup(review, 'html.parser').get_text()


In [4]:
max_len = 256
input_tensors = [torch.tensor(input) for input in inputs]
    

# Pad the sequences
padded_inputs = pad_sequence([seq[:max_len] for seq in input_tensors], batch_first=True, padding_value=word_to_idx['<PAD>'])

dataset = TensorDataset(padded_inputs, torch.tensor(labels, dtype=torch.float))
train_ds, valid_ds, test_ds = random_split(dataset, [35000, 7500, 7500])

In [5]:
hyperparameters = {
    "batch_size": 256,
    "embedding_dim": 128,
    "num_embeddings": len(idx_to_word),
    "epochs": 30,
}

In [6]:
train_dl = DataLoader(train_ds, batch_size=hyperparameters['batch_size'], shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=hyperparameters['batch_size'], drop_last=True, num_workers=2, pin_memory=True)
test_dl  = DataLoader(test_ds, batch_size=hyperparameters['batch_size'], drop_last=True, num_workers=2, pin_memory=True)

In [7]:
class CustomDL:
    def __init__(self, dataloader, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.dataloader = dataloader
        self.device = device

    def __iter__(self):
        for inputs, targets in self.dataloader:
            inputs = inputs.to(self.device)
            targets = targets.to(self.device)

            yield inputs, targets

            del inputs, targets
            torch.cuda.empty_cache() if self.device == 'cuda' else None
            gc.collect()

    def __len__(self):
        return len(self.dataloader)

custom_train_dl = CustomDL(train_dl)
custom_valid_dl = CustomDL(valid_dl)

In [8]:
class SANetwork(nn.Module):
        
    def __init__(self, embedding_dim, num_embeddings):
        super(SANetwork, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.lstm_cells = nn.LSTM(input_size=embedding_dim, hidden_size=256, num_layers=2, batch_first=True, dropout=.4)

        self.classifier = nn.Sequential(
            nn.Linear(in_features=256, out_features=1),
        )
        
        

    def forward(self, input, hidden=None):
        x = self.embedding(input)
        x, hidden = self.lstm_cells(x, hidden)
        x = x[:, -1, :]
        x = self.classifier(x)
        return x

In [16]:
network = SANetwork(hyperparameters['embedding_dim'], hyperparameters['num_embeddings'])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
network = network.to(device)

In [10]:
print(len(idx_to_word))

91725


In [11]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = Adam(network.parameters())

In [12]:
def train(network, train_loader, val_loader, optimizer, criterion, scheduler, device, num_epochs):
    for epoch in range(num_epochs):
        network.train()

        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f'Training Epoch {epoch + 1}/{num_epochs}', unit='batch'):
            inputs, targets = batch

            optimizer.zero_grad()

            outputs = network(inputs)

            loss = criterion(outputs.view(-1), targets.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Training loss for epoch {epoch + 1}: {avg_loss:.4f}")

        network.eval()
        val_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for val_batch in val_loader:
                val_inputs, val_targets = val_batch
                val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)

                val_outputs = network(val_inputs)

                v_loss = criterion(val_outputs.view(-1), val_targets.view(-1))
                val_loss += v_loss.item()

                predicted = (val_outputs >= 0.5).float()
                correct_predictions += (predicted.view(-1) == val_targets.view(-1)).sum().item()
                total_predictions += val_targets.size(0)

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct_predictions / total_predictions
        print(f"Validation loss for epoch {epoch + 1}: {avg_val_loss:.4f}, Accuracy: {val_accuracy * 100:.2f}%")

        if scheduler is not None:
            scheduler.step()

        torch.save(network.state_dict(), "parameters.pth")

In [16]:
# train(network, custom_train_dl, custom_valid_dl, optimizer, criterion, None, device, hyperparameters['epochs'])

In [14]:
def inference(network, test_loader, criterion, device):
    network.eval()
    test_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad(): 
        for batch in test_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = network(inputs)
            outputs = torch.sigmoid(outputs) 

            loss = criterion(outputs.view(-1), targets.view(-1))
            test_loss += loss.item()

            predicted = (outputs >= 0.5).float()

            correct_predictions += (predicted.view(-1) == targets.view(-1)).sum().item()
            total_predictions += targets.size(0)

    avg_test_loss = test_loss / len(test_loader)
    test_accuracy = correct_predictions / total_predictions

    print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")
    return avg_test_loss, test_accuracy

In [15]:
inference(network, test_dl, criterion, device)

Test Loss: 0.5440, Test Accuracy: 94.30%


(0.5440152904082989, 0.9430226293103449)

In [81]:
def exterior_inference(str_=""):  
    network.eval()  
    with torch.no_grad():  
        # Preprocess the input string  
        str_ = str_.lower()  
        str_ = BeautifulSoup(str_, 'html.parser').get_text()  
        str_ = re.sub(r'[^a-zA-Z\s]', ' ', str_)  

        words = word_tokenize(str_)  
        lemmatizer = WordNetLemmatizer()  
        stop_words = set(stopwords.words('english'))  
        processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  

        # Convert words to indices  
        input_ = [word_to_idx[word] for word in processed_words if word in word_to_idx]  
        input_ = torch.tensor(input_).to(device).unsqueeze(dim=0)  

        # Get the model's output  
        result = nn.Sigmoid()(network(input_))  

        # Determine sentiment  
        sentiment_score = result.item()  # Get the score as a float  
        if sentiment_score >= 0.7:  
            sentiment = "Positive"  
            description = "This text conveys positive emotions."  
        elif sentiment_score <= 0.3:  
            sentiment = "Negative"  
            description = "This text conveys negative emotions."  
        else:  
            sentiment = "Neutral"  
            description = "This text is neutral in sentiment."  

        # Print the output  
        print("-" * 50)  
        print(f"Input Text: {str_}")  
        print(f"Sentiment Score: {sentiment_score:.4f} ({sentiment})")  
        print(f"Description: {description}")  
        print("-" * 50)

In [90]:
exterior_inference("I had a horrible experience at the hotel. The room was dirty, and the staff were rude")

--------------------------------------------------
Input Text: i had a horrible experience at the hotel  the room was dirty  and the staff were rude
Sentiment Score: 0.0532 (Negative)
Description: This text conveys negative emotions.
--------------------------------------------------


In [91]:
exterior_inference("I absolutely love this product! It has exceeded all my expectations")

--------------------------------------------------
Input Text: i absolutely love this product  it has exceeded all my expectations
Sentiment Score: 0.9837 (Positive)
Description: This text conveys positive emotions.
--------------------------------------------------
