In [None]:
!pip install -r requirements.txt

Collecting datasets (from -r requirements.txt (line 3))
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014

In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

# Seed the NumPy and PyTorch random number generators up front so that batch
# shuffling, dropout and the freshly initialised classifier head start from the
# same state on every "Restart & Run All" (GPU kernels may still introduce
# small run-to-run differences).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the IMDB dataset (already balanced between positive and negative).
# The returned object is indexed by split name below (imdb['train']); only the
# labelled 'train' split is used in the rest of this notebook.
imdb = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Build a two-column pandas DataFrame (review text + sentiment id) from the
# labelled training split.
train_split = imdb['train']
train_data = pd.DataFrame(
    {
        'text': train_split['text'],
        'sentiment': train_split['label'],  # 0 is negative, 1 is positive
    }
)

In [None]:
# Clean the text data
def clean_text(text):
    """Normalise a raw text into lowercase, punctuation-free, single-spaced form.

    Steps, in order: coerce to ``str``, lowercase, replace HTML line-break tags
    with a space, drop URLs, drop ``@mentions``, drop ``#`` symbols, drop every
    remaining character that is neither a word character nor whitespace, and
    finally collapse runs of whitespace and trim both ends.

    Args:
        text: The text to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Check if text is a string, otherwise convert it to string
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # Turn <br>, <br/> and <br /> into a space *before* punctuation is stripped.
    # Otherwise the angle brackets and slash are deleted and the leftover "br"
    # is glued onto the neighbouring word ("movie.<br />" -> "moviebr").
    # NOTE(review): raw IMDB reviews are expected to contain these tags - confirm on your data.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Add the cleaned version of every review as a new 'cleaned_text' column.
train_data = train_data.assign(cleaned_text=train_data['text'].map(clean_text))

In [None]:
# Check class balance: count how many reviews carry each sentiment id
sentiment_counts = train_data['sentiment'].value_counts()
print("IMDB dataset sentiment distribution:")
print(sentiment_counts)

IMDB dataset sentiment distribution:
sentiment
0    12500
1    12500
Name: count, dtype: int64


In [None]:
# Hold out 30% of the cleaned reviews, stratified on sentiment so the class
# ratio is preserved in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    train_data['cleaned_text'],
    train_data['sentiment'],
    test_size=0.3,
    random_state=42,
    stratify=train_data['sentiment'],
)

# Split the held-out 30% in half: one half for validation, one for testing.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Verify splits have both classes
for header, split_labels in (
    ("\nTraining set class distribution:", y_train),
    ("\nValidation set class distribution:", y_val),
    ("\nTest set class distribution:", y_test),
):
    print(header)
    print(pd.Series(split_labels).value_counts())


Training set class distribution:
sentiment
1    8750
0    8750
Name: count, dtype: int64

Validation set class distribution:
sentiment
1    1875
0    1875
Name: count, dtype: int64

Test set class distribution:
sentiment
1    1875
0    1875
Name: count, dtype: int64


In [None]:

# Create a dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for a BERT-style model.

    Args:
        texts: Positionally indexable collection of texts (e.g. a pandas
            Series or a list). Items are converted with ``str()`` before
            tokenization.
        labels: Integer class ids aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, ...)`` with
            ``return_tensors='pt'``; its result must provide ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(values, idx):
        """Return the ``idx``-th element by position.

        pandas objects are read through ``.iloc`` so that a shuffled or
        non-contiguous index (as produced by a train/test split) is never
        interpreted as a label; plain sequences are indexed directly.
        """
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (the tokenizer output flattened) and a scalar ``'label'`` tensor
            of dtype ``torch.long``.
        """
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)
        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__` and accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by a two-layer classification head.

    The head is: dropout -> linear -> ReLU -> dropout -> linear, applied to the
    encoder's ``pooler_output``. Attribute names (``bert``, ``drop1``, ``fc1``,
    ``drop2``, ``fc2``) are kept stable so previously saved state dicts still
    load.

    Args:
        n_classes: Number of output logits.
        pretrained_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        hidden_dim: Width of the intermediate linear layer.
        dropout: Dropout probability used before each linear layer.
    """

    def __init__(self, n_classes=2, pretrained_name='bert-base-uncased',
                 hidden_dim=256, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.drop1 = nn.Dropout(p=dropout)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, hidden_dim)
        self.drop2 = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (un-normalised) class logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Output of the final linear layer, with ``n_classes`` values in the
            last dimension.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        pooled_output = outputs.pooler_output
        x = self.drop1(pooled_output)
        # Functional ReLU: avoids constructing a new nn.ReLU module on every call.
        x = nn.functional.relu(self.fc1(x))
        x = self.drop2(x)
        return self.fc2(x)
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets: one SentimentDataset per split, all sharing the tokenizer
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Create data loaders: only the training loader reshuffles its data
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
)

# Initialize the model and move it to the selected device
model = SentimentClassifier().to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Training function
def train_model(model, train_loader, val_loader, epochs=4, save_path='best_sentiment_model.pth'):
    """Fine-tune ``model`` and checkpoint the weights with the best validation accuracy.

    Every epoch performs one optimisation pass over ``train_loader`` followed
    by a gradient-free pass over ``val_loader``. The average validation loss
    drives the learning-rate scheduler; validation accuracy decides whether the
    current weights are written to ``save_path``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits. Batches are moved to the
            module-level ``device``, so the model is expected to live there.
        train_loader: Sized iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        val_loader: Same structure as ``train_loader``, used for validation.
        epochs: Number of passes over the training data.
        save_path: File the best ``state_dict`` is saved to.

    Returns:
        A list with one dict per epoch holding ``'epoch'``, ``'train_loss'``,
        ``'val_loss'`` and ``'val_accuracy'``.
    """
    # Set up optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    # Halve the learning rate once the validation loss stops improving.
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, factor=0.5)
    loss_fn = nn.CrossEntropyLoss().to(device)

    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint, even when its accuracy is exactly 0.0; callers load
    # `save_path` right after training.
    best_accuracy = float('-inf')
    history = []

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")

        # Training
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Calculate loss
            loss = loss_fn(outputs, labels)
            train_loss += loss.item()
            # Backward pass and optimize
            loss.backward()
            optimizer.step()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation
        model.eval()
        val_loss = 0
        predictions = []
        actual_labels = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Calculate loss
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()

                # Get predictions
                _, preds = torch.max(outputs, dim=1)
                # Store predictions and true labels
                predictions.extend(preds.cpu().tolist())
                actual_labels.extend(labels.cpu().tolist())

        avg_val_loss = val_loss / len(val_loader)
        accuracy = accuracy_score(actual_labels, predictions)
        scheduler.step(avg_val_loss)

        print(f"Validation loss: {avg_val_loss:.4f}")
        print(f"Validation accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        # Pass both class ids explicitly. Deriving them from the labels that
        # happen to be present breaks the pairing with the two target_names
        # whenever the validation data contains a single class.
        print(classification_report(actual_labels, predictions, labels=[0, 1], target_names=['Negative', 'Positive']))

        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': accuracy,
        })

        # Save best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with accuracy: {best_accuracy:.4f}")

    return history


# Train the model
train_model(model, train_loader, val_loader, epochs=8)

# Load the best model. map_location remaps the stored tensors onto the device
# selected for this session, so the checkpoint also loads when it was written
# from a different device (e.g. saved on GPU, reloaded on a CPU-only runtime).
model.load_state_dict(torch.load('best_sentiment_model.pth', map_location=device))


Epoch 1/8


Training: 100%|██████████| 1094/1094 [06:42<00:00,  2.72it/s]


Average training loss: 0.3704


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.70it/s]


Validation loss: 0.2732
Validation accuracy: 0.8861

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.87      0.88      1875
    Positive       0.87      0.91      0.89      1875

    accuracy                           0.89      3750
   macro avg       0.89      0.89      0.89      3750
weighted avg       0.89      0.89      0.89      3750

Best model saved with accuracy: 0.8861

Epoch 2/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.2098


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.71it/s]


Validation loss: 0.2845
Validation accuracy: 0.8840

Classification Report:
              precision    recall  f1-score   support

    Negative       0.91      0.85      0.88      1875
    Positive       0.86      0.92      0.89      1875

    accuracy                           0.88      3750
   macro avg       0.89      0.88      0.88      3750
weighted avg       0.89      0.88      0.88      3750


Epoch 3/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.1109


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.65it/s]


Validation loss: 0.3317
Validation accuracy: 0.8813

Classification Report:
              precision    recall  f1-score   support

    Negative       0.89      0.87      0.88      1875
    Positive       0.87      0.89      0.88      1875

    accuracy                           0.88      3750
   macro avg       0.88      0.88      0.88      3750
weighted avg       0.88      0.88      0.88      3750


Epoch 4/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.0447


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.70it/s]


Validation loss: 0.4279
Validation accuracy: 0.8784

Classification Report:
              precision    recall  f1-score   support

    Negative       0.85      0.92      0.88      1875
    Positive       0.91      0.84      0.87      1875

    accuracy                           0.88      3750
   macro avg       0.88      0.88      0.88      3750
weighted avg       0.88      0.88      0.88      3750


Epoch 5/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.0257


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.72it/s]


Validation loss: 0.4702
Validation accuracy: 0.8864

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.90      0.89      1875
    Positive       0.90      0.87      0.89      1875

    accuracy                           0.89      3750
   macro avg       0.89      0.89      0.89      3750
weighted avg       0.89      0.89      0.89      3750

Best model saved with accuracy: 0.8864

Epoch 6/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.0143


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.71it/s]


Validation loss: 0.5425
Validation accuracy: 0.8867

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.89      0.89      1875
    Positive       0.89      0.88      0.89      1875

    accuracy                           0.89      3750
   macro avg       0.89      0.89      0.89      3750
weighted avg       0.89      0.89      0.89      3750

Best model saved with accuracy: 0.8867

Epoch 7/8


Training: 100%|██████████| 1094/1094 [06:55<00:00,  2.63it/s]


Average training loss: 0.0093


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.69it/s]


Validation loss: 0.5800
Validation accuracy: 0.8845

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.89      0.88      1875
    Positive       0.89      0.88      0.88      1875

    accuracy                           0.88      3750
   macro avg       0.88      0.88      0.88      3750
weighted avg       0.88      0.88      0.88      3750


Epoch 8/8


Training: 100%|██████████| 1094/1094 [06:56<00:00,  2.63it/s]


Average training loss: 0.0083


Validating: 100%|██████████| 235/235 [00:41<00:00,  5.63it/s]


Validation loss: 0.6149
Validation accuracy: 0.8867

Classification Report:
              precision    recall  f1-score   support

    Negative       0.89      0.88      0.89      1875
    Positive       0.88      0.90      0.89      1875

    accuracy                           0.89      3750
   macro avg       0.89      0.89      0.89      3750
weighted avg       0.89      0.89      0.89      3750



<All keys matched successfully>

In [None]:
#Evaluate on test set
def evaluate_model(model, test_loader):
    """Print accuracy and a per-class report for ``model`` over ``test_loader``.

    The model is switched to eval mode and run without gradient tracking. Each
    batch must be a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'label'`` tensors; they are moved to the module-level ``device`` before
    the forward pass. Nothing is returned - results are only printed.
    """
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            targets = batch['label'].to(device)

            # Predicted class = index of the largest logit in each row
            batch_preds = torch.max(logits, dim=1).indices

            all_preds += batch_preds.cpu().tolist()
            all_targets += targets.cpu().tolist()

    accuracy = accuracy_score(all_targets, all_preds)

    print(f"\nTest accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    report = classification_report(all_targets, all_preds, target_names=['Negative', 'Positive'])
    print(report)

# Evaluate the model on the held-out test loader; evaluate_model prints the
# accuracy and a per-class classification report.
evaluate_model(model, test_loader)

Testing: 100%|██████████| 235/235 [00:41<00:00,  5.73it/s]


Test accuracy: 0.8848

Classification Report:
              precision    recall  f1-score   support

    Negative       0.87      0.90      0.89      1875
    Positive       0.90      0.87      0.88      1875

    accuracy                           0.88      3750
   macro avg       0.89      0.88      0.88      3750
weighted avg       0.89      0.88      0.88      3750






In [None]:
# Function to predict sentiment for new texts
def predict_sentiment(text, model, tokenizer, max_len=128):
    """Classify a single piece of text as ``"Positive"`` or ``"Negative"``.

    The text goes through the same ``clean_text`` normalisation used for the
    training data, is tokenized to exactly ``max_len`` positions, and is run
    through ``model`` in eval mode without gradient tracking.

    Args:
        text: The raw text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits; inputs are moved to the
            module-level ``device``.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, ...)`` with
            ``return_tensors='pt'``.
        max_len: Padding/truncation length. Should match the length the model
            was trained with (the dataset class defaults to 128).

    Returns:
        ``"Positive"`` when the arg-max class id is 1, otherwise ``"Negative"``.
    """
    # Clean the text
    text = clean_text(text)

    # Tokenize (direct call; `encode_plus` is deprecated in favour of `__call__`)
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs, dim=1)

    sentiment_label = "Positive" if prediction.item() == 1 else "Negative"
    return sentiment_label
# Test with some examples
# Three hand-written probes: an enthusiastic one, a clearly unhappy one and a
# lukewarm one.
test_texts = [
    "I absolutely love this product! It's amazing!",
    "This was a terrible experience, I'm very disappointed.",
    "The service was okay, nothing special."
]

# Classify each probe with the model and tokenizer defined above, then print
# the text next to its predicted label.
for text in test_texts:
    sentiment = predict_sentiment(text, model, tokenizer)
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}\n")

Text: I absolutely love this product! It's amazing!
Sentiment: Positive

Text: This was a terrible experience, I'm very disappointed.
Sentiment: Negative

Text: The service was okay, nothing special.
Sentiment: Negative

