# With optuna

```python


import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.pytorch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset

# 1. Load Twitter Dataset
df = pd.read_csv("https://raw.githubusercontent.com/PratishMashankar/twitter-sentiment-analysis/refs/heads/master/data/Twitter_Data.csv")

# Map sentiment names to the integer class ids the classifier is trained on.
# NOTE(review): any `category` value that is not a key of this mapping becomes
# NaN here - confirm the CSV really stores 'Positive'/'Negative' strings.
df['category'] = df['category'].map({'Positive': 1, 'Negative': 0})

# Drop rows that cannot be used for training: missing text cannot be
# tokenized and an unmapped (NaN) label cannot become a class index.
# `.map` yields floats whenever a value was unmapped, so labels are cast back
# to int once the NaN rows are gone.
df = df.dropna(subset=['clean_text', 'category']).astype({'category': int})

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'].tolist(),
    df['category'].tolist(),
    test_size=0.2,
    random_state=42
)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of integer class ids aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, ...)`` that returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed) and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Remove only the leading batch axis. A bare `.squeeze()` would also
        # drop the sequence axis when it has length 1, producing 0-d tensors
        # that the default collate function stacks into the wrong shape.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 2. Train Model Function
def train_model(model, train_loader, val_loader, optimizer, scheduler, criterion, device, trial_number,
                num_epochs=5, patience=3):
    """Fine-tune ``model`` with per-epoch validation and early stopping.

    Each epoch runs one optimisation pass over ``train_loader`` (stepping the
    scheduler after every batch) and then measures the mean loss over
    ``val_loader``. Both means are logged with ``mlflow.log_metric`` as
    ``train_loss`` / ``val_loss``. Whenever the validation loss improves, the
    weights are written to ``best_model_trial{trial_number}.pth``.

    Args:
        model: Network called as ``model(**batch)``; its output must expose
            ``.logits``.
        train_loader: Sized iterable of dict batches of tensors that include
            a ``'label'`` entry.
        val_loader: Same batch format as ``train_loader``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per training batch.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
        device: Device every batch tensor is moved to.
        trial_number: Used only to name the checkpoint file.
        num_epochs: Maximum number of epochs. The scheduler is created by the
            caller, so its total step count should match this value.
        patience: Number of consecutive epochs without validation-loss
            improvement after which training stops.

    Returns:
        The same ``model`` object, holding the weights of the last epoch that
        ran; the best weights exist only in the checkpoint file.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # mean losses are computed.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    checkpoint_path = f"best_model_trial{trial_number}.pth"
    best_val_loss = np.inf
    patience_counter = 0

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # The model receives only its inputs; the loss is computed
            # separately with `criterion`.
            labels = batch.pop('label')

            optimizer.zero_grad()
            outputs = model(**batch)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop('label')
                outputs = model(**batch)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)

        # Early stopping: checkpoint on improvement, otherwise count the
        # stale epoch and stop once `patience` of them have accumulated.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    return model

# 3. Log Parameters
def log_params(trial):
    """Record the trial's sampled hyperparameters with ``mlflow.log_params``.

    Args:
        trial: Object whose ``params`` mapping contains ``"learning_rate"``
            and ``"batch_size"``; a missing key raises ``KeyError``.
    """
    tracked_names = ("learning_rate", "batch_size")
    mlflow.log_params({name: trial.params[name] for name in tracked_names})

# 4. Evaluate Model
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and score its predictions.

    Batches are moved to the device of the model's first parameter. The
    predicted class of each row is the arg-max of its logits.

    Args:
        model: Network called as ``model(**batch)``; its output must expose
            ``.logits``.
        dataloader: Iterable of dict batches of tensors that include a
            ``'label'`` entry.

    Returns:
        Tuple ``(accuracy, report, conf_matrix)``: the results of
        ``accuracy_score``, ``classification_report(..., output_dict=True)``
        and ``confusion_matrix`` over all batches.
    """
    model_device = next(model.parameters()).device
    model.eval()

    predicted, expected = [], []
    with torch.no_grad():
        for raw_batch in dataloader:
            inputs = {name: tensor.to(model_device) for name, tensor in raw_batch.items()}
            targets = inputs.pop('label')
            logits = model(**inputs).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return (
        accuracy_score(expected, predicted),
        classification_report(expected, predicted, output_dict=True),
        confusion_matrix(expected, predicted),
    )

# 5. Log Metrics and Artifacts
def log_metrics_artifacts(accuracy, report, conf_matrix, trial_number, model):
    """Log evaluation results, a confusion-matrix image and the model to MLflow.

    Args:
        accuracy: Value logged as ``test_accuracy``.
        report: Mapping of label -> metrics. Entries whose value is a dict
            are logged as ``"{label}_{metric_name}"``; other entries are
            skipped.
        conf_matrix: Matrix of integer counts drawn as an annotated heatmap.
        trial_number: Used to name the image file
            ``confusion_matrix_trial{trial_number}.png``.
        model: PyTorch model logged under ``models/distilbert_model``.
    """
    mlflow.log_metric("test_accuracy", accuracy)

    for label, metrics in report.items():
        if isinstance(metrics, dict):
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

    conf_matrix_path = f"confusion_matrix_trial{trial_number}.png"
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        plt.savefig(conf_matrix_path)
    finally:
        # This function runs once per trial; without closing, pyplot keeps
        # every figure alive and they accumulate over the study.
        plt.close(fig)
    mlflow.log_artifact(conf_matrix_path)

    mlflow.pytorch.log_model(model, "models/distilbert_model")

# 6. Objective Function
def objective(trial):
    """Run one Optuna trial: fine-tune DistilBERT and return its validation error.

    Samples a learning rate (log-uniform in [1e-5, 5e-5]) and a batch size
    (8, 16 or 32), trains inside an MLflow run named ``Trial_<number>``,
    reloads the best checkpoint written during training and evaluates it on
    the validation split.

    Args:
        trial: Optuna trial used to sample hyperparameters; ``trial.number``
            names the MLflow run and the checkpoint file.

    Returns:
        ``1.0 - accuracy`` on the validation split, so lower is better.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])

    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. `eps` and
    # `weight_decay` are pinned to the values the transformers class used by
    # default so the optimisation hyperparameters stay the same.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
    train_dataset = TwitterDataset(train_texts, train_labels, tokenizer)
    val_dataset = TwitterDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # The scheduler is stepped once per batch, so its horizon is
    # batches-per-epoch times the number of epochs train_model runs (5).
    num_epochs = 5
    num_training_steps = len(train_loader) * num_epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    criterion = nn.CrossEntropyLoss()

    mlflow.set_experiment("DistilBERT_Twitter_Optuna")
    with mlflow.start_run(run_name=f"Trial_{trial.number}"):
        log_params(trial)

        model = train_model(model, train_loader, val_loader, optimizer, scheduler, criterion, device, trial.number)
        # train_model returns last-epoch weights; score the best checkpoint
        # instead. map_location keeps the load valid on the current device.
        model.load_state_dict(torch.load(f"best_model_trial{trial.number}.pth", map_location=device))

        accuracy, report, conf_matrix = evaluate_model(model, val_loader)
        log_metrics_artifacts(accuracy, report, conf_matrix, trial.number, model)

    return 1.0 - accuracy

# 7. Run Optuna Study
def run_optuna_study(n_trials=10):
    """Minimise ``objective`` with Optuna and print the winning configuration.

    Args:
        n_trials: Number of trials passed to ``study.optimize``.

    Returns:
        The finished ``optuna`` study.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    # The objective returns 1 - accuracy, so invert the best value to report
    # accuracy.
    print("Best hyperparameters:", study.best_params)
    print("Best accuracy:", 1.0 - study.best_value)

    return study

# 8. Execute
# Run the ten-trial hyperparameter search and keep the finished study for
# inspection. (A stray trailing quote previously made this line a syntax error.)
best_study = run_optuna_study(n_trials=10)

```


# Without optuna

In [1]:
!pip install torch transformers mlflow dagshub



In [2]:
import torch
import dagshub
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.pytorch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset


import mlflow
import dagshub
import json

# Read the DagsHub app token from a JSON file kept outside the notebook.
with open('/content/youtube_dagshub.json', 'r', encoding='utf-8') as token_file:
    token = json.load(token_file)

# Repository that receives the MLflow runs.
DAGSHUB_USER = "Prayesh13"
DAGSHUB_REPO = "youtube-comments-analysis"

# Point MLflow at the repository's tracking endpoint, register the token,
# then initialise the DagsHub integration for the same repository.
mlflow.set_tracking_uri(f"https://dagshub.com/{DAGSHUB_USER}/{DAGSHUB_REPO}.mlflow")
dagshub.auth.add_app_token(token['youtube_dagshub'])
dagshub.init(repo_owner=DAGSHUB_USER, repo_name=DAGSHUB_REPO, mlflow=True)




In [3]:
# 1. Load the preprocessed sentiment dataset
df = pd.read_csv("/content/sentiment_processed.csv")

# Shift sentiment labels -1/0/1 to the class ids 0/1/2 used as targets.
# Any other value becomes NaN.
df['label'] = df['label'].map({-1:0, 0:1, 1:2})

# Drop rows that cannot be used for training: missing text cannot be
# tokenized and an unmapped (NaN) label cannot become a class index.
# `.map` yields floats whenever a value was unmapped, so labels are cast back
# to int once the NaN rows are gone.
df = df.dropna(subset=['content', 'label']).astype({'label': int})

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['content'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of integer class ids aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, ...)`` that returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed) and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Remove only the leading batch axis. A bare `.squeeze()` would also
        # drop the sequence axis when it has length 1, producing 0-d tensors
        # that the default collate function stacks into the wrong shape.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 2. Prepare DataLoaders
# Wrap each split in a dataset that tokenizes lazily, one item at a time.
train_dataset = TwitterDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = TwitterDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Only the training split is reshuffled each epoch; validation keeps its
# original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

```python

# 3. Initialize Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The labels were remapped to the three class ids 0/1/2 when the dataset was
# loaded, so the classification head needs three outputs. With two outputs,
# class id 2 is out of range for CrossEntropyLoss.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3).to(device)

# 4. Training Loop
num_epochs = 5
best_val_loss = np.inf
patience = 3
patience_counter = 0
# Validation predictions/labels of the best epoch so far (None until the
# first checkpoint is written).
best_preds, best_labels = None, None

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
# The scheduler is stepped once per batch, so its horizon is derived from
# num_epochs rather than repeating the literal.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

mlflow.set_experiment("Pretrained Model")
with mlflow.start_run(run_name="distilbert"):
    mlflow.set_tag("model_type", "distilbert")
    mlflow.log_param("model_name", "distilbert-base-uncased")
    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # The model receives only its inputs; the loss is computed
            # separately with `criterion`.
            labels = batch.pop('label')

            optimizer.zero_grad()
            outputs = model(**batch)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Validation
        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop('label')
                outputs = model(**batch)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        accuracy = accuracy_score(all_labels, all_preds)
        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", accuracy, step=epoch)

        print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Accuracy: {accuracy:.4f}")

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_baseline_model.pth")
            # Keep this epoch's predictions so the final report describes the
            # checkpointed weights.
            best_preds, best_labels = all_preds, all_labels
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

    # Final Evaluation
    # After early stopping the in-memory weights belong to an epoch that did
    # not improve, so restore the best checkpoint and report on its
    # predictions. If no checkpoint was ever written, fall back to the last
    # epoch.
    if best_preds is not None:
        model.load_state_dict(torch.load("best_baseline_model.pth", map_location=device))
        all_preds, all_labels = best_preds, best_labels
    report = classification_report(all_labels, all_preds, output_dict=True)
    conf_matrix = confusion_matrix(all_labels, all_preds)

    # Log Confusion Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix_baseline.png")
    mlflow.log_artifact("confusion_matrix_baseline.png")

    # Log Model
    mlflow.pytorch.log_model(model, "models/distilbert_baseline")
```

In [5]:
import os

# Debugging aid: set this before the training loop runs.
# NOTE(review): per the CUDA documentation, CUDA_LAUNCH_BLOCKING=1 makes
# kernel launches synchronous so device-side errors surface at the call that
# caused them - confirm. It presumably only takes effect if set before CUDA
# is initialised in this process, so a runtime restart may be needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Before loading the model
# Release cached GPU memory that earlier cells may still be holding.
torch.cuda.empty_cache()

# 3. Initialize Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Three outputs, one per class id 0/1/2 produced by the label remapping.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3).to(device)

# 4. Training Loop
num_epochs = 5
best_val_loss = np.inf
patience = 3
patience_counter = 0
# Validation predictions/labels of the best epoch so far (None until the
# first checkpoint is written).
best_preds, best_labels = None, None

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
# The scheduler is stepped once per batch, so its horizon is derived from
# num_epochs rather than repeating the literal.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

mlflow.set_experiment("Pretrained Model")
with mlflow.start_run(run_name="distilbert"):
    mlflow.set_tag("model_type", "distilbert")
    mlflow.log_param("model_name", "distilbert-base-uncased")
    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()} # Move batch to device
            # The model receives only its inputs; the loss is computed
            # separately with `criterion`.
            labels = batch.pop('label')

            optimizer.zero_grad()
            outputs = model(**batch)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Validation
        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop('label')
                outputs = model(**batch)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        accuracy = accuracy_score(all_labels, all_preds)
        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", accuracy, step=epoch)

        print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Accuracy: {accuracy:.4f}")

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_baseline_model.pth")
            # Keep this epoch's predictions so the final report describes the
            # checkpointed weights.
            best_preds, best_labels = all_preds, all_labels
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

    # Final Evaluation
    # After early stopping the in-memory weights belong to an epoch that did
    # not improve, so restore the best checkpoint and report on its
    # predictions. If no checkpoint was ever written, fall back to the last
    # epoch.
    if best_preds is not None:
        model.load_state_dict(torch.load("best_baseline_model.pth", map_location=device))
        all_preds, all_labels = best_preds, best_labels
    report = classification_report(all_labels, all_preds, output_dict=True)
    conf_matrix = confusion_matrix(all_labels, all_preds)

    # Log Confusion Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix_baseline.png")
    mlflow.log_artifact("confusion_matrix_baseline.png")

    # Log Model
    mlflow.pytorch.log_model(model, "models/distilbert_baseline")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# !pip install --upgrade --force-reinstall transformers