# Installing Libraries

In [1]:
!pip install torch



In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import torch.nn as nn
import numpy as np
from torch.optim import AdamW # Corrected AdamW import

# Important Variables

In [3]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model and Tokenizer names
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
NUM_EPOCHS = 3 # This will be adjusted for gradual unfreezing later
LEARNING_RATE = 2e-5
MAX_LENGTH = 128


Using device: cpu


# Importing Dataset from Kaggle

In [5]:
pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.0/68.0 kB[0m [31m489.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: kagglehub
Successfully installed kagglehub-0.3.12
Note: you may need to restart the kernel to use updated packages.


In [11]:
import kagglehub

# Download latest version
data = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", data)

Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [14]:
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        if filename.endswith('.csv'):
            csv_file_path = os.path.join(dirname, filename)
            print(f"Found CSV file: {csv_file_path}")
            # Load the CSV file into a pandas DataFrame
            data = pd.read_csv(csv_file_path)
            display(data.head())
            break # Assuming there's only one CSV file of interest

Found CSV file: /home/jovyan/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1/IMDB Dataset.csv


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data Preprocessing & Cleaning 

In [15]:
data.shape

(50000, 2)

In [16]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [17]:
data.isnull().sum()
imdb_data = data.dropna()

In [18]:
imdb_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [19]:
!pip install contractions
!pip install nltk

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m825.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [20]:
import string
import re
import contractions
import nltk

In [21]:
imdb_dataset = imdb_data.iloc[:1000]

In [22]:
imdb_dataset.shape

(1000, 2)

In [23]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_contractions(text):
  expanded_text = contractions.fix(text)
  return expanded_text

def remove_punc(text):
    unwanted = set(string.punctuation + string.digits)
    return ''.join(char for char in str(text) if char not in unwanted)

def remove_punc(text):
    # Ensure input is a string; handle potential NaN/None
    if pd.isna(text):
        return text
    # Keep digits for now if not explicitly asked to remove, but previous context suggests removing
    unwanted = set(string.punctuation + string.digits) # Includes digits as per previous conversation
    return ''.join(char for char in str(text) if char not in unwanted)

def remove_stopwords(words): # Expects a list of words
    if not isinstance(words, list): # Handle cases where input might not be a list (e.g., NaN after tokenization failed)
        return words
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word.lower() not in stop_words]

def lemmatize_words(words): # Expects a list of words
    if not isinstance(words, list): # Handle cases where input might not be a list
        return words
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]



In [24]:
imdb_dataset.columns

Index(['review', 'sentiment'], dtype='object')

In [25]:
# --- 3. Apply the functions sequentially using .loc for 'review' ---

print("Applying text preprocessing to 'review' column...")

# Step 1: Convert review column to string type
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].astype(str)

# Step 2: Convert to lowercase
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].str.lower()

# Step 3: Remove HTML tags
print("- Removing HTML tags...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(remove_html_tags)

# Step 4: Remove URLs
print("- Removing URLs...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(remove_url)

# Step 5: Remove contractions
print("- Removing contractions...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(remove_contractions)

# Step 6: Remove punctuation and digits
print("- Removing punctuation and digits...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(remove_punc)

# Step 7: Remove stopwords
print("- Removing stopwords...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(remove_stopwords)

# Step 8: Lemmatize words
print("- Lemmatizing words...")
imdb_dataset.loc[:, 'review'] = imdb_dataset['review'].apply(lemmatize_words)

print("\nAll text preprocessing steps completed for 'review'! ✅")
print("\nFirst 5 rows of the processed 'review' column:")
print(imdb_dataset['review'].head())

Applying text preprocessing to 'review' column...
- Removing HTML tags...
- Removing URLs...
- Removing contractions...
- Removing punctuation and digits...
- Removing stopwords...
- Lemmatizing words...

All text preprocessing steps completed for 'review'! ✅

First 5 rows of the processed 'review' column:
0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there is a family where a little boy...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [26]:
print(imdb_dataset['review'].head(5))

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there is a family where a little boy...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


# Data-Splitting for Training 

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Assumed: imdb_data is already loaded from your previous steps ---
# Make sure 'imdb_data' is the name of your DataFrame with 'review' and 'sentiment' columns
# If not, you'll need to ensure the variable name matches.
# For example, if you loaded it as 'df', then use 'df' below.

# 1. Define your target variable (y)
# This will be the column you want to predict.
target_column = 'sentiment' # Changed to 'sentiment'
y = imdb_dataset[target_column]

# 2. Define your features (X)
# This selects all columns EXCEPT the target_column.
# In this case, 'review' is your feature.
X = imdb_dataset.drop(columns=[target_column]) # 'review' is implicitly selected as it's the only other column

# 3. Perform the train-test split
# We'll use 20% of the data for testing (test_size=0.20) and 80% for training.
# random_state ensures your split is reproducible.
# Since 'sentiment' is a categorical value (classification task), it's good practice to use 'stratify'
# to ensure that the proportions of each sentiment class are maintained in both train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y # Added stratify=y
)

print(f"Data split complete with '{target_column}' as the target.")
print(f"Total Features (X) used: {len(X.columns)} columns")
print(f"Target (y) used: {target_column}")

print(f"\nTraining set samples (features): {len(X_train)} rows")
print(f"Training set samples (target): {len(y_train)} rows")
print(f"Test set samples (features): {len(X_test)} rows")
print(f"Test set samples (target): {len(y_test)} rows")

print("\nFirst 3 rows of X_train (features for training):")
print(X_train.head(3))
print("\nFirst 3 values of y_train (target for training):")
print(y_train.head(3))

Data split complete with 'sentiment' as the target.
Total Features (X) used: 1 columns
Target (y) used: sentiment

Training set samples (features): 800 rows
Training set samples (target): 800 rows
Test set samples (features): 200 rows
Test set samples (target): 200 rows

First 3 rows of X_train (features for training):
                                                review
298  going into see seven pounds i was not clearly ...
929  i saw this movie in a theater while on vacatio...
1    a wonderful little production the filming tech...

First 3 values of y_train (target for training):
298    positive
929    positive
1      positive
Name: sentiment, dtype: object


# Bert-Tokenizer & important functions

In [46]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded.")

Tokenizer loaded.


In [47]:
def get_auxiliary_sentence(label):
    if label == 1:
        return "This review expresses positive sentiment."
    else:
        return "This review expresses negative sentiment."

# DataLoaders And Datasets for making data batches etc.

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoTokenizer # Assuming you'll use a Hugging Face tokenizer

# --- Assumed: imdb_data is already loaded and X_train, X_test, y_train, y_test are defined ---
# If you need to re-run the data loading and splitting for testing,
# make sure to use the correct 'imdb_data' DataFrame as established previously.

# Example of how imdb_data might look and how to split it if you're testing this block in isolation:
# data = {'review': ["This movie was great!", "Terrible film, avoid at all costs.", "Decent acting, weak plot."],
#         'sentiment': ["positive", "negative", "negative"]}
# imdb_data = pd.DataFrame(data)
#
# target_column = 'sentiment'
# X = imdb_data.drop(columns=[target_column])
# y = imdb_data[target_column]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)


# --- 1. Identify the text column for 'review' and convert sentiment labels ---
text_column = 'review' # Directly set to 'review' as per your dataset

train_texts = X_train[text_column].tolist()
test_texts = X_test[text_column].tolist()

# Convert sentiment labels to numerical format (e.g., positive: 1, negative: 0)
# This is crucial for PyTorch models.
# Assuming your sentiment is 'positive' and 'negative'. Adjust if different.
sentiment_mapping = {'positive': 1, 'negative': 0}
train_labels = [sentiment_mapping[label] for label in y_train.tolist()]
test_labels = [sentiment_mapping[label] for label in y_test.tolist()]

# --- 3. Define the IMDbBERT4TCDataset Class (Simplified for standard classification) ---
class IMDbBERT4TCDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # For standard sentiment classification, we just encode the text.
        # No need for get_auxiliary_sentence unless specifically required by your model architecture.
        encoded_input = self.tokenizer(
            text, # Only passing the text here
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded_input['input_ids'].squeeze(),
            'attention_mask': encoded_input['attention_mask'].squeeze(),
            'token_type_ids': encoded_input['token_type_ids'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- 4. Create Datasets and DataLoaders ---
train_dataset = IMDbBERT4TCDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = IMDbBERT4TCDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Datasets and DataLoaders created successfully using the provided train-test split.")
print(f"Number of samples in training dataset: {len(train_dataset)}")
print(f"Number of samples in test dataset: {len(test_dataset)}")

# Optional: Verify a batch
# for batch in train_dataloader:
#     print("\nSample batch structure:")
#     print(f"Input IDs shape: {batch['input_ids'].shape}")
#     print(f"Attention Mask shape: {batch['attention_mask'].shape}")
#     print(f"Token Type IDs shape: {batch['token_type_ids'].shape}")
#     print(f"Labels shape: {batch['labels'].shape}")
#     break

Datasets and DataLoaders created successfully using the provided train-test split.
Number of samples in training dataset: 800
Number of samples in test dataset: 200


In [49]:
class BERT4TCModel(nn.Module):
    def __init__(self, model_name, num_labels=2):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        return outputs

model = BERT4TCModel(MODEL_NAME, num_labels=2).to(device)
print("BERT4TC Model conceptual class defined and instantiated.")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT4TC Model conceptual class defined and instantiated.


# Optimizer And Scheduler Setup, Learning Rate Params

In [50]:
import re
# --- Optimizer and Scheduler setup (REVISED for robust layer grouping) ---
no_decay = ['bias', 'LayerNorm.weight']

# Create separate lists for each parameter group
embeddings_params = []
encoder_layer_params_lower = [] # Layers 0-5
encoder_layer_params_upper = [] # Layers 6-11
pooler_classifier_params = []
no_decay_params = [] # Parameters for which weight_decay is 0.0

# Regex to find layer number, e.g., "layer.X."
layer_pattern = re.compile(r'bert\.encoder\.layer\.(\d+)\.')

for n, p in model.named_parameters():
    if any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        # Check for embeddings
        if 'bert.embeddings' in n:
            embeddings_params.append(p)
        # Check for encoder layers
        elif 'bert.encoder.layer.' in n: # Using in for initial filter, then regex for precise parsing
            match = layer_pattern.search(n)
            if match:
                layer_idx = int(match.group(1)) # Extract the number following 'layer.'
                if 0 <= layer_idx < 6:
                    encoder_layer_params_lower.append(p)
                elif 6 <= layer_idx < 12: # BERT-base has 12 layers (0-11)
                    encoder_layer_params_upper.append(p)
                # else: print(f"Warning: Layer index out of expected range for BERT-base: {layer_idx} in {n}")
            # else:
            #     print(f"Warning: 'bert.encoder.layer.' found but no layer number parsed for {n}")
        # Check for pooler and classifier
        elif 'bert.pooler' in n or 'classifier' in n:
            pooler_classifier_params.append(p)
        # else:
        #     # For debugging any parameters not categorized
        #     # print(f"Parameter not categorized: {n}")
        #     pass


optimizer_grouped_parameters = [
    {
        'params': embeddings_params,
        'lr': LEARNING_RATE * 0.1, # Smallest LR for embeddings
        'weight_decay': 0.01
    },
    {
        'params': encoder_layer_params_lower, # Layers 0-5
        'lr': LEARNING_RATE * 0.3, # Mid-low LR
        'weight_decay': 0.01
    },
    {
        'params': encoder_layer_params_upper, # Layers 6-11
        'lr': LEARNING_RATE * 0.6, # Mid-high LR
        'weight_decay': 0.01
    },
    {
        'params': pooler_classifier_params, # Pooler and classifier
        'lr': LEARNING_RATE, # Highest LR
        'weight_decay': 0.01
    },
    {
        'params': no_decay_params, # Bias and LayerNorm weights
        'lr': 0.0, # Or a very small LR if preferred, but 0.0 is common with weight_decay=0
        'weight_decay': 0.0
    }
]

optimizer = AdamW(optimizer_grouped_parameters)
print("Optimizer with discriminative learning rates configured.")

Optimizer with discriminative learning rates configured.


In [51]:
total_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print("Learning rate scheduler configured.")

Learning rate scheduler configured.


# Important Functions for Gradual Unfreezing

In [52]:
# --- Gradual Unfreezing helper functions ---
def freeze_all_bert_layers(model):
    for name, param in model.named_parameters():
        if "bert" in name:
            param.requires_grad = False
    for name, param in model.named_parameters():
        if "classifier" in name or "pooler" in name:
            param.requires_grad = True

def unfreeze_bert_layer_block(model, layer_index):
    for name, param in model.named_parameters():
        if f"bert.encoder.layer.{layer_index}." in name:
            param.requires_grad = True
            print(f"Unfrozen layer: {name}")

def unfreeze_bert_embeddings(model):
    for name, param in model.named_parameters():
        if "bert.embeddings" in name:
            param.requires_grad = True
            print(f"Unfrozen embeddings: {name}")

freeze_all_bert_layers(model)
print("All BERT layers initially frozen (classifier/pooler trainable).")

All BERT layers initially frozen (classifier/pooler trainable).


In [55]:
!pip3 install mlflow

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<7,>=5.0.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==3.1.0->mlflow)
  Downloading fastapi-0.115.13-py3-none-any.whl.metadata (27 kB)
Coll

# BertFinetuning on our standard way without auxiliary Sentences  -  1

In [None]:
import mlflow
import os # Make sure os is imported

# Define your total epochs
NUM_FREEZE_EPOCHS = 1
NUM_GRADUAL_UNFREEZE_EPOCHS = 1
TOTAL_EPOCHS = NUM_FREEZE_EPOCHS + NUM_GRADUAL_UNFREEZE_EPOCHS + 1


# Calculate total steps for the scheduler using the actual TOTAL_EPOCHS
total_steps = len(train_dataloader) * TOTAL_EPOCHS
warmup_steps = int(0.1 * total_steps)

# Initialize Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print("Learning rate scheduler configured.")


# --- Gradual Unfreezing Helper Functions (as defined previously) ---
def freeze_all_bert_layers(model):
    """Freezes all parameters within the BERT encoder layers."""
    for name, param in model.named_parameters():
        if "bert." in name: # Targets parameters belonging to the BERT base model
            param.requires_grad = False
    print("All BERT encoder layers frozen.")

def unfreeze_bert_layer_block(model, layer_num):
    """Unfreezes a specific BERT encoder layer block."""
    layer_prefix = f"bert.encoder.layer.{layer_num}."
    for name, param in model.named_parameters():
        if layer_prefix in name:
            param.requires_grad = True
    print(f"Unfrozen BERT layer {layer_num}.")

def unfreeze_bert_embeddings(model):
    """Unfreezes BERT's embedding layer."""
    for name, param in model.named_parameters():
        if "bert.embeddings." in name:
            param.requires_grad = True
    print("Unfrozen BERT embeddings.")

# --- Initial Freezing (Before the loop starts) ---
freeze_all_bert_layers(model)

# --- MLflow Setup ---
# Set the MLflow tracking URI (if not set by MLFLOW_TRACKING_URI environment variable)
# By default, it's 'mlruns' in your current directory or can be a remote server.
mlflow.set_tracking_uri("http://mlflow:5000") # CORRECTED LINE: Removed duplicate "http://"

# Ensure the directory for saving models exists
model_save_dir = "mlflow_models"
os.makedirs(model_save_dir, exist_ok=True)
model_save_path = os.path.join(model_save_dir, 'best_sentiment_model_final.pth')


# --- Start MLflow Run ---
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("num_freeze_epochs", NUM_FREEZE_EPOCHS)
    mlflow.log_param("num_gradual_unfreeze_epochs", NUM_GRADUAL_UNFREEZE_EPOCHS)
    mlflow.log_param("total_epochs", TOTAL_EPOCHS)
    mlflow.log_param("warmup_steps_ratio", 0.1) # Log the ratio used for warmup steps
    mlflow.log_param("initial_learning_rate", optimizer.param_groups[0]['lr']) # Log initial LR

    best_accuracy = 0.0

    # --- Training Loop without Checkpointing ---
    for epoch in range(TOTAL_EPOCHS):
        print(f"\n--- Epoch {epoch+1}/{TOTAL_EPOCHS} ---")
        model.train()
        total_train_loss = 0

        # Gradual Unfreezing Logic
        if epoch == 0:
            pass
        elif epoch == NUM_FREEZE_EPOCHS:
            unfreeze_bert_layer_block(model, 11)
        elif epoch == NUM_FREEZE_EPOCHS + 1:
            unfreeze_bert_layer_block(model, 10)
        elif epoch == NUM_FREEZE_EPOCHS + 2:
            unfreeze_bert_layer_block(model, 9)
            unfreeze_bert_embeddings(model)
            print("All specified BERT layers and embeddings unfrozen for subsequent epochs.")

        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step() # This will now be defined

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        mlflow.log_metric("avg_train_loss", avg_train_loss, step=epoch)

        # --- Evaluation Loop ---
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in tqdm(test_dataloader, desc=f"Evaluating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

            loss = outputs.loss
            logits = outputs.logits
            total_eval_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            total_eval_accuracy += (preds == labels).sum().item()

        avg_eval_loss = total_eval_loss / len(test_dataloader)
        avg_eval_accuracy = total_eval_accuracy / len(test_dataset)
        print(f"Average Validation Loss: {avg_eval_loss:.4f}")
        print(f"Validation Accuracy: {avg_eval_accuracy:.4f}")

        mlflow.log_metric("avg_eval_loss", avg_eval_loss, step=epoch)
        mlflow.log_metric("avg_eval_accuracy", avg_eval_accuracy, step=epoch)

        # --- Save the best model ---
        if avg_eval_accuracy > best_accuracy:
            best_accuracy = avg_eval_accuracy
            torch.save(model.state_dict(), model_save_path)
            print(f"New best model saved with accuracy: {best_accuracy:.4f} at {model_save_path}")
            # Log the best model as an artifact
            mlflow.log_artifact(model_save_path, "best_model")
            mlflow.set_tag("best_accuracy", f"{best_accuracy:.4f}") # Set a tag for easy filtering
        else:
            print(f"Validation accuracy did not improve. Best so far: {best_accuracy:.4f}")

    print("\nTraining complete.")
    print(f"Final best validation accuracy: {best_accuracy:.4f}")
    print(f"Best model saved at: {model_save_path}")

    # Log the final best accuracy as a metric at the end of the run
    mlflow.log_metric("final_best_accuracy", best_accuracy)

print("\nMLflow run completed.")

Learning rate scheduler configured.
All BERT encoder layers frozen.

--- Epoch 1/3 ---


Training Epoch 1:   0%|          | 0/50 [00:00<?, ?it/s]

Average Training Loss: 0.7030


Evaluating Epoch 1:   0%|          | 0/13 [00:00<?, ?it/s]

Average Validation Loss: 0.6976
Validation Accuracy: 0.5000
New best model saved with accuracy: 0.5000 at mlflow_models/best_sentiment_model_final.pth

--- Epoch 2/3 ---
Unfrozen BERT layer 11.


Training Epoch 2:   0%|          | 0/50 [00:00<?, ?it/s]

Average Training Loss: 0.7036


Evaluating Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Average Validation Loss: 0.6976
Validation Accuracy: 0.5000
Validation accuracy did not improve. Best so far: 0.5000

--- Epoch 3/3 ---
Unfrozen BERT layer 10.


Training Epoch 3:   0%|          | 0/50 [00:00<?, ?it/s]

Average Training Loss: 0.7022


Evaluating Epoch 3:   0%|          | 0/13 [00:00<?, ?it/s]

Average Validation Loss: 0.6976
Validation Accuracy: 0.5000
Validation accuracy did not improve. Best so far: 0.5000

Training complete.
Final best validation accuracy: 0.5000
Best model saved at: mlflow_models/best_sentiment_model_final.pth
🏃 View run magnificent-lark-241 at: http://mlflow:5000/#/experiments/0/runs/7a15d36666c443b2a703a0d907029111
🧪 View experiment at: http://mlflow:5000/#/experiments/0

MLflow run completed.


# BertFinetuning on our standard way without auxiliary Sentences  -  2

In [68]:
MAX_LENGTH = 256 # Try a larger value if your reviews are long
truncation=True

class IMDbBERT4TCDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Convert numerical label back to its original string form for auxiliary sentence generation
        # This assumes your sentiment_mapping is accessible, or you pass the string label directly
        # For simplicity here, we'll assume the original labels were 'positive' and 'negative'
        original_label_str = 'positive' if label == 1 else 'negative' # If label is 0 or 1
        auxiliary_sentence = get_auxiliary_sentence(label) # Pass numerical label to get aux sentence based on paper's idea

        # KEY CHANGE: Pass text and auxiliary_sentence as a pair to the tokenizer
        # This will create [CLS] review_text [SEP] auxiliary_sentence [SEP]
        encoded_input = self.tokenizer(
            text,
            auxiliary_sentence, # The second sequence
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded_input['input_ids'].squeeze(),
            'attention_mask': encoded_input['attention_mask'].squeeze(),
            'token_type_ids': encoded_input['token_type_ids'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Create Datasets and DataLoaders ---
train_texts = X_train['review'].tolist()
test_texts = X_test['review'].tolist()

sentiment_mapping = {'positive': 1, 'negative': 0}
train_labels = [sentiment_mapping[label] for label in y_train.tolist()]
test_labels = [sentiment_mapping[label] for label in y_test.tolist()]


train_dataset = IMDbBERT4TCDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = IMDbBERT4TCDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("\nDatasets and DataLoaders created successfully with auxiliary sentence input. ✅")
print(f"Number of samples in training dataset: {len(train_dataset)}")
print(f"Number of samples in test dataset: {len(test_dataset)}")

# Verify a batch's token_type_ids to see the effect of sentence pairing
print("\nVerifying token_type_ids for a sample batch:")
for batch in train_dataloader:
    print(f"Input IDs shape: {batch['input_ids'].shape}")
    print(f"Attention Mask shape: {batch['attention_mask'].shape}")
    print(f"Token Type IDs shape (should show 0s and 1s): {batch['token_type_ids'].shape}")
    # Example to show the token_type_ids for the first item in the batch
    print("First item's token_type_ids (should see 0s then 1s):")
    print(batch['token_type_ids'][0])
    # Decode to see the actual text and auxiliary sentence
    print("Decoded first item:")
    decoded_text = tokenizer.decode(batch['input_ids'][0], skip_special_tokens=False)
    print(decoded_text)
    break

# --- Model Definition (as you had it) ---
class BERT4TCModel(nn.Module):
    def __init__(self, model_name, num_labels=2):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids, # This will now be meaningful (0s for review, 1s for aux sentence)
            labels=labels
        )
        return outputs

# Ensure 'device' is defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERT4TCModel(MODEL_NAME, num_labels=2).to(device)
print("\nBERT4TC Model instantiated (ready to use sentence-pair input).")

# --- Optimizer and Scheduler setup (as you had it, no changes needed for this part) ---
no_decay = ['bias', 'LayerNorm.weight']

embeddings_params = []
encoder_layer_params_lower = []
encoder_layer_params_upper = []
pooler_classifier_params = []
no_decay_params = []

layer_pattern = re.compile(r'bert\.encoder\.layer\.(\d+)\.')

for n, p in model.named_parameters():
    if any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        if 'bert.embeddings' in n:
            embeddings_params.append(p)
        elif 'bert.encoder.layer.' in n:
            match = layer_pattern.search(n)
            if match:
                layer_idx = int(match.group(1))
                if 0 <= layer_idx < 6:
                    encoder_layer_params_lower.append(p)
                elif 6 <= layer_idx < 12:
                    encoder_layer_params_upper.append(p)
        elif 'bert.pooler' in n or 'classifier' in n:
            pooler_classifier_params.append(p)

optimizer_grouped_parameters = [
    {
        'params': embeddings_params,
        'lr': LEARNING_RATE * 0.1,
        'weight_decay': 0.01
    },
    {
        'params': encoder_layer_params_lower,
        'lr': LEARNING_RATE * 0.3,
        'weight_decay': 0.01
    },
    {
        'params': encoder_layer_params_upper,
        'lr': LEARNING_RATE * 0.6,
        'weight_decay': 0.01
    },
    {
        'params': pooler_classifier_params,
        'lr': LEARNING_RATE,
        'weight_decay': 0.01
    },
    {
        'params': no_decay_params,
        'lr': 0.0,
        'weight_decay': 0.0
    }
]

optimizer = AdamW(optimizer_grouped_parameters)
print("Optimizer with discriminative learning rates configured.")


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Datasets and DataLoaders created successfully with auxiliary sentence input. ✅
Number of samples in training dataset: 800
Number of samples in test dataset: 200

Verifying token_type_ids for a sample batch:
Input IDs shape: torch.Size([16, 256])
Attention Mask shape: torch.Size([16, 256])
Token Type IDs shape (should show 0s and 1s): torch.Size([16, 256])
First item's token_type_ids (should see 0s then 1s):
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERT4TC Model instantiated (ready to use sentence-pair input).
Optimizer with discriminative learning rates configured.


# Bert4TC Finetuning according to Paper 

In [None]:
import mlflow
import os # Make sure os is imported

# Define your total epochs
NUM_FREEZE_EPOCHS = 1
NUM_GRADUAL_UNFREEZE_EPOCHS = 0
TOTAL_EPOCHS = NUM_FREEZE_EPOCHS + NUM_GRADUAL_UNFREEZE_EPOCHS + 0

# Calculate total steps for the scheduler using the actual TOTAL_EPOCHS
total_steps = len(train_dataloader) * TOTAL_EPOCHS
warmup_steps = int(0.1 * total_steps)

# Initialize Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print("Learning rate scheduler configured.")


# --- Gradual Unfreezing Helper Functions (as defined previously) ---
def freeze_all_bert_layers(model):
    """Freezes all parameters within the BERT encoder layers."""
    for name, param in model.named_parameters():
        if "bert." in name: # Targets parameters belonging to the BERT base model
            param.requires_grad = False
    print("All BERT encoder layers frozen.")

def unfreeze_bert_layer_block(model, layer_num):
    """Unfreezes a specific BERT encoder layer block."""
    layer_prefix = f"bert.encoder.layer.{layer_num}."
    for name, param in model.named_parameters():
        if layer_prefix in name:
            param.requires_grad = True
    print(f"Unfrozen BERT layer {layer_num}.")

def unfreeze_bert_embeddings(model):
    """Unfreezes BERT's embedding layer."""
    for name, param in model.named_parameters():
        if "bert.embeddings." in name:
            param.requires_grad = True
    print("Unfrozen BERT embeddings.")

# --- Initial Freezing (Before the loop starts) ---
freeze_all_bert_layers(model)

# --- MLflow Setup ---
# Set the MLflow tracking URI (if not set by MLFLOW_TRACKING_URI environment variable)
# By default, it's 'mlruns' in your current directory or can be a remote server.
mlflow.set_tracking_uri("http://mlflow:5000") # CORRECTED LINE: Removed duplicate "http://"

# Ensure the directory for saving models exists
model_save_dir = "mlflow_models"
os.makedirs(model_save_dir, exist_ok=True)
model_save_path = os.path.join(model_save_dir, 'best_sentiment_model_final.pth')


# --- Start MLflow Run ---
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("num_freeze_epochs", NUM_FREEZE_EPOCHS)
    mlflow.log_param("num_gradual_unfreeze_epochs", NUM_GRADUAL_UNFREEZE_EPOCHS)
    mlflow.log_param("total_epochs", TOTAL_EPOCHS)
    mlflow.log_param("warmup_steps_ratio", 0.1) # Log the ratio used for warmup steps
    mlflow.log_param("initial_learning_rate", optimizer.param_groups[0]['lr']) # Log initial LR

    best_accuracy = 0.0

    # --- Training Loop without Checkpointing ---
    for epoch in range(TOTAL_EPOCHS):
        print(f"\n--- Epoch {epoch+1}/{TOTAL_EPOCHS} ---")
        model.train()
        total_train_loss = 0

        # Gradual Unfreezing Logic
        if epoch == 0:
            pass
        elif epoch == NUM_FREEZE_EPOCHS:
            unfreeze_bert_layer_block(model, 11)
        elif epoch == NUM_FREEZE_EPOCHS + 1:
            unfreeze_bert_layer_block(model, 10)
        elif epoch == NUM_FREEZE_EPOCHS + 2:
            unfreeze_bert_layer_block(model, 9)
            unfreeze_bert_embeddings(model)
            print("All specified BERT layers and embeddings unfrozen for subsequent epochs.")

        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step() # This will now be defined

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        mlflow.log_metric("avg_train_loss", avg_train_loss, step=epoch)

        # --- Evaluation Loop ---
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in tqdm(test_dataloader, desc=f"Evaluating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

            loss = outputs.loss
            logits = outputs.logits
            total_eval_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            total_eval_accuracy += (preds == labels).sum().item()

        avg_eval_loss = total_eval_loss / len(test_dataloader)
        avg_eval_accuracy = total_eval_accuracy / len(test_dataset)
        print(f"Average Validation Loss: {avg_eval_loss:.4f}")
        print(f"Validation Accuracy: {avg_eval_accuracy:.4f}")

        mlflow.log_metric("avg_eval_loss", avg_eval_loss, step=epoch)
        mlflow.log_metric("avg_eval_accuracy", avg_eval_accuracy, step=epoch)

        # --- Save the best model ---
        if avg_eval_accuracy > best_accuracy:
            best_accuracy = avg_eval_accuracy
            torch.save(model.state_dict(), model_save_path)
            print(f"New best model saved with accuracy: {best_accuracy:.4f} at {model_save_path}")
            # Log the best model as an artifact
            mlflow.log_artifact(model_save_path, "best_model")
            mlflow.set_tag("best_accuracy", f"{best_accuracy:.4f}") # Set a tag for easy filtering
        else:
            print(f"Validation accuracy did not improve. Best so far: {best_accuracy:.4f}")

    print("\nTraining complete.")
    print(f"Final best validation accuracy: {best_accuracy:.4f}")
    print(f"Best model saved at: {model_save_path}")

    # Log the final best accuracy as a metric at the end of the run
    mlflow.log_metric("final_best_accuracy", best_accuracy)

print("\nMLflow run completed.")

Learning rate scheduler configured.
All BERT encoder layers frozen.

--- Epoch 1/1 ---


Training Epoch 1:   0%|          | 0/50 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Average Training Loss: 0.8408


Evaluating Epoch 1:   0%|          | 0/13 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Average Validation Loss: 0.8652
Validation Accuracy: 0.5150
New best model saved with accuracy: 0.5150 at mlflow_models/best_sentiment_model_final.pth

Training complete.
Final best validation accuracy: 0.5150
Best model saved at: mlflow_models/best_sentiment_model_final.pth
🏃 View run nebulous-pig-609 at: http://mlflow:5000/#/experiments/0/runs/55b3577c2c194a1098e47085b7f8d984
🧪 View experiment at: http://mlflow:5000/#/experiments/0

MLflow run completed.


# Inference with Fine tuned bert Model

In [71]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import os
import re # For preprocessing functions

# --- Configuration (Must match training configuration) ---
MODEL_NAME = 'bert-base-uncased'
MAX_LENGTH = 256 # Make sure this matches your training MAX_LENGTH
NUM_LABELS = 2 # 0 for negative, 1 for positive
model_save_dir = "mlflow_models" # Directory where your best model is saved
model_path = os.path.join(model_save_dir, 'best_sentiment_model_final.pth')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Define your BERT4TCModel class (Exact copy from your training script) ---
class BERT4TCModel(torch.nn.Module):
    def __init__(self, model_name_or_path, num_labels=2):
        super().__init__()
        # If loading from a saved state_dict, you first need to load the base pre-trained model
        # and then load your fine-tuned weights onto it.
        self.bert = BertForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        return outputs

# --- Preprocessing functions (Exact copies from your data preparation) ---
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def remove_url(text):
    return re.sub(r'http\S+|www.\S+', '', text)

def remove_contractions(text):
    # This is a very basic example; a full implementation would be much larger
    # You should use the exact same dictionary you used during training
    contractions = {
        "ain't": "am not", "aren't": "are not", "can't": "cannot",
        "can't've": "cannot have", "'cause": "because", "could've": "could have",
        "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not",
        "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
        "he'll've": "he will have", "he's": "he is", "how'd": "how did",
        "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
        "I'll've": "I will have", "I'm": "I am", "I've": "I have",
        "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
        "it'll": "it will", "it'll've": "it will have", "it's": "it is",
        "let's": "let us", "ma'am": "madam", "mayn't": "may not",
        "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
        "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
        "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
        "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
        "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
        "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
        "she's": "she is", "should've": "should have", "shouldn't": "should not",
        "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
        "that'd": "that would", "that'd've": "that would have", "that's": "that is",
        "there'd": "there would", "there'd've": "there would have", "there's": "there is",
        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
        "we're": "we are", "we've": "we have", "weren't": "were not",
        "what'll": "what will", "what'll've": "what will have", "what're": "what are",
        "what's": "what is", "what've": "what have", "when's": "when is",
        "when've": "when have", "where'd": "where did", "where's": "where is",
        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
        "who's": "who is", "who've": "who have", "why's": "why is",
        "why've": "why have", "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
        "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
        "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
        "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
        "you'll've": "you will have", "you're": "you are", "you've": "you have"
    }
    for contraction, expansion in contractions.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, text, flags=re.IGNORECASE)
    return text

def get_auxiliary_sentence_inference(sentiment_label_str):
    """
    Generates the auxiliary sentence based on the sentiment for BERT4TC inference.
    """
    if sentiment_label_str == "positive":
        return "This review expresses positive sentiment."
    elif sentiment_label_str == "negative":
        return "This review expresses negative sentiment."
    else:
        # For BERT4TC, you typically only have positive/negative auxiliary sentences.
        # If your model supports neutral, you'd add it here.
        # For IMDb, it's usually binary.
        raise ValueError("Invalid sentiment label string for auxiliary sentence generation.")

# --- Load the saved tokenizer and model ---
print(f"Loading tokenizer from {MODEL_NAME} and model from {model_path}...")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
loaded_model = BERT4TCModel(MODEL_NAME, num_labels=NUM_LABELS).to(device)

# Load the state_dict into your model
if os.path.exists(model_path):
    loaded_model.load_state_dict(torch.load(model_path, map_location=device))
    print("Model weights loaded successfully.")
else:
    print(f"Error: Model weights not found at {model_path}. Please ensure training was completed and model saved.")
    exit() # Exit if model weights are not found

loaded_model.eval() # Set the model to evaluation mode
print("Model loaded and set to evaluation mode. Ready for inference.")


# --- Inference Function ---
def predict_sentiment(review_text, model, tokenizer, max_length, device):
    """
    Predicts the sentiment of a movie review using the fine-tuned BERT4TC model.
    It applies the BERT4TC inference strategy by comparing probabilities
    from positive and negative auxiliary sentences.
    """
    model.eval() # Ensure model is in evaluation mode

    # Preprocess the input review text just like during training
    processed_text = review_text.lower()
    processed_text = remove_html_tags(processed_text)
    processed_text = remove_url(processed_text)
    processed_text = remove_contractions(processed_text)

    # --- BERT4TC Inference Strategy ---
    # We will get predictions for the review paired with a "positive" auxiliary sentence
    # and separately for the review paired with a "negative" auxiliary sentence.
    # The final prediction is based on which pairing yields a higher probability
    # for the corresponding sentiment label.

    # 1. Evaluate with "positive" auxiliary sentence
    aux_pos = get_auxiliary_sentence_inference("positive")
    encoded_pos = tokenizer(
        processed_text,
        aux_pos, # Second sequence is the auxiliary sentence
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs_pos = model(
            input_ids=encoded_pos['input_ids'],
            attention_mask=encoded_pos['attention_mask'],
            token_type_ids=encoded_pos['token_type_ids']
        )
        logits_pos = outputs_pos.logits
        # Get probability for the 'positive' class (label 1) when paired with 'positive' aux
        # Assuming label 1 maps to positive
        prob_review_is_positive_given_positive_aux = F.softmax(logits_pos, dim=1)[0][1].item()

    # 2. Evaluate with "negative" auxiliary sentence
    aux_neg = get_auxiliary_sentence_inference("negative")
    encoded_neg = tokenizer(
        processed_text,
        aux_neg, # Second sequence is the auxiliary sentence
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs_neg = model(
            input_ids=encoded_neg['input_ids'],
            attention_mask=encoded_neg['attention_mask'],
            token_type_ids=encoded_neg['token_type_ids']
        )
        logits_neg = outputs_neg.logits
        # Get probability for the 'negative' class (label 0) when paired with 'negative' aux
        # Assuming label 0 maps to negative
        prob_review_is_negative_given_negative_aux = F.softmax(logits_neg, dim=1)[0][0].item()

    # 3. Compare the probabilities to make the final decision
    if prob_review_is_positive_given_positive_aux > prob_review_is_negative_given_negative_aux:
        return "positive", prob_review_is_positive_given_positive_aux
    else:
        return "negative", prob_review_is_negative_given_negative_aux

# --- Test your model with example reviews ---
print("\n--- Performing Inference on Example Reviews ---")

review1 = "This movie was absolutely brilliant! A masterpiece from start to finish. I loved every single moment."
sentiment1, confidence1 = predict_sentiment(review1, loaded_model, tokenizer, MAX_LENGTH, device)
print(f"Review: '{review1}'")
print(f"Predicted Sentiment: {sentiment1} (Confidence: {confidence1:.4f})\n")

review2 = "Worst film I've seen all year. The plot made no sense, and the acting was terrible. Don't waste your money."
sentiment2, confidence2 = predict_sentiment(review2, loaded_model, tokenizer, MAX_LENGTH, device)
print(f"Review: '{review2}'")
print(f"Predicted Sentiment: {sentiment2} (Confidence: {confidence2:.4f})\n")

review3 = "It was okay, nothing special. I wouldn't watch it again but it wasn't awful."
sentiment3, confidence3 = predict_sentiment(review3, loaded_model, tokenizer, MAX_LENGTH, device)
print(f"Review: '{review3}'")
print(f"Predicted Sentiment: {sentiment3} (Confidence: {confidence3:.4f})\n")

review4 = "The movie had some good parts, but overall it was a bit disappointing. The ending felt rushed."
sentiment4, confidence4 = predict_sentiment(review4, loaded_model, tokenizer, MAX_LENGTH, device)
print(f"Review: '{review4}'")
print(f"Predicted Sentiment: {sentiment4} (Confidence: {confidence4:.4f})\n")

Loading tokenizer from bert-base-uncased and model from mlflow_models/best_sentiment_model_final.pth...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model weights loaded successfully.
Model loaded and set to evaluation mode. Ready for inference.

--- Performing Inference on Example Reviews ---
Review: 'This movie was absolutely brilliant! A masterpiece from start to finish. I loved every single moment.'
Predicted Sentiment: negative (Confidence: 0.7372)

Review: 'Worst film I've seen all year. The plot made no sense, and the acting was terrible. Don't waste your money.'
Predicted Sentiment: negative (Confidence: 0.8038)

Review: 'It was okay, nothing special. I wouldn't watch it again but it wasn't awful.'
Predicted Sentiment: negative (Confidence: 0.7756)

Review: 'The movie had some good parts, but overall it was a bit disappointing. The ending felt rushed.'
Predicted Sentiment: negative (Confidence: 0.8118)



# Assignment 3
My groupmate Iqra Rathore (SP25-Rai-018) and I, Sumair Javed (SP25-Rai-019), worked on a project to teach a smart computer program called BERT how to figure out if movie reviews from the IMDb website are good (positive) or bad (negative). We did this by fine-tuning BERT. The fine-tuning techniques employed were standard fine-tuning and the BERT4TC method with gradual unfreezing.

This project took a lot of time because we faced a big problem early on: our first IMDb dataset was wrong. This made our program overfit, meaning it became too good at remembering our training examples but couldn't guess correctly on new reviews. It was tough to find the right dataset, and fixing this took quite a while before we could even properly train the program.

We tried two main ways to fine-tune BERT:

## Our Regular Way:
First, we cleaned up the movie reviews by removing things like internet links and fixing shortened words. Then, we just fine-tuned BERT directly on these clean reviews.

## The Paper's Way (BERT4TC):
We also tried a more advanced method from a research paper. This involved adding a special "helper sentence" to each movie review. For example, if a review was positive, we'd add, "This review shows good feelings." Then, we fine-tuned BERT using both the review and this helper sentence together. This helped BERT understand the sentiment better.

However, training the Paper's Way was very slow. Even one round of training (called an epoch) took about two hours. Because we could only run it for one epoch, our model either didn't learn enough or it became too focused on guessing "negative" every time. This showed us that training for too short a time meant the model wasn't properly fine-tuned.

For both methods, we made sure to prepare the data carefully so BERT could learn as well as possible, despite the early challenges and the long training times.