In [None]:
import pandas as pd
import numpy as np # pandas uses numpy, so it's good to import

print("Load and Clean Data")

# Path to the training CSV, relative to the notebook's working directory.
file_path = 'TRAIN_RELEASE_3SEP2025/train_subtask2a.csv'

# The two regression targets. A row is unusable if either one is missing.
TARGET_COLUMNS = ['state_change_valence', 'state_change_arousal']

try:
    # 1. Load the dataset
    df = pd.read_csv(file_path)
    print(f"Successfully loaded '{file_path}'")
    print(f"Original number of rows: {len(df)}")

    # 2. Report missing values for BOTH target columns, since both are
    #    used as the drop criterion below.
    valence_nulls = df['state_change_valence'].isnull().sum()
    arousal_nulls = df['state_change_arousal'].isnull().sum()
    print(f"Rows with missing 'state_change_valence': {valence_nulls}")
    print(f"Rows with missing 'state_change_arousal': {arousal_nulls}")

    # 3. Remove rows where *either* target is null.
    #    .copy() makes df_clean an independent frame, so adding columns to it
    #    in later cells never writes through to (or warns about) 'df'.
    df_clean = df.dropna(subset=TARGET_COLUMNS).copy()

    print(f"Number of rows after removing missing answers: {len(df_clean)}")

    # Peek at the first 5 rows of the cleaned data
    print("\n--- First 5 rows of CLEANED data ---")
    print(df_clean.head())

    # IMPORTANT: From now on, only the 'df_clean' DataFrame is used.

except FileNotFoundError:
    print(f"ERROR: The file '{file_path}' was not found.")
    # The path includes a sub-folder, so point the reader at the full path
    # rather than at "the same folder as your script".
    print("Please make sure it exists at that path, relative to the notebook's working directory.")
except Exception as e:
    print(f"An error occurred: {e}")

--- Step 1: Load and Clean Data ---
Successfully loaded 'TRAIN_RELEASE_3SEP2025/train_subtask2a.csv'
Original number of rows: 2764
Rows with missing 'state_change_valence': 137
Number of rows after removing missing answers: 2627

--- First 5 rows of CLEANED data ---
   user_id  text_id                                               text  \
0        1      200  I feel good .   I caught up on some sleep . Wo...   
1        1      201  I’ve been feeling good for days and days . I r...   
2        1      202  I’ve been feeling fine personally . I’ve been ...   
3        1      203  I feel great . I’ve had a day off . I’m going ...   
5        2       23       Productive , Tired , Active , Pleased , Busy   

             timestamp  collection_phase  is_words  valence  arousal  \
0  2021-06-09 12:41:57                 1     False      2.0      1.0   
1  2021-06-11 12:01:45                 1     False      2.0      1.0   
2  2021-06-13 13:15:07                 1     False      0.0      1.0   


In [None]:
import re # Import the regular expressions library
import warnings

# Silence pandas' SettingWithCopyWarning where that warning class exists.
# NOTE(review): newer pandas releases appear to have removed this class —
# confirm against the installed version. Looking it up with getattr keeps
# this cell runnable either way instead of raising AttributeError.
_setting_with_copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings('ignore', category=_setting_with_copy_warning)

print("Cleaning the Text")

# This 'try' block will only run if 'df_clean' exists from Step 1
try:
    
    # Define a function to clean our text
    def clean_text(text):
        """Normalise one text entry to lowercase a-z letters and single spaces.

        Args:
            text: Raw entry. ``None`` and float NaN (how pandas represents a
                missing cell) are treated as empty; any other non-string
                value is converted with ``str()`` first.

        Returns:
            str: The lowercased text with every character other than ``a-z``
            and whitespace deleted, whitespace runs collapsed to one space,
            and leading/trailing whitespace stripped.
        """
        # A missing cell would otherwise crash on .lower() (floats have no
        # such method). NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if not isinstance(text, str):
            text = str(text)

        # 1. Lowercase
        lowered = text.lower()

        # 2. Delete anything that is not a-z or whitespace. Characters are
        #    deleted, not replaced by a space, so "I’ve" becomes "ive".
        letters_only = re.sub(r'[^a-z\s]', '', lowered)

        # 3. Collapse whitespace runs and trim the ends
        return re.sub(r'\s+', ' ', letters_only).strip()
    
    # --- Apply the cleaning function to the whole 'text' column ---
    # .assign() builds a NEW frame with the extra column instead of writing
    # into a frame that may be a view of 'df'. That avoids pandas'
    # SettingWithCopyWarning at the source and keeps this cell idempotent:
    # re-running it simply recomputes 'text_clean'.
    print("Applying cleaning to all rows...")
    df_clean = df_clean.assign(text_clean=df_clean['text'].apply(clean_text))
    
    # --- Show the original and cleaned text side by side ---
    print("--- 'text' vs 'text_clean' in our DataFrame ---")
    print(df_clean[['text', 'text_clean']].head())
    print("\nStep 2 Complete. We now have a 'text_clean' column.")


except NameError:
    print("\nERROR: The 'df_clean' DataFrame was not found.")
    print("Please make sure you ran Baby Step 1 successfully.")
except Exception as e:
    print(f"An error occurred: {e}")


--- Baby Step 2: Cleaning the Text ---
Applying cleaning to all rows...
--- 'text' vs 'text_clean' in our DataFrame ---
                                                text  \
0  I feel good .   I caught up on some sleep . Wo...   
1  I’ve been feeling good for days and days . I r...   
2  I’ve been feeling fine personally . I’ve been ...   
3  I feel great . I’ve had a day off . I’m going ...   
5       Productive , Tired , Active , Pleased , Busy   

                                          text_clean  
0  i feel good i caught up on some sleep work wen...  
1  ive been feeling good for days and days i real...  
2  ive been feeling fine personally ive been tryi...  
3  i feel great ive had a day off im going to go ...  
5               productive tired active pleased busy  

Step 2 Complete. We now have a 'text_clean' column.


In [None]:
from sklearn.model_selection import train_test_split # Import the splitter

print("Splitting into Train/Validation")

# Everything below needs 'df_clean' (including its 'text_clean' column)
# from the earlier cells; a missing name is reported by the handler at the end.
try:

    # Model input: the cleaned text only.
    X = df_clean['text_clean']

    # Regression targets: both 'state_change' columns, kept as one DataFrame.
    y = df_clean[['state_change_valence', 'state_change_arousal']]

    # 80/20 split with a fixed seed for reproducibility. train_test_split
    # returns its pieces in the order X_train, X_val, y_train, y_val; each
    # piece is renumbered 0..n-1 so positional and label indexing agree.
    X_train, X_val, y_train, y_val = (
        piece.reset_index(drop=True)
        for piece in train_test_split(X, y, test_size=0.2, random_state=42)
    )

    # Report the resulting sizes.
    print(f"Total rows in our clean data:    {len(df_clean)}")
    print(f"Rows in our new 'Training' set:  {len(X_train)}")
    print(f"Rows in our new 'Validation' set: {len(X_val)}")

    # Show the first training example and its targets.
    print("\n--- Example of a 'Training' text (index 0) ---")
    print(X_train.iloc[0])

    print("\n--- Example of a 'Training' answer (index 0) ---")
    print(y_train.iloc[0])

    print("\nStep 3 Complete. We now have X_train, y_train, X_val, y_val.")

except NameError:
    print("\nERROR: The 'df_clean' DataFrame was not found.")
    print("Please make sure you ran Baby Step 1 & 2 successfully.")
except Exception as e:
    print(f"An error occurred: {e}")


--- Baby Step 3: Splitting into Train/Validation ---
Total rows in our clean data:    2627
Rows in our new 'Training' set:  2101
Rows in our new 'Validation' set: 526

--- Example of a 'Training' text (index 0) ---
i am currently feeling tired i woke up very early for my shift at the hospital and did not have a good night s sleep i am looking forward to see my family tonight in order to celebrate my birthday i am happy they live nearby and will be able to visit

--- Example of a 'Training' answer (index 0) ---
state_change_valence    0.0
state_change_arousal    0.0
Name: 0, dtype: float64

Step 3 Complete. We now have X_train, y_train, X_val, y_val.


In [None]:
from transformers import AutoTokenizer

print("Load the Tokenizer")

# Hugging Face checkpoint name. Defined at notebook level because later
# cells (model construction, final report) read MODEL_NAME as well.
MODEL_NAME = 'distilbert-base-uncased'

try:
    # 1. Load the tokenizer that matches MODEL_NAME
    print(f"Loading tokenizer for '{MODEL_NAME}'...")
    
    # NOTE(review): use_safetensors is a model-weights loading option;
    # it presumably has no effect on a tokenizer — confirm and drop if so.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_safetensors=True)
    
    print("Tokenizer loaded successfully.")
    
    # 2. --- Smoke-test the tokenizer on one short sentence ---
    test_text = "i feel great today" # A simple, already-clean example
    
    print(f"\n--- Testing tokenizer on: '{test_text}' ---")
    
    # Calling the tokenizer returns a mapping; its 'input_ids' entry holds
    # the integer IDs. tokenizer.tokenize() (used below) returns the token
    # strings for the same text.
    tokenized_output = tokenizer(test_text)
    
    print(f"Text: {test_text}")
    print(f"Tokens: {tokenizer.tokenize(test_text)}")
    print(f"Input IDs: {tokenized_output['input_ids']}")
    
    print("\nStep 4 (Advanced 1) Complete. We now have our 'tokenizer'.")

# NOTE(review): the 'transformers' import sits above this try block, so a
# missing package fails there and likely never reaches this handler.
except ImportError:
    print("\nERROR: 'transformers' library not found.")
    print("Please make sure you have it installed: pip install transformers")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 1: Load the Tokenizer ---
Loading tokenizer for 'distilbert-base-uncased'...
Tokenizer loaded successfully.

--- Testing tokenizer on: 'i feel great today' ---
Text: i feel great today
Tokens: ['i', 'feel', 'great', 'today']
Input IDs: [101, 1045, 2514, 2307, 2651, 102]

Step 4 (Advanced 1) Complete. We now have our 'tokenizer'.


In [None]:
import torch
from torch.utils.data import Dataset

print("Creating the *Simple* PyTorch Dataset")

# This 'try' block will only run if our previous steps exist
try:
    
    # 1. DEFINE OUR SIMPLE DATASET CLASS
    
    class EmotionDataset(Dataset):
        """Map-style dataset pairing one text with its row of regression targets.

        Args:
            texts: pandas Series of strings, read positionally via ``.iloc``.
            labels: pandas DataFrame whose rows hold the target values, read
                positionally via ``.iloc``; row ``i`` belongs to ``texts`` item ``i``.
            tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
                truncation=True, max_length=max_len, return_tensors='pt')``.
                It is expected to return a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors shaped ``[1, max_len]``.
            max_len: Length every sequence is padded/truncated to (default 128).
        """

        def __init__(self, texts, labels, tokenizer, max_len=128):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            """Return the number of examples (one per text)."""
            return len(self.texts)

        def __getitem__(self, idx):
            """Tokenize example ``idx`` and return it with its labels.

            Returns:
                dict: ``'input_ids'`` and ``'attention_mask'`` tensors with the
                tokenizer's leading batch axis removed, and ``'labels'`` as a
                float32 tensor built from the label row.
            """
            # Positional lookups: valid for any index, not only a 0..n-1 one.
            text = self.texts.iloc[idx]

            # .values turns the DataFrame row into a numpy array of targets.
            label = self.labels.iloc[idx].values

            encoding = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='pt'
            )

            return {
                # squeeze(0) drops ONLY the leading batch axis ([1, L] -> [L]).
                # A bare squeeze() would also drop the sequence axis when
                # max_len == 1, yielding a 0-d tensor that cannot be batched
                # into the expected [batch, L] shape.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.float32)
            }

    
    # 2. CREATE THE DATASET OBJECTS
    # Inputs are the X_train / y_train / X_val / y_val splits and the
    # 'tokenizer' produced by earlier cells. max_len is not passed, so the
    # class default (128) applies to both datasets.
    
    print("Creating the training dataset...")
    train_dataset = EmotionDataset(
        texts=X_train,
        labels=y_train,
        tokenizer=tokenizer
    )
    
    print("Creating the validation dataset...")
    val_dataset = EmotionDataset(
        texts=X_val,
        labels=y_val,
        tokenizer=tokenizer
    )
    
    print("Datasets created successfully.")
    
    # 3. --- Smoke test: fetch one item and inspect its keys and shapes ---
    print("\n--- Grabbing one item (index 0) from train_dataset ---")
    item = train_dataset[0]
    
    print("Item's dictionary keys:")
    print(item.keys())
    print(f"\nShape of 'input_ids': {item['input_ids'].shape}")
    print(f"Shape of 'labels':    {item['labels'].shape}")
    
    print("\nStep 5 (Advanced 2) Complete. We have 'train_dataset' and 'val_dataset'.")


except NameError as e:
    print(f"\nERROR: A variable was not found. Did you run all previous steps? {e}")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 2: Creating the *Simple* PyTorch Dataset ---
Creating the training dataset...
Creating the validation dataset...
Datasets created successfully.

--- Grabbing one item (index 0) from train_dataset ---
Item's dictionary keys:
dict_keys(['input_ids', 'attention_mask', 'labels'])

Shape of 'input_ids': torch.Size([128])
Shape of 'labels':    torch.Size([2])

Step 5 (Advanced 2) Complete. We have 'train_dataset' and 'val_dataset'.


In [None]:
from torch.utils.data import DataLoader

print("Creating the DataLoaders")

# Needs 'train_dataset' and 'val_dataset' from the previous cell; a missing
# name is reported by the NameError handler at the end.
try:
    
    # 1. DEFINE A BATCH SIZE
    # Number of items handed to the model per step. This is a tunable
    # hyperparameter; both loaders below share it.
    BATCH_SIZE = 16
    
    
    # 2. CREATE THE TRAIN DATALOADER
    print(f"Creating train_loader with batch_size={BATCH_SIZE} and shuffling...")
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True  # Training batches are drawn in shuffled order
    )
    
    
    # 3. CREATE THE VALIDATION DATALOADER
    print(f"Creating val_loader with batch_size={BATCH_SIZE}...")
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False # Fixed order: validation is only evaluated, never trained on
    )
    
    
    print("\nDataLoaders are created!")
    
    
    # 4. --- Smoke test: pull a single batch and inspect its shapes ---
    print("\n--- Grabbing one 'batch' from the train_loader ---")
    
    # iter() builds a fresh iterator over the loader; next() takes its first batch
    data_batch = next(iter(train_loader))
    
    # Each entry now carries a leading batch axis in front of the per-item shape
    print(f"Shape of 'input_ids' batch: {data_batch['input_ids'].shape}")
    print(f"Shape of 'attention_mask' batch: {data_batch['attention_mask'].shape}")
    print(f"Shape of 'labels' batch:    {data_batch['labels'].shape}")
    
    print(f"\nThis means the batch has {data_batch['input_ids'].shape[0]} items (our batch size),")
    print(f"and each item has {data_batch['input_ids'].shape[1]} token IDs.")
    
    print("\nStep 6 (Advanced 3) Complete. We have 'train_loader' and 'val_loader'.")
    

except NameError as e:
    print(f"\nERROR: A variable was not found. Did you run all previous steps? {e}")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 3: Creating the DataLoaders ---
Creating train_loader with batch_size=16 and shuffling...
Creating val_loader with batch_size=16...

DataLoaders are created!

--- Grabbing one 'batch' from the train_loader ---
Shape of 'input_ids' batch: torch.Size([16, 128])
Shape of 'attention_mask' batch: torch.Size([16, 128])
Shape of 'labels' batch:    torch.Size([16, 2])

This means the batch has 16 items (our batch size),
and each item has 128 token IDs.

Step 6 (Advanced 3) Complete. We have 'train_loader' and 'val_loader'.


In [None]:
from transformers import AutoModel
import torch.nn as nn

print("Defining the *Simple* AI Model")

# This 'try' block will only run if our previous steps exist
try:
    
    # 1. DEFINE OUR SIMPLE MODEL CLASS
    # We inherit from torch.nn.Module, the base class for all PyTorch models
    
    class EmotionRegressorModel(nn.Module):
        """Pre-trained transformer encoder topped with one linear regression head.

        Args:
            model_name: Checkpoint identifier passed to
                ``AutoModel.from_pretrained``.
            num_outputs: Number of values predicted per example. Defaults to
                2 (valence and arousal), matching the original behaviour.
        """

        def __init__(self, model_name, num_outputs=2):
            super().__init__()

            # --- Part 1: the "body" ---
            # Pre-trained encoder loaded from the checkpoint.
            self.bert_body = AutoModel.from_pretrained(model_name, use_safetensors=True)

            # --- Part 2: the "head" ---
            # Linear map from the encoder's hidden size (read from its config
            # rather than hard-coded) down to the requested number of outputs.
            self.regression_head = nn.Linear(
                self.bert_body.config.hidden_size,
                num_outputs
            )

        def forward(self, input_ids, attention_mask):
            """Return one prediction row per input sequence.

            Args:
                input_ids: Token-ID tensor forwarded to the encoder.
                attention_mask: Mask tensor forwarded to the encoder.

            Returns:
                Tensor with ``num_outputs`` values per sequence in the batch.
            """
            # 1. Run the encoder to obtain the per-token hidden states.
            outputs = self.bert_body(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # 2. Take the hidden state at position 0 of every sequence (the
            #    [CLS] slot for BERT-style tokenizers) as the sentence vector.
            cls_output = outputs.last_hidden_state[:, 0]

            # 3. Project that vector through the regression head.
            return self.regression_head(cls_output)

    
    # 2. PICK THE COMPUTE DEVICE
    # Use CUDA when PyTorch reports it as available, otherwise fall back to
    # the CPU. Later cells move every batch to this same 'device'.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    
    # 3. CREATE AN INSTANCE OF OUR MODEL
    print("Creating an instance of EmotionRegressorModel...")
    # Loads the pre-trained weights for MODEL_NAME (defined in the tokenizer
    # cell). Presumably this downloads the checkpoint on first use.
    advanced_model = EmotionRegressorModel(model_name=MODEL_NAME)
    
    # 4. MOVE THE MODEL TO THE SELECTED DEVICE
    advanced_model.to(device)
    
    print("\n--- MODEL CREATED SUCCESSFULLY! ---")
    print(f"The 'regression_head' now expects {advanced_model.bert_body.config.hidden_size} input features.")
    
    print("\nStep 7 (Advanced 4) Complete. We have 'advanced_model' and 'device'.")
    

except NameError as e:
    print(f"\nERROR: A variable was not found. Did you run all previous steps? {e}")
except ImportError:
    print("\nERROR: 'transformers' or 'torch' library not found.")
    print("Please make sure you have it installed: pip install transformers torch")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 4: Defining the *Simple* AI Model ---
Using device: cuda
Creating an instance of EmotionRegressorModel...

--- MODEL CREATED SUCCESSFULLY! ---
The 'regression_head' now expects 768 input features.

Step 7 (Advanced 4) Complete. We have 'advanced_model' and 'device'.


In [None]:
from torch.optim import AdamW
import torch.nn as nn # We need this for the loss function

print("Defining Loss and Optimizer")

# Needs 'advanced_model' from the model cell; a missing name is reported by
# the NameError handler at the end.
try:
    
    # -----------------------------------------------------------------
    # --- TUNING KNOBS: hyperparameters read by the training cell ---
    # -----------------------------------------------------------------
    LEARNING_RATE = 1e-5  # Step size handed to AdamW (0.00001)
    NUM_EPOCHS = 3        # Number of full passes over the training loader
    # -----------------------------------------------------------------
    
    
    # 1. LOSS FUNCTION
    # Mean squared error between predictions and targets.
    loss_function = nn.MSELoss()
    
    
    # 2. OPTIMIZER
    # AdamW receives every parameter of advanced_model (encoder body and
    # regression head alike), all with the same learning rate.
    optimizer = AdamW(
        advanced_model.parameters(),
        lr=LEARNING_RATE
    )
    
    print("Loss function (MSE) and Optimizer (AdamW) are created.")
    print(f"We will train for {NUM_EPOCHS} epochs.")
    print(f"Our learning rate is {LEARNING_RATE}.")
    
    print("\nStep 8 (Advanced 5) Complete. We are 100% ready to train.")

except NameError as e:
    print(f"\nERROR: A variable was not found. Did you run all previous steps? {e}")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 5: Defining Loss and Optimizer ---
Loss function (MSE) and Optimizer (AdamW) are created.
We will train for 3 epochs.
Our learning rate is 1e-05.

Step 8 (Advanced 5) Complete. We are 100% ready to train.


In [None]:
import numpy as np
from tqdm import tqdm # For our nice progress bars!
from scipy.stats import pearsonr # For our final score!

print("The Training & Evaluation Loop")

# Needs advanced_model, device, train_loader, val_loader, loss_function,
# optimizer, NUM_EPOCHS, LEARNING_RATE and MODEL_NAME from earlier cells.
try:
    
    # Best-epoch bookkeeping. The weights are tracked alongside the
    # predictions so that the model left in memory is the one being scored.
    best_val_loss = float('inf')
    best_epoch = None
    best_state_dict = None
    best_epoch_preds = []
    best_epoch_labels = []
    
    # 1. --- THE MAIN LOOP ---
    for epoch in range(NUM_EPOCHS):
        
        print(f"\n--- Epoch {epoch + 1} / {NUM_EPOCHS} ---")
        
        # --- TRAINING PHASE ---
        advanced_model.train()
        total_train_loss = 0
        for batch in tqdm(train_loader, desc="  Training Batches"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            predictions = advanced_model(input_ids, attention_mask)
            loss = loss_function(predictions, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            
        # Mean of the per-batch losses (a smaller final batch counts equally).
        avg_train_loss = total_train_loss / len(train_loader)

        
        # --- VALIDATION PHASE ---
        advanced_model.eval()
        total_val_loss = 0
        current_preds = []
        current_labels = []
        
        # No gradients are needed while evaluating.
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="  Validation Batches"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                predictions = advanced_model(input_ids, attention_mask)
                loss = loss_function(predictions, labels)
                total_val_loss += loss.item()
                
                # Move to CPU numpy so the batches can be stacked for scoring.
                current_preds.append(predictions.cpu().numpy())
                current_labels.append(labels.cpu().numpy())
                
        avg_val_loss = total_val_loss / len(val_loader)
        
        print(f"\nEpoch {epoch+1} Complete:")
        print(f"  Average Training Loss: {avg_train_loss:.4f}")
        print(f"  Average Validation Loss: {avg_val_loss:.4f}")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch + 1
            print("  (This is the new best validation loss so far!)")
            # Keep this epoch's predictions for the final score...
            best_epoch_preds = current_preds
            best_epoch_labels = current_labels
            # ...and a snapshot of its weights. state_dict() hands back live
            # references that later optimizer steps overwrite, so clone them.
            best_state_dict = {
                name: tensor.detach().clone()
                for name, tensor in advanced_model.state_dict().items()
            }
            
    # 2. --- FINAL EVALUATION (AFTER ALL EPOCHS) ---
    print("\n--- Training Complete! ---")
    
    # Without this guard np.vstack fails on an empty list with an opaque
    # message (e.g. NUM_EPOCHS == 0, or every validation loss was NaN).
    if best_state_dict is None:
        raise RuntimeError("No epoch produced a usable validation loss; nothing to evaluate.")
    
    # Restore the best-epoch weights so 'advanced_model' matches the scores
    # reported below, rather than whatever the last epoch left behind.
    advanced_model.load_state_dict(best_state_dict)
    print(f"Restored model weights from the best epoch ({best_epoch}).")
    
    print("Calculating final scores using the BEST epoch's results...")
    
    # 'vstack' stacks the saved per-batch arrays into one array per quantity
    final_preds = np.vstack(best_epoch_preds)
    final_labels = np.vstack(best_epoch_labels)
    
    # Column 0 is valence, column 1 is arousal (the label column order).
    real_valence = final_labels[:, 0]
    real_arousal = final_labels[:, 1]
    pred_valence = final_preds[:, 0]
    pred_arousal = final_preds[:, 1]
    
    # 3. CALCULATE PEARSON'S R (first element of pearsonr's result)
    r_valence = pearsonr(real_valence, pred_valence)[0]
    r_arousal = pearsonr(real_arousal, pred_arousal)[0]
    
    print("\n--- FINAL ADVANCED MODEL RESULTS ---")
    print(f"  (Model: {MODEL_NAME}, LR: {LEARNING_RATE}, Epochs: {NUM_EPOCHS})")
    print("------------------------------------------")
    print("--- Pearson's Correlation (r) ---")
    print(f"  Valence: {r_valence:.4f}")
    print(f"  Arousal: {r_arousal:.4f}")
    
    print("The variable 'advanced_model' in memory is now fully trained.")


except NameError as e:
    print(f"\nERROR: A variable was not found. Did you run all previous steps? {e}")
except Exception as e:
    print(f"An error occurred: {e}")


--- Advanced Step 6: The Training & Evaluation Loop (FIXED) ---
(This cell will *only* train the model. Saving is in the next cell.)

--- Epoch 1 / 3 ---


  Training Batches: 100%|██████████| 132/132 [00:06<00:00, 20.03it/s]
  Validation Batches: 100%|██████████| 33/33 [00:00<00:00, 67.58it/s]



Epoch 1 Complete:
  Average Training Loss: 1.4959
  Average Validation Loss: 1.4978
  (This is the new best validation loss so far!)

--- Epoch 2 / 3 ---


  Training Batches: 100%|██████████| 132/132 [00:06<00:00, 20.05it/s]
  Validation Batches: 100%|██████████| 33/33 [00:00<00:00, 68.50it/s]



Epoch 2 Complete:
  Average Training Loss: 1.3587
  Average Validation Loss: 1.4366
  (This is the new best validation loss so far!)

--- Epoch 3 / 3 ---


  Training Batches: 100%|██████████| 132/132 [00:06<00:00, 20.09it/s]
  Validation Batches: 100%|██████████| 33/33 [00:00<00:00, 68.72it/s]


Epoch 3 Complete:
  Average Training Loss: 1.2517
  Average Validation Loss: 1.4444

--- Training Complete! ---
Calculating final scores using the BEST epoch's results...

--- FINAL ADVANCED MODEL RESULTS ---
  (Model: distilbert-base-uncased, LR: 1e-05, Epochs: 3)
------------------------------------------
--- Pearson's Correlation (r) ---
  Valence: 0.3079
  Arousal: 0.3223

Step 9 (Advanced 6) Complete.
The variable 'advanced_model' in memory is now fully trained.



