In [1]:
!pip install transformers torch pandas numpy sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [21]:
# --- Step 0: Initial Setup and Library Installation ---
# This part ensures you have all necessary libraries in your Colab environment.

# If you encounter issues with transformers version or dependencies,
# you might need to restart the runtime after installation.
# For example, after installing accelerate, you might get a prompt to restart.
!pip install -qqq transformers datasets evaluate accelerate scikit-learn pandas numpy torch
!pip install -qqq sentencepiece # DeBERTa-v3 uses SentencePiece tokenizer


import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import average_precision_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import logging as hf_logging
import gc

# Suppress Hugging Face warnings for cleaner output
hf_logging.set_verbosity_error()

print("Libraries installed and imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Clear GPU cache if any to free up VRAM
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hLibraries installed and imported successfully!
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device name: Tesla T4
Using device: cuda
CUDA cache cleared.


In [22]:
# --- Step 1: Load and Inspect Data ---

# Load the datasets
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    sample_submission_df = pd.read_csv('sample_submission.csv')
    print("train.csv, test.csv, and sample_submission.csv loaded successfully!")
except FileNotFoundError:
    print("Ensure train.csv, test.csv, and sample_submission.csv are uploaded to your Colab environment.")
    print("You can drag and drop them into the 'Files' section on the left sidebar.")
    # Exit or handle error appropriately
    exit() # Exiting for now if files are not found

print("\n--- Train Data Info ---")
train_df.info()
print("\n--- Test Data Info ---")
test_df.info()

print("\n--- First 5 rows of Train Data ---")
print(train_df.head())
print("\n--- First 5 rows of Test Data ---")
print(test_df.head())

print("\n--- Unique Categories and Misconceptions in Train Data ---")
print("Categories:", train_df['Category'].unique())
print("Misconceptions:", train_df['Misconception'].unique())

train.csv, test.csv, and sample_submission.csv loaded successfully!

--- Train Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36696 entries, 0 to 36695
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              36696 non-null  int64 
 1   QuestionId          36696 non-null  int64 
 2   QuestionText        36696 non-null  object
 3   MC_Answer           36696 non-null  object
 4   StudentExplanation  36696 non-null  object
 5   Category            36696 non-null  object
 6   Misconception       9860 non-null   object
dtypes: int64(2), object(5)
memory usage: 2.0+ MB

--- Test Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              3 non-null      int64 
 1   QuestionId          3 non-null      int64 
 

In [24]:
# --- Step 2: Prepare Target Labels (Corrected) ---

# Ensure Category and Misconception are strings and handle potential NaNs
train_df['Category'] = train_df['Category'].fillna('').astype(str)
train_df['Misconception'] = train_df['Misconception'].fillna('').astype(str)

# Create combined Category:Misconception labels for training data
# Now, all values in 'Category' and 'Misconception' are guaranteed to be strings.
train_df['Target'] = train_df['Category'] + ':' + train_df['Misconception']

# Get all unique possible Category:Misconception labels from the training data
all_possible_labels = sorted(train_df['Target'].unique().tolist())
print(f"\nTotal unique target labels: {len(all_possible_labels)}")
# print("All possible labels:", all_possible_labels) # Uncomment to see all labels

# Map labels to integers and vice versa
label_to_id = {label: i for i, label in enumerate(all_possible_labels)}
id_to_label = {i: label for i, label in enumerate(all_possible_labels)}

# Convert target labels in train_df to one-hot encoded format
# MultiLabelBinarizer is perfect for this, even if initially we have single labels
# It prepares for a multi-label classification setup, which is necessary for MAP@3
mlb = MultiLabelBinarizer(classes=all_possible_labels)
train_labels_one_hot = mlb.fit_transform(train_df['Target'].apply(lambda x: [x]))

print(f"\nShape of one-hot encoded labels: {train_labels_one_hot.shape}")
# print("Example one-hot label for first row:", train_labels_one_hot[0])

# --- Preprocessing Function and Dataset Class (No change, including for your reference) ---

def preprocess_text(question_text, mc_answer, student_explanation):
    """
    Combines all text inputs into a single string for the transformer model.
    Handles potential NaN values by converting them to empty strings.
    """
    # Ensure all inputs are strings, replace NaN with empty string
    q_text = str(question_text) if pd.notna(question_text) else ""
    mc_ans = str(mc_answer) if pd.notna(mc_answer) else ""
    s_expl = str(student_explanation) if pd.notna(student_explanation) else ""

    # Using a clear separator like [SEP] is good practice for transformers
    # It helps the model distinguish between different parts of the input.
    return f"Question: {q_text} | Answer: {mc_ans} | Explanation: {s_expl}"

# Apply preprocessing to both train and test data
train_df['ProcessedText'] = train_df.apply(
    lambda row: preprocess_text(row['QuestionText'], row['MC_Answer'], row['StudentExplanation']),
    axis=1
)
test_df['ProcessedText'] = test_df.apply(
    lambda row: preprocess_text(row['QuestionText'], row['MC_Answer'], row['StudentExplanation']),
    axis=1
)

print("\n--- Example of Processed Text (Train Data) ---")
print(train_df['ProcessedText'].iloc[0])
print("\n--- Example of Processed Text (Test Data) ---")
print(test_df['ProcessedText'].iloc[0])

class MathMisconceptionDataset(Dataset):
    """
    Custom PyTorch Dataset for handling text tokenization and label preparation.
    """
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels # Labels are already one-hot encoded
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        labels = self.labels[item] # Get the pre-processed one-hot labels

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float) # Labels as float for BCEWithLogitsLoss
        }


Total unique target labels: 65

Shape of one-hot encoded labels: (36696, 65)

--- Example of Processed Text (Train Data) ---
Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.] | Answer: \( \frac{1}{3} \) | Explanation: 0ne third is equal to tree nineth

--- Example of Processed Text (Test Data) ---
Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.] | Answer: \( \frac{1}{3} \) | Explanation: I think that 1/3 is the answer, as it's the simplest form of 3/9.


In [25]:
# --- Step 3: Model Selection, Tokenizer and Data Preparation ---

MODEL_NAME = 'microsoft/deberta-v3-small' # A powerful small model
MAX_LEN = 256 # Max sequence length for tokenizer. Adjust based on text length and GPU memory.
              # 256 is a good starting point for 13GB GPU RAM. Can try 512 if memory allows.
BATCH_SIZE = 16 # Adjust based on GPU memory. 16 is a safe start, try 32 if possible.
NUM_EPOCHS = 3 # Number of training epochs. Start with 3, increase if not overfitting.

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"\nTokenizer loaded: {MODEL_NAME}")

# Create Datasets
train_dataset = MathMisconceptionDataset(
    texts=train_df['ProcessedText'].tolist(),
    labels=train_labels_one_hot,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# For test dataset, labels are not available. We'll use a dummy array or None.
# Here we use a dummy array of zeros, as the model will predict logits anyway.
test_dummy_labels = np.zeros((len(test_df), len(all_possible_labels)))
test_dataset = MathMisconceptionDataset(
    texts=test_df['ProcessedText'].tolist(),
    labels=test_dummy_labels, # Dummy labels for inference
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# --- Step 4: Define Evaluation Metric (MAP@3) ---

def calculate_map_at_3(y_true_one_hot, y_pred_probs, all_labels):
    """
    Calculates Mean Average Precision @ 3 (MAP@3).
    y_true_one_hot: True labels (one-hot encoded).
    y_pred_probs: Predicted probabilities (output from model).
    all_labels: List of all possible Category:Misconception labels.
    """
    map_scores = []
    # Loop through each sample
    for i in range(y_true_one_hot.shape[0]):
        true_labels_idx = np.where(y_true_one_hot[i] == 1)[0]
        # Map true label indices back to actual label strings (there's only one true label per row in our data)
        true_label_str = [all_labels[idx] for idx in true_labels_idx]

        # Get predicted probabilities for this sample and sort them
        pred_probs_for_sample = y_pred_probs[i]
        # Get indices that would sort pred_probs in descending order
        sorted_indices = np.argsort(pred_probs_for_sample)[::-1]

        # Get the predicted label strings in order of confidence
        predicted_labels_str = [all_labels[idx] for idx in sorted_indices]

        # Calculate AP@3 for this sample
        precision_at_k = []
        num_relevant_found = 0
        current_precision_sum = 0
        relevant_found_this_sample = set() # To track if a relevant label has been scored

        for k in range(min(3, len(predicted_labels_str))):
            predicted_label = predicted_labels_str[k]

            if predicted_label in true_label_str and predicted_label not in relevant_found_this_sample:
                num_relevant_found += 1
                relevant_found_this_sample.add(predicted_label)
                # Precision at k: (number of relevant items found up to k) / (k+1)
                current_precision_sum += num_relevant_found / (k + 1)

        if num_relevant_found > 0:
            map_scores.append(current_precision_sum / num_relevant_found)
        else:
            map_scores.append(0.0) # If no relevant items are found in top 3, AP is 0

    return np.mean(map_scores)


# Custom compute_metrics function for Hugging Face Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Apply sigmoid to logits to get probabilities for multi-label classification
    probabilities = torch.sigmoid(torch.from_numpy(logits)).numpy()
    map3 = calculate_map_at_3(labels, probabilities, all_possible_labels)
    return {"map@3": map3}

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]




Tokenizer loaded: microsoft/deberta-v3-small
Train dataset size: 36696
Test dataset size: 3


In [27]:
# --- Step 5: Model Initialization and Training (Corrected) ---

# Load the pre-trained model for sequence classification
# num_labels is the total number of unique Category:Misconception combinations
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(all_possible_labels),
    problem_type="multi_label_classification" # Important for multi-label tasks
)
model.to(device) # Move model to GPU

print(f"\nModel loaded: {MODEL_NAME} with {len(all_possible_labels)} output labels.")

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',                         # Output directory for model checkpoints and logs
    num_train_epochs=NUM_EPOCHS,                    # Total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,         # Batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,          # Batch size per device during evaluation
    warmup_steps=500,                               # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                              # Strength of weight decay
    logging_dir='./logs',                           # Directory for storing logs
    logging_strategy="steps",                       # Log every n steps
    logging_steps=100,                              # Log every 100 steps
    # Correction: Use do_eval=False instead of evaluation_strategy="no"
    do_eval=False,                                  # Set to False to disable evaluation during training for simplicity
                                                    # For real competitions, use True with a validation set
    save_strategy="epoch",                          # Save model checkpoint every epoch
    load_best_model_at_end=False,                   # We'll save the last checkpoint
    fp16=True,                                      # Use mixed precision training for faster training and less memory (if GPU supports)
    report_to="none",                               # Disable reporting to services like WandB
    learning_rate=2e-5,                             # Standard learning rate for fine-tuning transformers
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    # The compute_metrics function is for evaluation, not directly used in training loss
    # The loss function (BCEWithLogitsLoss) is automatically handled by problem_type="multi_label_classification"
)

# Train the model
print("\nStarting model training...")
trainer.train()
print("Model training complete!")

# Clean up memory after training
del model
del trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleaned up after training.")


Model loaded: microsoft/deberta-v3-small with 65 output labels.

Starting model training...


  trainer = Trainer(


{'loss': 0.6912, 'grad_norm': 0.7639238834381104, 'learning_rate': 3.96e-06, 'epoch': 0.043591979075850044}
{'loss': 0.4566, 'grad_norm': 0.8283454775810242, 'learning_rate': 7.960000000000002e-06, 'epoch': 0.08718395815170009}
{'loss': 0.1888, 'grad_norm': 0.3975203335285187, 'learning_rate': 1.196e-05, 'epoch': 0.13077593722755013}
{'loss': 0.0794, 'grad_norm': 0.2142333686351776, 'learning_rate': 1.5960000000000003e-05, 'epoch': 0.17436791630340018}
{'loss': 0.0563, 'grad_norm': 0.28938937187194824, 'learning_rate': 1.9960000000000002e-05, 'epoch': 0.21795989537925023}
{'loss': 0.0509, 'grad_norm': 0.20516745746135712, 'learning_rate': 1.9689752428705737e-05, 'epoch': 0.26155187445510025}
{'loss': 0.0459, 'grad_norm': 0.1870928257703781, 'learning_rate': 1.9376371043560015e-05, 'epoch': 0.3051438535309503}
{'loss': 0.0453, 'grad_norm': 0.17074353992938995, 'learning_rate': 1.9062989658414292e-05, 'epoch': 0.34873583260680036}
{'loss': 0.0421, 'grad_norm': 0.21488620340824127, 'learn

In [30]:
# --- Step 5: Model Initialization and Training (No changes needed, keeping for context) ---
# ... (your existing code for Step 5, just ensure you don't delete model/trainer if you
# plan to run Step 6 immediately after without reloading)

# For clarity and robustness, I will re-load the model in Step 6.
# You can keep the `del model` and `del trainer` at the END of Step 5.
# The important part is that we load the model for prediction in Step 6.
# If you keep `del model` and `del trainer` in Step 5, make sure you run
# Step 5 to completion and then Step 6 in a fresh state (or with reloading as below).

# ... (End of your Step 5 code)
# Clean up memory after training (Keep these lines if you want to free memory after training phase)
# del model
# del trainer
# gc.collect()
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
# print("Memory cleaned up after training.")

# --- Step 6: Prediction on Test Data (Corrected) ---

# Re-load the trained model from the saved checkpoint for prediction
# The Trainer saves checkpoints in the `output_dir` specified in TrainingArguments (e.g., './results')
# It typically saves a directory like './results/checkpoint-XXXX', where XXXX is the global step.
# For simplicity, we'll load the last saved model, which is usually the one at the end of training.
# The path to the saved model is usually `output_dir` itself for the final model or `output_dir/checkpoint-XXXX`
# After trainer.train(), the model weights are stored in `trainer.model`.

# To ensure the model is loaded correctly for prediction even if memory was cleared,
# we will re-initialize it.

# Load the model again (or load from checkpoint if you prefer to save/load)
# The trainer itself doesn't directly save a specific 'best model' path without eval.
# The `model` object from Step 5, after `trainer.train()`, holds the fine-tuned weights.
# If you *did not* run `del model` and `del trainer` in Step 5, the `model` object is still there.
# If you *did* run them, you'd typically load the last saved checkpoint.
# Let's assume for robustness that we always re-load.
# A simple way to get the last checkpoint if you let `save_strategy="epoch"` is to list the output dir.

import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

# Assuming MODEL_NAME, MAX_LEN, BATCH_SIZE, all_possible_labels, test_dataset, tokenizer are still defined
# If you restarted runtime, you might need to re-run imports, model_name, etc.
# For this example, let's assume they are globally defined from previous cells.

# Re-define TrainingArguments to get the output directory path
training_args_for_load = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=BATCH_SIZE, # Only need eval batch size for prediction
    do_eval=False,
    report_to="none",
    # Other args are less critical for just loading and predicting
)

# After training, the best way to ensure you have the trained model is to load
# from the latest checkpoint, especially if you deleted `model` and `trainer`
# The Trainer saves checkpoints in the format `results/checkpoint-XXXX`.
# Let's find the latest checkpoint.

model_path = "./results" # Default path where `trainer.train()` saves its final model if no checkpointing is explicitly used for final save.
# Or, if checkpoints were saved, find the latest one:
list_of_dirs = [d for d in os.listdir('./results') if os.path.isdir(os.path.join('./results', d)) and 'checkpoint' in d]
if list_of_dirs:
    latest_checkpoint = sorted(list_of_dirs, key=lambda x: int(x.split('-')[-1]))[-1]
    model_path = os.path.join('./results', latest_checkpoint)
    print(f"Loading model from latest checkpoint: {model_path}")
else:
    print(f"No specific checkpoint found, loading from default output_dir: {model_path}")


model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(all_possible_labels),
    problem_type="multi_label_classification"
)
model.to(device) # Ensure model is on GPU

# Re-initialize Trainer specifically for prediction
# We don't need a train_dataset here
trainer = Trainer(
    model=model,
    args=training_args_for_load,
    tokenizer=tokenizer,
)

print("\nMaking predictions on test data...")
# predict returns a tuple: (predictions, label_ids, metrics)
predictions = trainer.predict(test_dataset) # Predict using the Trainer directly with test_dataset
logits = predictions.predictions # These are the raw logits from the model

# Convert logits to probabilities using sigmoid
test_probabilities = torch.sigmoid(torch.from_numpy(logits)).numpy()
print("Predictions complete!")
print(f"Shape of probabilities: {test_probabilities.shape}")

# Clean up memory after prediction (can keep these here)
del predictions
del trainer # Delete trainer as its job is done
del model # Delete model after predictions are extracted
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleaned up after prediction.")

# --- Step 7: Post-processing Predictions for Submission (MAP@3 format) ---
# This part remains the same as before

# This list will store the formatted predictions for the submission file
submission_predictions = []

print("\nPost-processing predictions for submission file...")

for i in range(test_probabilities.shape[0]):
    # Get probabilities for the current test sample
    probs_for_sample = test_probabilities[i]

    # Get the indices that would sort probabilities in descending order
    sorted_indices = np.argsort(probs_for_sample)[::-1]

    # Initialize a list for current sample's top predictions
    current_sample_preds = []
    num_added = 0
    added_labels = set() # To ensure unique labels are added

    for idx in sorted_indices:
        predicted_label_str = id_to_label[idx] # Convert index back to Category:Misconception string

        # Only add if we haven't reached 3 predictions and it's a unique label for this sample
        if num_added < 3 and predicted_label_str not in added_labels:
            current_sample_preds.append(predicted_label_str)
            added_labels.add(predicted_label_str)
            num_added += 1
        elif num_added >= 3:
            break # Stop once we have 3 predictions

    # Join the predicted labels with space
    submission_predictions.append(" ".join(current_sample_preds))

print("Post-processing complete.")

# --- Step 8: Generate Submission File ---
# This part remains the same as before

submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'Category:Misconception': submission_predictions
})

# Save the submission file
submission_file_name = 'submission.csv'
submission_df.to_csv(submission_file_name, index=False)

print(f"\nSubmission file '{submission_file_name}' created successfully!")
print("First 5 rows of submission.csv:")
print(submission_df.head())

Loading model from latest checkpoint: ./results/checkpoint-6882


  trainer = Trainer(



Making predictions on test data...
Predictions complete!
Shape of probabilities: (3, 65)
Memory cleaned up after prediction.

Post-processing predictions for submission file...
Post-processing complete.

Submission file 'submission.csv' created successfully!
First 5 rows of submission.csv:
   row_id                             Category:Misconception
0   36696  True_Correct: True_Neither: True_Misconception...
1   36697  False_Misconception:WNB False_Misconception:In...
2   36698  True_Neither: True_Correct: True_Misconception...
