In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found anywhere beneath the
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/debertav3small/spm.model
/kaggle/input/debertav3small/config.json
/kaggle/input/debertav3small/README.md
/kaggle/input/debertav3small/tf_model.h5
/kaggle/input/debertav3small/tokenizer_config.json
/kaggle/input/debertav3small/pytorch_model.bin
/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/spm.model
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/config.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README (1).md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README.md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/tokenizer_config.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/tokenizer_config (1).json
/kaggle/input

In [20]:
# --- IMPORTANT: BEGINNING OF 0. INITIAL SETUP & CONFIGURATION CELL ---

# Install necessary libraries (usually pre-installed on Kaggle)
# !pip install -q transformers pandas scikit-learn numpy torch datasets accelerate sentencepiece

import pandas as pd
import numpy as np
import torch
import gc # For garbage collection
import os, random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# NOTE: `Dataset` must come from the Hugging Face `datasets` library, not from `torch.utils.data`;
# the `Dataset.from_pandas(...)` calls later in this notebook rely on it.
from datasets import Dataset # <<< --- CHANGED THIS LINE
# PyTorch's DataLoader is not imported here: the Trainer builds its own DataLoaders internally,
# and the inference cell refers to it by its full name, `torch.utils.data.DataLoader`.


# --- Configuration Constants ---
# Local directory of the pretrained checkpoint (a Kaggle input dataset).
MODEL_NAME = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base/'
MAX_LEN = 256        # max_length handed to the tokenizer
BATCH_SIZE = 16      # batch size used for training, evaluation and test inference
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
SEED = 42

# Seed every random number generator this notebook touches, in a fixed order,
# so that repeated runs start from the same state.
os.environ.update(PYTHONHASHSEED=str(SEED))
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels and switch off the cuDNN auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

print("Setup and Configuration Complete!")

# --- END OF 0. INITIAL SETUP & CONFIGURATION CELL ---

Using device: cuda
Setup and Configuration Complete!


In [21]:
import os, random # Added random for seeding

# Locations of the competition CSV files (Kaggle mounts datasets under /kaggle/input/).
TRAIN_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
SAMPLE_SUBMISSION_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv'

# Read the three competition tables.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUBMISSION_PATH)
)

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample Submission shape: {sample_submission_df.shape}")

# A missing Misconception becomes the literal string "NA" so that it can be
# concatenated into the combined "Category:Misconception" target.
train_df["Misconception"] = train_df["Misconception"].fillna("NA")
train_df["target"] = train_df["Category"] + ":" + train_df["Misconception"]

# --- Calculate Correctness Feature ---
# Boolean mask of the rows whose Category starts with "True".
idx_true = train_df["Category"].str.startswith("True")

# For every (QuestionId, MC_Answer) pair, count how many "True..." rows chose it.
true_answer_counts = (
    train_df[idx_true]
    .groupby(["QuestionId", "MC_Answer"])["MC_Answer"]
    .count()
    .reset_index(name="count_correct_answers")
)
# Per question, keep only the answer with the highest count; that answer is
# treated as the correct one.
correct_counts = (
    true_answer_counts
    .sort_values("count_correct_answers", ascending=False)
    .drop_duplicates(["QuestionId"])
)
correct_counts["is_correct"] = 1

# Attach the flag to the training rows; pairs without a match get 0.
answer_key = correct_counts[["QuestionId", "MC_Answer", "is_correct"]]
train_df = train_df.merge(answer_key, on=["QuestionId", "MC_Answer"], how="left")
train_df["is_correct"] = train_df["is_correct"].fillna(0).astype(int)

# (QuestionId, MC_Answer) -> 1 lookup table, reused for the test rows.
dict_corr = correct_counts.set_index(["QuestionId", "MC_Answer"])["is_correct"].to_dict()


def _lookup_is_correct(row):
    """Return 1 when the row's (QuestionId, MC_Answer) pair is in dict_corr, else 0."""
    return int(dict_corr.get((row.QuestionId, row.MC_Answer), 0))


test_df["is_correct"] = test_df.apply(_lookup_is_correct, axis=1)

print("\n--- Correctness Feature Example (Train Data) ---")
print(train_df[['QuestionId', 'MC_Answer', 'Category', 'is_correct']].head())
print("\n--- Correctness Feature Example (Test Data) ---")
print(test_df[['QuestionId', 'MC_Answer', 'is_correct']].head())


# --- Prompt Engineering Function ---
def build_prompt(row):
    """
    Assemble the text fed to the model for a single row.

    Args:
        row: object exposing the attributes ``QuestionText``, ``MC_Answer``,
            ``is_correct`` and ``StudentExplanation`` (e.g. a DataFrame row).

    Returns:
        A four-line string: the question, the chosen answer, a sentence saying
        whether that answer is correct (driven by the truthiness of
        ``row.is_correct``), and the student's explanation.
    """
    if row.is_correct:
        verdict = "correct."
    else:
        verdict = "incorrect."

    pieces = (
        f"Question: {row.QuestionText}\n",
        f"Answer: {row.MC_Answer}\n",
        f"This answer is {verdict}\n",
        f"Student Explanation: {row.StudentExplanation}",
    )
    return "".join(pieces)

# Build the model input text for every row of the train and test tables.
for _frame in (train_df, test_df):
    _frame["text"] = _frame.apply(build_prompt, axis=1)

print("\n--- Example of New Processed Text with Correctness Feature (Train Data) ---")
print(train_df["text"].iloc[0])
print("\nData Loading and Feature Engineering Complete!")

Train data shape: (36696, 7)
Test data shape: (3, 5)
Sample Submission shape: (3, 2)

--- Correctness Feature Example (Train Data) ---
   QuestionId          MC_Answer      Category  is_correct
0       31772  \( \frac{1}{3} \)  True_Correct           1
1       31772  \( \frac{1}{3} \)  True_Correct           1
2       31772  \( \frac{1}{3} \)  True_Neither           1
3       31772  \( \frac{1}{3} \)  True_Neither           1
4       31772  \( \frac{1}{3} \)  True_Correct           1

--- Correctness Feature Example (Test Data) ---
   QuestionId          MC_Answer  is_correct
0       31772  \( \frac{1}{3} \)           1
1       31772  \( \frac{3}{6} \)           0
2       32835          \( 6.2 \)           1

--- Example of New Processed Text with Correctness Feature (Train Data) ---
Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]
Answer: \( \frac{1}{3} \)
This ans

In [22]:
# Map every distinct "Category:Misconception" target string to an integer class id.
le = LabelEncoder().fit(train_df["target"])
train_df["label"] = le.transform(train_df["target"])

# Number of distinct target classes; this becomes the size of the model's output layer.
NUM_CLASSES = len(le.classes_)
print(f"Unique target classes: {NUM_CLASSES}")

# Integer class id -> original target string, kept for decoding predictions.
id_to_label = dict(enumerate(le.classes_))

print("Target Label Preparation Complete (using LabelEncoder)!")

Unique target classes: 65
Target Label Preparation Complete (using LabelEncoder)!


In [23]:
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pads each batch to the length of its longest sequence at collation time.
collator  = DataCollatorWithPadding(tokenizer)

def tokenize(batch):
    """
    Tokenize a batch of prompts without padding them.

    Sequences are truncated to MAX_LEN tokens but left at their natural length.
    Padding is deferred to `collator` (DataCollatorWithPadding), which pads each
    batch only up to its longest member. Padding here to `max_length` would make
    every example MAX_LEN tokens long and turn the collator into a no-op, so the
    model would spend compute on padding tokens.

    Args:
        batch: mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer's output for the batch (variable-length sequences).
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

# Stratified 5-fold splitter on the encoded label; only its first fold is
# consumed here, giving a single train/validation split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
train_idx, val_idx = next(iter(skf.split(train_df, train_df["label"])))

df_train, df_val = (
    train_df.iloc[row_positions].reset_index(drop=True)
    for row_positions in (train_idx, val_idx)
)

# Columns exposed to PyTorch once a dataset has been tokenized.
cols = ["input_ids", "attention_mask", "label"]


def _frame_to_torch_dataset(frame):
    """Turn the text/label columns of `frame` into a tokenized, torch-formatted Dataset."""
    dataset = Dataset.from_pandas(frame[["text", "label"]])
    dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
    dataset.set_format("torch", columns=cols)
    return dataset


# Create Hugging Face Dataset objects for the two splits.
train_ds = _frame_to_torch_dataset(df_train)
val_ds   = _frame_to_torch_dataset(df_val)

print("Tokenization and Dataset Creation Complete!")

Loading tokenizer…




Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7340 [00:00<?, ? examples/s]

Tokenization and Dataset Creation Complete!


In [None]:
# --- 4. Model Initialization & Training ---

# This cell loads the model, defines the custom `map3_metric`, and runs the training process.

print("Loading model…")
# Build a sequence classifier on top of the pretrained checkpoint, with one
# output per encoded target class, then move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_CLASSES)
model = model.to(device)

print(f"\nModel loaded: {MODEL_NAME} with {NUM_CLASSES} output labels.")


# MAP@3 metric, passed to the Trainer as `compute_metrics`
def map3_metric(eval_pred):
    """
    Compute mean average precision at 3 for single-label classification.

    Each sample scores 1, 1/2 or 1/3 when its true label is the 1st, 2nd or 3rd
    highest-scoring class respectively, and 0 otherwise. The metric is the mean
    of these per-sample scores.

    Args:
        eval_pred: a pair ``(logits, labels)`` (anything that unpacks into two
            items). ``logits`` is an array of shape (n_samples, n_classes), or a
            tuple whose first element is that array; ``labels`` holds the
            integer class id of each sample.

    Returns:
        dict: ``{"map@3": score}`` with ``score`` a Python float. An empty
        evaluation set yields 0.0.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    # Rank on float32 logits directly. Softmax is monotonic, so it cannot change
    # the ordering, but in low precision it can collapse distinct logits into
    # tied probabilities and make the top-3 arbitrary.
    scores = np.asarray(logits, dtype=np.float32)
    labels = np.asarray(labels).reshape(-1)
    if labels.size == 0:
        return {"map@3": 0.0}

    # Indices of the (up to) three highest-scoring classes, best first.
    top3_preds_indices = np.argsort(-scores, axis=1)[:, :3]

    # Class indices within a row are unique, so each row has at most one hit;
    # weighting the hit by 1/rank gives that sample's score.
    hits = top3_preds_indices == labels[:, None]
    reciprocal_ranks = 1.0 / np.arange(1, top3_preds_indices.shape[1] + 1)
    per_sample_scores = (hits * reciprocal_ranks).sum(axis=1)

    return {"map@3": float(per_sample_scores.mean())}


# Define Training Arguments
# Relative paths below resolve against the notebook's working directory
# (/kaggle/working/ on Kaggle), which is what gets preserved as output.
args = TrainingArguments(
    output_dir="./checkpoints",
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY, # Configured in the setup cell but previously never passed on
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    load_best_model_at_end=True, # Load the best model based on metric_for_best_model
    metric_for_best_model="map@3",
    greater_is_better=True, # Higher MAP@3 is better
    seed=SEED,
    report_to="none",
    save_strategy="epoch", # Save every epoch (must match eval_strategy for load_best_model_at_end)
    # Limits retained checkpoints; with load_best_model_at_end the best checkpoint
    # is kept in addition to the most recent one.
    save_total_limit=1,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=100,
    # Mixed precision needs a GPU; enabling it unconditionally breaks the CPU
    # fallback that the setup cell allows for.
    fp16=torch.cuda.is_available(),
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; use whichever this installed version accepts.
import inspect
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds, # Used for the per-epoch evaluation
    data_collator=collator,
    compute_metrics=map3_metric, # Our custom MAP@3 metric
    **{_tokenizer_kwarg: tokenizer},
)

# Train the model
print("\nStarting model training...")
trainer.train()
print("Model training complete!")

# Save the model held by the trainer (the best checkpoint, because
# load_best_model_at_end=True) to ./best_model. This is a sibling of
# ./checkpoints in the working directory, not a folder inside output_dir.
trainer.save_model("./best_model")

# Persist the LabelEncoder too, so predictions can be decoded at inference time.
import joblib
joblib.dump(le, "./label_encoder.joblib")

# Clean up memory after training
del model
del trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleaned up after training.")

Loading model…


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-base/ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded: /kaggle/input/huggingfacedebertav3variants/deberta-v3-base/ with 65 output labels.

Starting model training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
import joblib # Ensure joblib is imported for loading the LabelEncoder

print("\nInference on test set…")

# Restore the fine-tuned classifier from ./best_model and move it onto the target device.
model = AutoModelForSequenceClassification.from_pretrained("./best_model", num_labels=NUM_CLASSES)
model = model.to(device)

# Restore the LabelEncoder that was stored next to the model.
le = joblib.load("./label_encoder.joblib")


def _tokenize_test_batch(batch):
    """Tokenize a batch of test prompts, truncated and padded to MAX_LEN."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)


# The `text` column of test_df was built earlier by the feature-engineering cell.
test_ds = Dataset.from_pandas(test_df[["text"]])
test_ds = test_ds.map(_tokenize_test_batch, batched=True, remove_columns=["text"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask"])

# Iterate over the test set in its original order so predictions line up with test_df rows.
test_loader = torch.utils.data.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator,
)

model.eval() # Evaluation mode
all_probs = []

# No gradients are needed for prediction.
with torch.no_grad():
    for batch in test_loader:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_logits = model(**on_device).logits
        # Convert to class probabilities and bring them back to the CPU as NumPy.
        all_probs.append(torch.softmax(batch_logits, dim=-1).cpu().numpy())

# One row of class probabilities per test example.
probs = np.vstack(all_probs)

print("Inference complete!")
print(f"Shape of probabilities: {probs.shape}")

# Release the model and data pipeline now that the probabilities are in hand.
del model
del test_loader
del test_ds
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleaned up after inference.")

In [None]:
# Class ids of the three most probable classes per row, most probable first.
top3_indices = np.argsort(-probs, axis=1)[:, :3]

# Decode the ids back to "Category:Misconception" strings, keeping the (rows, 3) layout.
decoded = le.inverse_transform(top3_indices.ravel())
labels_flat = decoded.reshape(top3_indices.shape)

# One space-separated string of three labels per test row.
joined_predictions = list(map(" ".join, labels_flat))

# Assemble the submission table, keyed by the row_id values from test_df.
submission_df = pd.DataFrame(
    {
        "row_id": test_df["row_id"],
        "Category:Misconception": joined_predictions,
    }
)

# Write the submission file into the Kaggle working directory.
submission_file_name = "submission.csv"
submission_df.to_csv(f"/kaggle/working/{submission_file_name}", index=False)

print(f"Saved {submission_file_name} ✅")
print("\nFirst 5 rows of submission.csv:")
print(submission_df.head())
print(f"\nTotal rows in submission.csv: {len(submission_df)}")