In [1]:
!pip install torch peft datasets accelerate
!pip install transformers



In [2]:
import numpy as np 
import pandas as pd 
import math
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, DataCollatorWithPadding
import torch

In [3]:
data = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
data.head()
data.loc[data['MisconceptionId'] == 1180]

Unnamed: 0,MisconceptionId,MisconceptionName
1180,1180,Does not know the properties of a rectangle


In [6]:
train = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
train.head(10)

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0
5,5,2052,Identify a unit of area,75,Area of Simple Shapes,D,James has answered a question on the area of a...,\( m \),\( \mathrm{cm} \),\( \mathrm{km}^{3} \),\( \mathrm{mm}^{2} \),686.0,686.0,686.0,
6,6,376,Convert two digit integer percentages to fract...,238,Converting between Fractions and Percentages,B,Convert this percentage to a fraction\n\( 62 \...,\( \frac{62}{10} \),\( \frac{31}{50} \),\( \frac{6}{2} \),None of these,329.0,,847.0,329.0
7,7,314,Divide decimals by 10,224,Multiplying and Dividing with Decimals,A,\( 43.2 \div 10= \),\( 4.32 \),\( 0.432 \),\( 33.2 \),\( 43.02 \),,2123.0,2273.0,2133.0
8,8,435,Subtract proper fractions with different denom...,230,Adding and Subtracting Fractions,A,\(\n\frac{4}{5}-\frac{1}{3}=\frac{\bigstar}{15...,\( 7 \),\( 5 \),\( 17 \),\( 3 \),,907.0,1514.0,907.0
9,9,1321,Identify horizontal translations in the form f...,164,Transformations of functions in the form f(x),C,What transformation maps the graph of\n\(y=f(x...,Translation by vector\n\(\n\left[\begin{array}...,Translation by vector\n\(\n\left[\begin{array}...,Translation by vector\n\(\n\left[\begin{array}...,Translation by vector\n\(\n\left[\begin{array}...,1889.0,1234.0,,1312.0


In [7]:
len(train)

1869

In [8]:
train["QuestionText"][10]

'John is expanding these three brackets:\n\\(\n(x+5)(x-3)(x+6)\n\\)\n\nHe finds it helpful to split the first bracket and view the problem as follows:\n\\(\nx(x-3)(x+6)+5(x-3)(x+6)\n\\)\n\nHe expands the brackets further. Before he simplifies, what are the terms than contain an \\( \\boldsymbol{x}^{2} \\) ?'

In [9]:
test = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
test.head()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct


In [10]:
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



## Instruction Fine Tuning

In [11]:
# Generate prompts for each wrong answer
prompts = []
labels  = []
misconception_lists = ['MisconceptionAId','MisconceptionBId','MisconceptionCId','MisconceptionDId']
answers_list        = ['AnswerAText','AnswerBText','AnswerCText','AnswerDText']
for index in range(4) :
    minsconception_name = misconception_lists[index]
    for _, row in train.iterrows():
        construct_name = row['ConstructName']
        question = row['QuestionText']
        req_col = 'AnswerAText'
        if row['CorrectAnswer'] == 'B':
            req_col = 'AnswerBText'
        elif row['CorrectAnswer'] == 'C':
            req_col = 'AnswerCText'
        elif row['CorrectAnswer'] == 'D':
            req_col = 'AnswerDText'
        correct_answer = row[req_col]
        subject = row['SubjectName']
        wrong_answer = row[answers_list[index]]
        misconception_id = row[misconception_lists[index]]
 
        if  pd.notna(misconception_id) :  
            prompt = f"""
            I am giving you question and its correct answer.
            But student gave the wrong answer because of some missconception
            to which misconception_id is assigned, misconception_id is a number.

            Construct name is ---> {construct_name}
            Question is ---> {question},
            subject is  ---> {subject},
            correct answer is ---> {correct_answer}
            ________________________________________
            But student gave the wrong answer ---> {wrong_answer}
            misconception_id is ---> ?
            """
            prompts.append(prompt)
            labels.append(str(misconception_id))
    
# Example prompts for inspection
for prompt in prompts[:2]: 
    print(prompt)


            I am giving you question and its correct answer.
            But student gave the wrong answer because of some missconception
            to which misconception_id is assigned, misconception_id is a number.

            Construct name is ---> Simplify an algebraic fraction by factorising the numerator
            Question is ---> Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \),
            subject is  ---> Simplifying Algebraic Fractions,
            correct answer is ---> Does not simplify
            ________________________________________
            But student gave the wrong answer ---> \( m+1 \)
            misconception_id is ---> ?
            

            I am giving you question and its correct answer.
            But student gave the wrong answer because of some missconception
            to which misconception_id is assigned, misconception_id is a number.

            Construct name is ---> Calculate the range from a list of data
          

In [12]:
# Create a pandas DataFrame with input and output
dataset = pd.DataFrame({
    "input_text": prompts,
    "labels": labels
})

# Check the first few rows of the dataset
print(dataset.head())

                                          input_text  labels
0  \n            I am giving you question and its...  2142.0
1  \n            I am giving you question and its...  1287.0
2  \n            I am giving you question and its...  1180.0
3  \n            I am giving you question and its...   686.0
4  \n            I am giving you question and its...   329.0


In [13]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",  # Sequence-to-sequence task
    inference_mode=False,
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

# Print model summary
lora_model.print_trainable_parameters()


trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096


In [14]:
from datasets import Dataset

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Split into train and validation sets (80-20 split)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

# Inspect the dataset
print(hf_dataset)


DatasetDict({
    train: Dataset({
        features: ['input_text', 'labels'],
        num_rows: 3496
    })
    test: Dataset({
        features: ['input_text', 'labels'],
        num_rows: 874
    })
})


In [15]:
# Tokenize the dataset
def tokenize_function(dataset):
    inputs = tokenizer(dataset["input_text"], truncation=True, 
                       padding="max_length", max_length=216, return_tensors='pt')
    labels = tokenizer(dataset["labels"], truncation=True, 
                       padding="max_length", max_length=216, return_tensors='pt')
    inputs["labels"] = labels["input_ids"]  # Set labels as decoder input IDs
    return inputs

# Tokenize the dataset
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/3496 [00:00<?, ? examples/s]

Map:   0%|          | 0/874 [00:00<?, ? examples/s]

In [16]:
from datasets import DatasetDict

# Select only the necessary columns
train_dataset = tokenized_dataset['train'].map(lambda examples: {
    "input_ids": examples["input_ids"],
    "attention_mask": examples["attention_mask"],
    "labels": examples["labels"],
}, remove_columns=["input_text"])

test_dataset = tokenized_dataset['test'].map(lambda examples: {
    "input_ids": examples["input_ids"],
    "attention_mask": examples["attention_mask"],
    "labels": examples["labels"],
}, remove_columns=["input_text"])


Map:   0%|          | 0/3496 [00:00<?, ? examples/s]

Map:   0%|          | 0/874 [00:00<?, ? examples/s]

In [17]:

train_dataset.set_format("torch")
test_dataset.set_format("torch")
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 3496
})

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
train_dataset['labels'].shape

torch.Size([3496, 216])

In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/edi_competition_trained_model",  # Directory to save the model
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 500 steps
    logging_steps=100,  # Log progress every 100 steps
    save_steps=100,  # Save model every 500 steps
    learning_rate=1e-4,  # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,  # Weight decay for optimization
    save_total_limit=2,  # Limit the number of saved checkpoints
    predict_with_generate=True,  # Use the model to generate predictions
    fp16=True,  # Mixed precision training
    report_to="none"  # Disable reporting to external tools like WandB
)

# Trainer
trainer = Seq2SeqTrainer(
    model=lora_model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()




Step,Training Loss,Validation Loss


KeyboardInterrupt: 