<a href="https://colab.research.google.com/github/ShacharYonai/DicatLM-FineTune/blob/main/fine_tune_llm_eedi_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bitsandbytes flash-attn accelerate trl peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets (from trl)
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.13-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->trl)
  Downloading dill-0.3.8-py3

In [36]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


In [37]:
from sentence_transformers.cross_encoder import CrossEncoder

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Callable, Any, Tuple, List
import torch
from torch.utils.data import Dataset
from dataclasses import dataclass
import os
import re
from collections import Counter
from pathlib import Path

In [3]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [4]:
from transformers import (AutoTokenizer,
                          AutoModel,
                          DataCollatorWithPadding,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          DataCollatorForLanguageModeling,
                          TrainingArguments,
                          AutoModelForCausalLM,
                          Trainer,
                          TextStreamer)

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [6]:
misconseption = pd.read_csv("/content/data/misconception_mapping.csv")
sample_data = pd.read_csv("/content/data/sample_submission.csv")
train = pd.read_csv("/content/data/train.csv")
test = pd.read_csv("/content/data/test.csv")

In [7]:
def reshape_wide_to_long(df: pd.DataFrame, id_vars: list[str], value_vars: list[str], var_name: str, value_name: str, sorted_vars: list[str]) -> pd.DataFrame:
    df_reshape = df.melt(id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name)
    return df_reshape.sort_values(by=sorted_vars).reset_index(drop=True)

In [8]:
# select the variables for reshaping the misconception
train_tmp_mis = train.loc[:, ["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText",
                              *[var for var in train.columns if var.startswith('Mis')]]]

In [9]:
# select the variables for reshaping the answer
train_tmp_answer = train.loc[:, ["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText",
                              *[var for var in train.columns if var.startswith('Answer')]]]

In [10]:
train_misconception_reshaped = reshape_wide_to_long(df=train_tmp_mis,
                                                    id_vars=["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText"],
                                                    value_vars=[var for var in train.columns if var.startswith('Mis')],
                                                    var_name="misconception",
                                                    value_name="misconception_id",
                                                    sorted_vars=["QuestionId","misconception"])

In [11]:
train_answer_reshaped = reshape_wide_to_long(df=train_tmp_answer,
                                             id_vars=["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText"],
                                             value_vars=[var for var in train.columns if var.startswith('Answer')],
                                             var_name="answer",
                                             value_name="answer_id",
                                             sorted_vars=["QuestionId","answer"])

In [12]:
# concat the two data frames
train_reshaped = pd.concat([train_misconception_reshaped, train_answer_reshaped.loc[:, ["answer","answer_id"]]], axis=1)

In [13]:
# add the misconceptions texts to the reshaped train data
train_reshaped = train_reshaped.merge(misconseption, how="left", left_on="misconception_id", right_on="MisconceptionId")
train_reshaped.drop("MisconceptionId", axis=1, inplace=True)

In [14]:
# drop all the cases where misconception were not assigned
train_reshaped = train_reshaped[~train_reshaped["misconception_id"].isna()].reset_index(drop=True)

In [15]:
train_reshaped.drop(["CorrectAnswer","misconception","misconception_id","answer"], axis=1, inplace=True)

In [16]:
train_reshaped = train_reshaped.assign(txt=lambda x:
                                       "<name>\n" +
                                       x["ConstructName"] +
                                       "\n</name>\n" +
                                       "\n<subject>\n" +
                                       x["SubjectName"] +
                                       "\n</subject>\n" +
                                       "\n<question>\n" +
                                       x["QuestionText"] +
                                       "\n</question>\n" +
                                      "\n<answer>\n" +
                                      x["answer_id"] +
                                       "\n</answer>\n" +
                                      "\n<misconception>\n" +
                                      x["MisconceptionName"] +
                                      "\n</misconception>")

In [17]:
print(train_reshaped["txt"][0])

<name>
Use the order of operations to carry out calculations involving powers
</name>

<subject>
BIDMAS
</subject>

<question>
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?
</question>

<answer>
Does not need brackets
</answer>

<misconception>
Confuses the order of operations, believes addition comes before multiplication 
</misconception>


In [18]:
# shufle the data
train_reshaped = train_reshaped.sample(n=train_reshaped.shape[0], replace=False, random_state=42).reset_index(drop=True)
train, valid = train_test_split(train_reshaped, test_size=0.1, random_state=42)

In [20]:
train_prompts = train["txt"].to_list()
valid_prompts = valid["txt"].to_list()

In [None]:
model_id = "Qwen/Qwen2.5-7B"  # "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# create the Dataset templete
class PromptTemplet(Dataset):
  def __init__(self,
               prompts: list[str],
               tokenizer: AutoTokenizer):
    super().__init__()
    self.prompts = prompts
    self.tokenizer = tokenizer

  def __len__(self) -> int:
    """return the total number of samples"""
    return len(self.prompts)

  def __getitem__(self, index: int):
    """return one sample of image data and label"""
    prompt = self.prompts[index]
    score = tokenizer(prompt, return_tensors='pt')  #  max_length=512, padding='max_length',
    return {
        'prompt_text': prompt,
        'input_ids': score['input_ids'].squeeze(),
        'attention_mask': score['attention_mask'].squeeze()
    }

In [None]:
train_dataset = PromptTemplet(prompts=train_prompts, tokenizer=tokenizer)
valid_dataset = PromptTemplet(prompts=valid_prompts, tokenizer=tokenizer)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [None]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )

In [None]:
lora_config = LoraConfig(
        r=16,
        lora_alpha=8,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             trust_remote_code=True)

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [None]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 10092544 | total: 7625709056 | Percentage: 0.1323%


In [None]:
@dataclass
class TrainModel:
    output_dir: str
    run_name: str
    trainer_type: Callable
    model: Any
    tokenizer: Any
    train_data: Dataset
    valid_data: Dataset
    overwrite_output_dir: bool = True
    learning_rate: float = 0.00015  # 0.000015
    auto_find_batch_size: bool = True
    per_device_train_batch_size: int =32  # 16
    per_device_eval_batch_size: int = 16
    gradient_accumulation_steps: int = 2
    gradient_checkpointing: bool = True
    eval_accumulation_steps = 10
    warmup_steps: int = 20
    save_total_limit: int = 5
    num_train_epochs: int = 2
    weight_decay: float = 0.0001
    optim: str = "adamw_torch"  # "adamw_torch"  # "adamw_torch" 'lion_32bit'
    logging_steps = 1
    fp16: bool = True
    bf16: bool = True
    tf32: bool = True
    max_grad_norm: float = 0.3
    warmup_ratio: float = 0.03
    lr_scheduler_type: str = "cosine"
    eval_strategy: str = "steps"  # "epoch"
    save_strategy: str = "steps"  # "epoch"
    load_best_model_at_end: bool = True
    dataset_text_field: str = "prompt_text"
    metric_for_best_model: str = 'loss'
    push_to_hub: bool = False
    report_to = "wandb"

    @property
    def data_collator(self):
        return DataCollatorForLanguageModeling(self.tokenizer, mlm=False)

    @property
    def training_args(self):
        return TrainingArguments(
            output_dir=self.output_dir,
            run_name=self.run_name,
            overwrite_output_dir=self.overwrite_output_dir,
            learning_rate=self.learning_rate,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            auto_find_batch_size=self.auto_find_batch_size,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            eval_accumulation_steps=self.eval_accumulation_steps,
            # gradient_checkpointing=self.gradient_checkpointing,
            warmup_steps=self.warmup_steps,
            save_total_limit=self.save_total_limit,
            num_train_epochs=self.num_train_epochs,
            weight_decay=self.weight_decay,
            optim=self.optim,
            logging_steps=self.logging_steps,
            fp16=self.fp16,
            # bf16=self.bf16,
            # tf32=self.tf32,
            # max_grad_norm=self.max_grad_norm,
            # warmup_ratio=self.warmup_ratio,
            lr_scheduler_type=self.lr_scheduler_type,
            eval_strategy=self.eval_strategy,
            save_strategy=self.save_strategy,
            metric_for_best_model=self.metric_for_best_model,
            load_best_model_at_end=self.load_best_model_at_end,
            push_to_hub=self.push_to_hub,
            #report_to=self.report_to
        )

    @property
    def train(self):
        trainer_obj = SFTTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_data,
            eval_dataset=self.valid_data,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
        )
        return trainer_obj.train()

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer = TrainModel(output_dir="fine_tune_qwen_2.5",
                     run_name="fine_tunning_qwen_2.5_eedi",
                     trainer_type=Trainer,
                     model=model,
                     tokenizer=tokenizer,
                     train_data=train_dataset,
                     valid_data=valid_dataset,
                    )

In [None]:
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
1,1.9005,1.881471
2,1.9542,1.881303
3,1.9015,1.880659
4,1.8633,1.879346
5,1.914,1.876922
6,1.8497,1.873215
7,1.9262,1.868224
8,1.9635,1.862104
9,1.8515,1.854452
10,1.9131,1.844866


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss,Validation Loss
1,1.2245,1.23412
2,1.2205,1.232921
3,1.228,1.230395
4,1.202,1.226397
5,1.2522,1.221235
6,1.2596,1.214796
7,1.1669,1.207389
8,1.2556,1.199196
9,1.2485,1.190207
10,1.1235,1.180769


TypeError: 'TrainOutput' object is not callable

In [None]:
trainer.model.save_pretrained("fine-tune-qwen-2.5-eedi")

In [21]:
torch.cuda.empty_cache()

In [22]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )

In [23]:
base_model = "Qwen/Qwen2.5-7B"  # "microsoft/Phi-3.5-mini-instruct"
fine_tuned_model = "/content/fine-tune-qwen-2.5-eedi"  # "/content/fine-tune-phi-3.5-mini-instruct-eedi"

In [24]:
model_base = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, return_dict=True, device_map='auto')
# model_fine_tuned = AutoModelForCausalLM.from_pretrained(fine_tuned_model, quantization_config=bnb_config, return_dict=True, device_map='auto')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [25]:
# del model_fine_tuned
import gc
gc.collect()
torch.cuda.empty_cache()

In [26]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [27]:
# next we'll add our trained adapter to the model
model_base.load_adapter(fine_tuned_model, adapter_name="adapter")

In [28]:
# now using enable_adapters and disable_adapters we can choose
# if we want to run inference on the model itself or have it be
# influenced by our newly trained weights
model_base.enable_adapters()
# model.disable_adapters()

In [29]:
# also make sure we set the pad token, and for good measure turn off caching
model_base.config.pad_token_id = tokenizer.pad_token_id
model_base.config.use_cache = False
model_base.config.pad_token_id = tokenizer.eos_token_id

In [None]:
# model_fine_tuned.config.pad_token_id = tokenizer.pad_token_id
# model_fine_tuned.config.use_cache = False
# model_fine_tuned.config.pad_token_id = tokenizer.eos_token_id

In [30]:
model_base.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=True)
            (lora_dropout): ModuleDict(
              (adapter): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (adapter): Linear(in_features=3584, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (adapter): Linear(in_features=16, out_features=3584, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=3584, out_features=512, bias=True)
            (lora_dropout): ModuleDict(
     

In [None]:
# model_fine_tuned.eval()

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3072, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb)

In [None]:
len(valid_prompts)

437

In [68]:
# Load a pretrained CrossEncoder model
cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2", default_activation_function=torch.nn.Sigmoid())



In [48]:
list_of_misconseption = misconseption["MisconceptionName"].to_list()

In [173]:
def create_misconception_prompt(prompt: str) -> str:
  txt_to_delete = re.findall("<misconception>\n(.*\n</misconception>)", prompt)[0]
  p=re.sub(txt_to_delete, "", prompt)
  return p

In [218]:
def generate_misconception(misconception_prompt: str,
                           tokenizer: AutoTokenizer,
                           temperature: float = 0.7,
                           max_new_tokens: int = 30,
                           num_retries: int = 5) -> list[str]:
  device = "cuda" if torch.cuda.is_available() else "cpu"
  streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
  inputs = tokenizer(misconception_prompt, return_tensors="pt").to(device)
  predicted_misconceptions = []
  for _ in range(num_retries):
    generated_ids_model_base = model_base.generate(**inputs,
                                                  temperature=temperature,
                                                  streamer=streamer,
                                                  max_new_tokens=max_new_tokens,
                                                  do_sample=True,
                                                  pad_token_id=tokenizer.eos_token_id)
    decoded_model_base = tokenizer.batch_decode(generated_ids_model_base)
    try:
      predicted_misconception = re.findall("<mis.*>\n(.*)\n</mis.*>?", decoded_model_base[0])[0]
      predicted_misconceptions.append(predicted_misconception)
    except Exception as e:
      print(f"Error: {e}")
      predicted_misconceptions.append(None)
  return predicted_misconceptions

In [147]:
def extract_similar_misconception(predicted_misconceptions: str, list_of_misconseption: list[str], cross_encoder_model: Any) -> list[int]:
  """
  the functions recieves at most three generated misconceptions.
  It then generated list of three ranked similar misconceptions.
  I aggregate the misconceptions into a single list and rank the misconceptions by their frequency
  in descending order
  """
  general_corpuses = []
  for misconception in predicted_misconceptions:
    ranks = cross_encoder_model.rank(misconception, list_of_misconseption)
    list_of_corpuses = []
    for corpus in ranks[:100]:
      list_of_corpuses.append(corpus["corpus_id"])
    general_corpuses.extend(list_of_corpuses)
  misconceptions_frequencies = Counter(general_corpuses).most_common()
  top_misconceptions = [id for id, _ in misconceptions_frequencies[:25]]
  return top_misconceptions


In [254]:
number=20
misconception_prompt = create_misconception_prompt(valid_prompts[number])

In [255]:
predicted_misconceptions = generate_misconception(misconception_prompt, tokenizer)








In [256]:
similar_misconception = extract_similar_misconception(predicted_misconceptions, list_of_misconseption, cross_encoder_model)

In [257]:
print(similar_misconception)

[47, 1412, 1303, 1654, 594, 1475, 2465, 1949, 1541, 283, 101, 1584, 563, 2014, 2380, 422, 1889, 362, 2497, 1234, 1312, 165, 1893, 866, 2083]


In [258]:
original_misconception = re.findall("<misconception>\n(.*)\n</misconception>", valid_prompts[number])[0]
print(original_misconception)

Believes y=-f(x) is a reflection in y=x


In [259]:
# locate the index place of the original misconception in the list
index_of_original_misconception = [i for i, x in enumerate(list_of_misconseption) if x == original_misconception]
print(index_of_original_misconception)

[47]
