<a href="https://colab.research.google.com/github/SaatvikP/ApproachCraft-AI/blob/main/ApproachCraft_AI_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Keep Weights & Biases logging off for the whole session.
# NOTE(review): the training cell's recorded output reports that this
# environment variable is deprecated in favour of `report_to`.
os.environ.update(WANDB_DISABLED="true")

In [None]:
!pip install transformers datasets accelerate peft trl bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12

In [None]:
from datasets import load_dataset, DatasetDict

# Read every record of the local JSONL file as a single "train" split.
full_dataset = load_dataset(
    "json",
    data_files="ai_edu_dataset.jsonl",
    split="train",
)

# Hold out 10% of the records for evaluation; the fixed seed keeps the
# train/test partition identical across runs.
split_dataset = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = split_dataset["train"], split_dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/phi-2",
    trust_remote_code=True,
)
# Reuse EOS as the padding token so that `padding="max_length"` in `tokenize`
# has a pad token to work with (presumably the checkpoint defines none — confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Turn one prompt/response record into fixed-length causal-LM features.

    The record's ``prompt`` and ``response`` fields are rendered into the
    "### Research Problem / ### Suggested Approach" template, tokenized with
    the module-level ``tokenizer``, truncated or padded to exactly 512 tokens,
    and given a ``labels`` entry that is a copy of ``input_ids``.

    Args:
        example: Mapping with ``"prompt"`` and ``"response"`` entries.

    Returns:
        The tokenizer output with an added ``"labels"`` key.
    """
    training_text = f"### Research Problem:\n{example['prompt']}\n\n### Suggested Approach:\n{example['response']}"
    encoded = tokenizer(training_text, max_length=512, padding="max_length", truncation=True)
    # Labels start as a separate list so later edits to one do not alias the other.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize both splits the same way (train first, then test), dropping the
# original columns so only the tokenizer's fields remain.
tokenized_train, tokenized_test = (
    split.map(tokenize, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1704 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch

# LoRA adapter settings: rank-8 adapters (alpha 16, dropout 0.1) attached to
# the modules named `q_proj` and `v_proj`; no bias terms are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Load the phi-2 checkpoint with float16 weights, let `device_map="auto"`
# choose placement, and wrap it with the adapters in one step.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    ),
    lora_config,
)


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Fine-tune the LoRA-wrapped model with the Hugging Face Trainer.
#
# Logging integrations (wandb etc.) are switched off with `report_to="none"`,
# which is the supported replacement for the deprecated `WANDB_DISABLED`
# environment variable.
training_args = TrainingArguments(
    output_dir="./finetuned-phi2-ai-edu",
    run_name="phi2-2000-run",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step per device
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="no",  # no intermediate checkpoints are written
    report_to="none",
    remove_unused_columns=False,
    label_names=["labels"]
)

# mlm=False selects causal-LM collation rather than masked-LM collation.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    processing_class=tokenizer,
    data_collator=data_collator
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
10,2.6075
20,2.4084
30,2.3857
40,2.3358
50,2.3266
60,2.3316
70,2.3178
80,2.3535
90,2.3454
100,2.3584


TrainOutput(global_step=318, training_loss=2.320936412931238, metrics={'train_runtime': 368.4922, 'train_samples_per_second': 13.873, 'train_steps_per_second': 0.863, 'total_flos': 4.130935356260352e+16, 'train_loss': 2.320936412931238, 'epoch': 2.976525821596244})

In [None]:
from tqdm import tqdm
import json

from transformers import set_seed

# Build one inference prompt per held-out example. The template matches the
# one used for training, but stops right after the "Suggested Approach" header
# so the model has to write the approach itself.
test_prompts = [f"### Research Problem:\n{ex['prompt']}\n\n### Suggested Approach:\n" for ex in test_dataset]
gold_responses = [ex['response'] for ex in test_dataset]

# Sampling is stochastic; seed it so the saved predictions can be reproduced.
set_seed(42)

# Training leaves the model in train mode, which would keep LoRA dropout
# active while generating. Switch to eval mode before sampling.
model.eval()

results = []

for prompt, gold in tqdm(zip(test_prompts, gold_responses), total=len(test_prompts)):
    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # The returned sequence starts with the prompt tokens. Slice them off by
    # length rather than string-replacing the prompt, which silently fails
    # whenever decoding does not reproduce the prompt text exactly.
    prompt_length = inputs["input_ids"].shape[1]
    prediction = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    results.append({
        "prompt": prompt,
        "gold_response": gold,
        "generated_output": prediction
    })

# Save the structured results as a JSON file
with open("evaluation_data.json", "w") as f:
    json.dump(results, f, indent=2)


100%|██████████| 190/190 [50:00<00:00, 15.79s/it]


In [None]:
import json

# Reload the saved generations so the metric cells below can run without
# repeating the generation loop.
with open("evaluation_data.json", "r") as f:
    results = json.load(f)

# Parallel lists: position i of `gold` and `pred` refer to the same record.
gold = list(map(lambda record: record["gold_response"], results))
pred = list(map(lambda record: record["generated_output"], results))


In [None]:
from rouge import Rouge

# Corpus-averaged ROUGE of the generated outputs (hypotheses) against the
# gold responses (references).
rouge = Rouge()
scores = rouge.get_scores(hyps=pred, refs=gold, avg=True)

print("ROUGE Scores:", scores, sep="\n")


ROUGE Scores:
{'rouge-1': {'r': 0.2747847616390678, 'p': 0.2895234286608311, 'f': 0.2778841237928774}, 'rouge-2': {'r': 0.06310624172557834, 'p': 0.060392484224364294, 'f': 0.06065719675958528}, 'rouge-l': {'r': 0.2538399640968369, 'p': 0.26787507842225267, 'f': 0.2569339828489636}}


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Use a dedicated name for the sentence encoder. Binding it to `model` would
# overwrite the fine-tuned language model and break any re-run of the
# generation cell in the same kernel.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

gold_embeddings = embedder.encode(gold)
pred_embeddings = embedder.encode(pred)

# cosine_similarity returns the full gold-by-pred matrix; only the diagonal
# pairs each gold response with the prediction for the same prompt.
similarities = cosine_similarity(gold_embeddings, pred_embeddings)
avg_similarity = similarities.diagonal().mean()

print(f"Average Semantic Cosine Similarity: {avg_similarity:.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Average Semantic Cosine Similarity: 0.7199
