# **Install required libraries**

In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


In [None]:
pip install nltk rouge-score

# Cell 1

# Cell 2

In [None]:
!pip install datasets


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never keep the access token as a literal in the notebook: take it from the
# HF_TOKEN environment variable, or prompt for it (input is not echoed) when
# the variable is unset or empty.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))


In [None]:
from datasets import Dataset
import json

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42


def _flatten_answer(answer):
    """Collapse an answer (string, list, dict or other scalar) into one string.

    - list: one item per line
    - dict: one ``key: value`` line per entry; list values are comma-separated
    - None: empty string
    - anything else: ``str(answer)``

    List items are converted with ``str`` so non-string entries no longer make
    ``join`` raise, and the result is always a ``str`` so the resulting dataset
    column has a single type.
    """
    if answer is None:
        return ""
    if isinstance(answer, list):
        return "\n".join(str(part) for part in answer)
    if isinstance(answer, dict):
        return "\n".join(
            f"{key}: {', '.join(str(v) for v in value) if isinstance(value, list) else value}"
            for key, value in answer.items()
        )
    return str(answer)


with open("output_unique_5000.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

formatted = []
for item in raw_data:
    # `or ""` also covers records where the key exists but holds null.
    question = str(item.get("question") or "").strip()
    answer = _flatten_answer(item.get("answer", ""))
    formatted.append({"question": question, "answer": answer})

ds = Dataset.from_list(formatted)
# 90/10 split; seeded so reruns produce the same partition.
ds_dict = ds.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Upload both splits to a private dataset repository on the Hub.
ds_dict.push_to_hub("Yasser18/QA_clean", private=True)


Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 46.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 106.21ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Yasser18/QA_clean/commit/7875a0e1eb42c302c5c6ec969762bfeaf639e338', commit_message='Upload dataset', commit_description='', oid='7875a0e1eb42c302c5c6ec969762bfeaf639e338', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Yasser18/QA_clean', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Yasser18/QA_clean'), pr_revision=None, pr_num=None)

# Cell 3

# Cell 4

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantized base model (device placement is delegated to
# device_map="auto" rather than a hard-coded device map) and prepare it for
# k-bit training in one step.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
)

# LoRA adapter settings: rank 8, alpha 16, applied to the q_proj and v_proj
# modules, 5% dropout, no bias terms.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, peft_config)


# Tokenizer for the same checkpoint; the EOS token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

# Cell 5

In [None]:
# Preprocess function for question-answer format
def preprocess(example):
    """Tokenize one question/answer record into fixed-length causal-LM features.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` text fields.

    Returns:
        Dict of 1-D tensors (``input_ids``, ``attention_mask``, ``labels`` and
        whatever else the tokenizer emits), each padded/truncated to 512
        tokens. ``labels`` is a copy of ``input_ids`` with padded positions
        replaced by -100.
    """
    prompt = f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}"

    # End every training example with the EOS token so the fine-tuned model
    # sees where an answer stops; without it the text simply runs into padding.
    # (If the example is longer than max_length, truncation may still drop it.)
    eos = tokenizer.eos_token or ""
    if eos and not prompt.endswith(eos):
        prompt += eos

    tokenized = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    tokenized["labels"] = tokenized["input_ids"].clone()
    # Mask by attention_mask, not by token id: the pad token is the EOS token
    # in this notebook, and the real EOS appended above must stay a label.
    tokenized["labels"][tokenized["attention_mask"] == 0] = -100  # mask out padding tokens

    # return_tensors="pt" adds a leading batch axis of size 1; drop it.
    return {key: val.squeeze(0) for key, val in tokenized.items()}

# Apply preprocessing to every split of the DatasetDict built earlier in the
# notebook (`ds_dict`); the name `dataset` is never defined here, so the
# original call raised NameError on a fresh kernel.
tokenized_dataset = ds_dict.map(preprocess)


# Cell 6

In [None]:
def tokenize(example):
    """Tokenize an instruction/response record, padded/truncated to 512 tokens.

    Reads ``example['instruction']`` and ``example['output']`` and returns the
    tokenizer's encoding for the combined text.

    NOTE(review): nothing in this cell calls this helper (the map call below
    uses ``preprocess``), and the records built earlier in the notebook carry
    ``question``/``answer`` keys rather than ``instruction``/``output`` —
    confirm whether this function is still needed.
    """
    text = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    encoding = tokenizer(text, max_length=512, truncation=True, padding="max_length")
    return encoding
# Map over `ds_dict` (the DatasetDict created in the data-preparation cell);
# `dataset` is not defined anywhere in this notebook.
tokenized_dataset = ds_dict.map(preprocess)


# Cell 7

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Optimisation: batch of 1 with 2 accumulation steps, 3 epochs, fp16.
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=True,
    # Logging: every 10 steps, written under ./logs.
    logging_dir="./logs",
    logging_steps=10,
    # Checkpointing: every 50 steps under ./results, keeping at most 2.
    output_dir="./results",
    save_steps=50,
    save_total_limit=2,
)

# Only the "train" split is passed to the trainer; no evaluation set is given.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()

# Cell 8

In [None]:
# Write the trained model and then its tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./final_model")
print("Model saved to ./final_model")

# Cell 9

In [None]:
from transformers import pipeline

pipe = pipeline("text-generation", model="./final_model", tokenizer="./final_model", device=0)

# Wrap the question in the same "### Question / ### Answer" template that the
# training examples were built with, stopping right after the answer header so
# the model continues with the answer itself.
question = "What should I know about using the APS soldering tool?"
prompt = f"### Question:\n{question}\n\n### Answer:\n"

# return_full_text=False keeps only the newly generated continuation, so the
# prompt is not echoed in the output (and not scored by later metrics).
generated = pipe(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
)
print("Generated Response:\n", generated[0]['generated_text'])

# Cell 9

In [None]:
import nltk
# Download NLTK's 'punkt' resource.
# NOTE(review): the BLEU/ROUGE cell in this notebook splits text with
# str.split() rather than an NLTK tokenizer — confirm this download is needed.
nltk.download('punkt')

# Cell 10 - BLEU & ROUGE

In [None]:
# Cell 10 – Evaluation with BLEU and ROUGE
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

reference = "You should follow the APS soldering tool safety steps and perform maintenance every 6 months."
candidate = generated[0]['generated_text']
# A text-generation pipeline can return the prompt followed by the
# continuation. Score only the model's continuation: prompt tokens are not part
# of the answer and would distort precision/recall against the reference.
if candidate.startswith(prompt):
    candidate = candidate[len(prompt):]

# BLEU Score (with smoothing), computed on whitespace-split tokens
smoothie = SmoothingFunction().method4
bleu_score = sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothie)
print("BLEU Score with smoothing:", bleu_score)

# ROUGE Score (unigram overlap and longest common subsequence, with stemming)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, candidate)

print("\nROUGE Scores:")
for k, v in scores.items():
    print(f"{k}: Precision={v.precision:.4f}, Recall={v.recall:.4f}, F1={v.fmeasure:.4f}")

In [None]:
import shutil

# Pack the contents of ./final_model into final_model.zip.
shutil.make_archive(base_name="final_model", format="zip", root_dir="./final_model")
print("Zipped model as final_model.zip")


In [None]:
# Colab-only helper module: this import fails outside Google Colab.
from google.colab import files
# Hand the archive created by the zip cell to the Colab file-download helper.
files.download("final_model.zip")
