In [None]:
import torch
# Environment check. The device name is queried only when a CUDA device
# exists: torch.cuda.get_device_name(0) raises on a CPU-only machine, which
# would turn this diagnostic cell into a crash instead of a report.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("Device:", torch.cuda.get_device_name(0) if cuda_available else "none (CPU only)")

In [None]:
!pip install transformers==4.44.0
!pip install peft==0.13.0
!pip install accelerate==0.33.0
!pip install trl==0.10.1
!pip install bitsandbytes==0.43.1
!pip install datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
import json
from datasets import Dataset

# This cell is written for GPU training (4-bit quantized load + LoRA). Stop
# early with a readable message instead of letting get_device_name(0) raise
# on a machine that has no CUDA device.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA device found: this QLoRA fine-tuning cell requires a GPU.")
print("CUDA:", torch.cuda.is_available(), torch.cuda.get_device_name(0))

# 1) Load base model (GPU + 4-bit quantization)
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Only fall back to EOS as the padding token when the tokenizer does not
# define one, so an existing dedicated pad token is never overwritten
# (same rule the inference cells apply when they load the tokenizer).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

# 2) Configure LoRA
# Imported here so this cell stays self-contained.
from peft import prepare_model_for_kbit_training

# Prepare the quantized base model for training before attaching adapters,
# as documented by PEFT for k-bit (4/8-bit) models.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Adapters are attached to the attention and MLP projection layers.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 3) Load training data
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
SEED_PATH = r"C:\Users\Junsoo_Hyun\Jupiter\FYP\Model\skill_extraction_llama32_1B_synthetic_filtered_v2.jsonl"

# One JSON object per non-blank line (JSONL).
records = []
with open(SEED_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

# An empty file would otherwise only surface later as an obscure trainer error.
if not records:
    raise ValueError(f"No training records found in {SEED_PATH}")

dataset = Dataset.from_list(records)

def format_example(ex):
    """Render training text(s) in the prompt layout used at inference time.

    TRL's SFTTrainer applies ``formatting_func`` through a batched ``map``, so
    ``ex`` is normally a mapping of column name -> list of values and one text
    per row must be returned. Formatting the columns directly (as if ``ex``
    were a single row) would embed the ``repr`` of whole lists into one string
    and collapse every batch into a single training example.

    Args:
        ex: Either a batch (``ex["instruction"]`` is a list/tuple, with
            ``ex["input"]`` and ``ex["output"]`` of matching length) or a
            single row whose ``instruction``/``input``/``output`` are scalars.

    Returns:
        A list of formatted strings: one per row for a batch, or a
        one-element list for a single row.
    """
    # NOTE(review): no end-of-sequence token is appended to the text; confirm
    # whether the tokenizer adds one, otherwise the model is never shown where
    # the JSON answer ends.
    def render(instruction, student_text, output):
        return (
            f"{instruction}\n\n"
            f"Student text:\n{student_text}\n\n"
            f"JSON: {output}"
        )

    instructions = ex["instruction"]
    if isinstance(instructions, (list, tuple)):
        return [
            render(instruction, student_text, output)
            for instruction, student_text, output in zip(instructions, ex["input"], ex["output"])
        ]
    return [render(instructions, ex["input"], ex["output"])]

# 4) Training configuration
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
OUTPUT_DIR = r"C:\Users\Junsoo_Hyun\Jupiter\FYP\Model\skill-extractor-1B"

# The 4-bit layers are configured with a bfloat16 compute dtype, so prefer
# bf16 mixed precision when the GPU supports it and only fall back to fp16
# otherwise (exactly one of the two flags is enabled).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=1,
    save_strategy="epoch",
    bf16=use_bf16,
    fp16=not use_bf16,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # Pass the tokenizer prepared above so training uses the same tokenizer
    # (including its padding token) that is saved next to the adapter below.
    tokenizer=tokenizer,
    formatting_func=format_example,
)

trainer.train()

# Persist the LoRA adapter and the tokenizer side by side; the inference
# cells load both from this directory.
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("GPU QLoRA fine-tuning complete.")

In [None]:
# skill_extractor_v1.py
import torch
import json
import ast
import difflib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Base checkpoint and the directory holding the fine-tuned LoRA adapter
# (and the tokenizer saved with it).
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
ADAPTER_DIR = r"C:\Users\Junsoo_Hyun\Jupiter\FYP\Model\skill-extractor-1B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The model below is explicitly placed on "cuda". Stop early with a readable
# message instead of letting get_device_name(0) raise when no GPU is present.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA device found: the skill extractor is loaded onto a GPU.")
print("CUDA:", torch.cuda.is_available(), torch.cuda.get_device_name(0))

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map={"": "cuda"},
)

# Attach the fine-tuned adapter on top of the quantized base model and switch
# to evaluation mode for inference.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

# Master skill list. Lookups are case-insensitive: lowercase name -> canonical
# spelling as it appears in the CSV.
skills_df = pd.read_csv("skills_master.csv")
skills_df["skill_name"] = skills_df["skill_name"].astype(str)
master_names = skills_df["skill_name"].tolist()
master_names_lower = [n.lower() for n in master_names]
lower_to_canonical = {n.lower(): n for n in master_names}

def parse_model_output(text: str):
    """Best-effort parse of a model completion into a list of skill dicts.

    Strategy:
      1. Try to parse the whole completion as a list, first as JSON and then
         as a Python literal. A list of dicts is returned as-is; a list of
         strings is converted to ``{"skill_name": s, "explanation": ""}``.
      2. Otherwise scan for top-level ``{...}`` objects and collect every one
         that parses to a dict. Each object is decoded as JSON in place
         (which copes with braces inside string values); if that fails, the
         brace-balanced span is parsed as a Python literal so single-quoted
         objects followed by extra text are still recovered.

    Args:
        text: Raw completion text; ``None`` or empty yields ``[]``.

    Returns:
        A list of dicts (possibly empty). Unparseable fragments are skipped,
        and scanning stops at an object that is never closed (truncated
        output).
    """
    # Errors json / ast.literal_eval raise on malformed or oversized input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    s = (text or "").strip()

    # 1) Whole completion is a single list.
    for parser in (json.loads, ast.literal_eval):
        try:
            obj = parser(s)
        except parse_errors:
            continue
        if isinstance(obj, list):
            if all(isinstance(x, dict) for x in obj):
                return obj
            if all(isinstance(x, str) for x in obj):
                return [{"skill_name": name, "explanation": ""} for name in obj]

    n = len(s)

    def closing_brace(start):
        """Index of the brace that balances s[start], or None if unbalanced."""
        depth = 0
        for j in range(start, n):
            if s[j] == "{":
                depth += 1
            elif s[j] == "}":
                depth -= 1
                if depth == 0:
                    return j
        return None

    # 2) Salvage individual objects from surrounding text.
    decoder = json.JSONDecoder()
    objs = []
    i = 0
    while i < n:
        if s[i] != "{":
            i += 1
            continue
        try:
            # Proper JSON decoding from position i; also reports where the
            # object ends, so braces inside strings cannot confuse the scan.
            obj, next_i = decoder.raw_decode(s, i)
        except parse_errors:
            end = closing_brace(i)
            if end is None:
                # Truncated object: nothing after it can be complete either.
                break
            try:
                obj = ast.literal_eval(s[i:end + 1])
            except parse_errors:
                obj = None
            next_i = end + 1
        if isinstance(obj, dict):
            objs.append(obj)
        i = next_i
    return objs

def map_to_master(name: str, cutoff: float = 0.6):
    """Resolve a model-proposed skill name to its canonical master-list name.

    The name is trimmed and lowercased, then looked up exactly; if there is no
    exact hit, the single closest master name (difflib similarity of at least
    ``cutoff``) is used.

    Args:
        name: Skill name as produced by the model; falsy values yield ``None``.
        cutoff: Minimum difflib similarity for a fuzzy match.

    Returns:
        The canonical skill name, or ``None`` when nothing matches.
    """
    normalized = str(name).strip().lower() if name else ""
    if not normalized:
        return None

    # Exact (case-insensitive) hit wins over any fuzzy candidate.
    if normalized in lower_to_canonical:
        return lower_to_canonical[normalized]

    closest = difflib.get_close_matches(normalized, master_names_lower, n=1, cutoff=cutoff)
    return lower_to_canonical[closest[0]] if closest else None

def extract_skills(student_text: str) -> str:
    """Extract skills from free text and return them as a JSON array string.

    Builds the instruction prompt, greedily generates up to 256 new tokens
    with the fine-tuned model, parses the completion with
    ``parse_model_output`` and maps every proposed skill onto the master list
    with ``map_to_master``. Skills that cannot be mapped are dropped and each
    canonical skill is reported at most once.

    Args:
        student_text: Free-form text describing the student's activities.

    Returns:
        A JSON string encoding a list of objects with ``skill_name`` and
        ``explanation`` keys, plus ``skill_id`` when ``skills_df`` has that
        column. The list is empty when nothing could be parsed or mapped.
    """
    instruction = "Extract relevant skills from the student text and return a JSON array with fields: skill_name and explanation."
    prompt = instruction + "\n\nStudent text:\n" + student_text + "\n\nJSON: "
    # Send the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            # Explicit padding id so generate() does not have to guess one.
            pad_token_id=tokenizer.pad_token_id,
            # Greedy decoding ignores these; presumably kept at neutral values
            # to override sampling defaults shipped with the checkpoint.
            temperature=1.0,
            top_p=1.0,
        )
    # Keep only the newly generated tokens (drop the echoed prompt).
    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    completion = tokenizer.decode(gen_ids, skip_special_tokens=True)
    raw_list = parse_model_output(completion)
    if not raw_list:
        print("RAW COMPLETION (for debug):", completion)

    mapped = []
    seen = set()
    for item in raw_list:
        if isinstance(item, dict):
            raw_name = item.get("skill_name", "")
            # A JSON null explanation must become "", not the string "None".
            raw_explanation = item.get("explanation")
            explanation = "" if raw_explanation is None else str(raw_explanation).strip()
        else:
            raw_name = str(item)
            explanation = ""

        canonical = map_to_master(raw_name)
        if not canonical:
            continue
        if not explanation:
            explanation = "This skill is relevant to the described activities in the student text."
        if canonical in seen:
            continue
        seen.add(canonical)

        # Attach the master-list id when the CSV provides one.
        row = skills_df.loc[skills_df["skill_name"] == canonical]
        skill_id = None
        if not row.empty and "skill_id" in row.columns:
            skill_id = str(row.iloc[0]["skill_id"])

        out_obj = {"skill_name": canonical, "explanation": explanation}
        if skill_id is not None:
            out_obj["skill_id"] = skill_id
        mapped.append(out_obj)
    return json.dumps(mapped, ensure_ascii=False)

# Smoke test: run the extractor on a short sample paragraph and show the
# resulting JSON string.
test_text = (
    "Since childhood, I have enjoyed solving logic puzzles and brain teasers. "
    "I often spend weekends working through puzzle books and online logic games, "
    "trying to find patterns and efficient solutions."
)

extracted_json = extract_skills(test_text)
print(extracted_json)


In [None]:
# skill_extractor_final.py
import torch
import json
import difflib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Base checkpoint and the directory holding the fine-tuned LoRA adapter
# (and the tokenizer saved with it).
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
ADAPTER_DIR = r"C:\Users\Junsoo_Hyun\Jupiter\FYP\Model\skill-extractor-1B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The model below is explicitly placed on "cuda". Stop early with a readable
# message instead of letting get_device_name(0) raise when no GPU is present.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA device found: the skill extractor is loaded onto a GPU.")
print("CUDA:", torch.cuda.is_available(), torch.cuda.get_device_name(0))

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map={"": "cuda"},
)

# Attach the fine-tuned adapter on top of the quantized base model and switch
# to evaluation mode for inference.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

skills_df = pd.read_csv("skills_master.csv")
skills_df["skill_name"] = skills_df["skill_name"].astype(str)

# The description column is optional; missing values become empty strings so
# the concatenation below always yields text.
if "skill_description" in skills_df.columns:
    skills_df["skill_description"] = skills_df["skill_description"].fillna("").astype(str)
else:
    skills_df["skill_description"] = ""

# Text each skill is represented by in the TF-IDF index: name + description.
skills_df["text_for_tfidf"] = skills_df["skill_name"] + " " + skills_df["skill_description"]

# Master skill names (used to filter final outputs)
master_names = skills_df["skill_name"].tolist()

# TF-IDF index over all skills; row i of skill_tfidf corresponds to row i of
# skills_df and is used to shortlist candidates for a given student text.
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
skill_tfidf = vectorizer.fit_transform(skills_df["text_for_tfidf"])

def get_candidate_indices(student_text: str, top_k: int = 60, min_score: float = 0.02):
    """Shortlist master skills by TF-IDF cosine similarity to the student text.

    Skills are visited from most to least similar; collection stops at the
    first skill scoring below ``min_score`` or once ``top_k`` skills have been
    collected. If no skill reaches ``min_score``, the ``top_k`` best-ranked
    skills are returned regardless of score.

    Args:
        student_text: Text to compare against the skill index.
        top_k: Maximum number of candidates to collect.
        min_score: Minimum cosine similarity for a candidate.

    Returns:
        ``(indices, scores)``: positions into ``skills_df`` in ranking order,
        and the similarity score of every skill (indexed by position).
    """
    query_vec = vectorizer.transform([student_text])
    scores = cosine_similarity(query_vec, skill_tfidf)[0]
    ranking = scores.argsort()[::-1]  # best match first

    picked = []
    for position in ranking:
        if scores[position] < min_score:
            # Ranking is descending, so everything after this is below too.
            break
        picked.append(position)
        if len(picked) >= top_k:
            break

    # Nothing cleared the threshold: fall back to the best-ranked skills.
    if not picked:
        picked = ranking[:top_k]
    return picked, scores

def normalize_json_array(text: str):
    """Collect every top-level ``{...}`` object found in a model completion.

    Each object is decoded as JSON in place (which copes with braces inside
    string values). If that fails, the brace-balanced span is parsed as a
    Python literal so single-quoted objects are still recovered. Fragments
    that parse to anything other than a dict are skipped.

    Args:
        text: Raw completion text; ``None`` or empty yields ``[]``.

    Returns:
        A list of dicts in order of appearance (possibly empty). Scanning
        stops at an object that is never closed (truncated output).
    """
    import ast  # local import: this cell's import list does not include ast

    # Errors json / ast.literal_eval raise on malformed or oversized input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    s = (text or "").strip()
    n = len(s)

    def closing_brace(start):
        """Index of the brace that balances s[start], or None if unbalanced."""
        depth = 0
        for j in range(start, n):
            if s[j] == "{":
                depth += 1
            elif s[j] == "}":
                depth -= 1
                if depth == 0:
                    return j
        return None

    decoder = json.JSONDecoder()
    objs = []
    i = 0
    while i < n:
        if s[i] != "{":
            i += 1
            continue
        try:
            # Proper JSON decoding from position i; also reports where the
            # object ends, so braces inside strings cannot confuse the scan.
            obj, next_i = decoder.raw_decode(s, i)
        except parse_errors:
            end = closing_brace(i)
            if end is None:
                # Truncated object: nothing after it can be complete either.
                break
            try:
                obj = ast.literal_eval(s[i:end + 1])
            except parse_errors:
                obj = None
            next_i = end + 1
        if isinstance(obj, dict):
            objs.append(obj)
        i = next_i
    return objs

def map_to_candidate(name: str, cand_names_lower, lower_to_cand, cutoff: float = 0.6):
    """Resolve a model-proposed skill name to one of the candidate skills.

    The name is trimmed and lowercased, then looked up exactly; if there is no
    exact hit, the single closest candidate (difflib similarity of at least
    ``cutoff``) is used.

    Args:
        name: Skill name as produced by the model; falsy values yield ``None``.
        cand_names_lower: Lowercased candidate names to fuzzy-match against.
        lower_to_cand: Mapping of lowercased candidate name -> original name.
        cutoff: Minimum difflib similarity for a fuzzy match.

    Returns:
        The candidate's original name, or ``None`` when nothing matches.
    """
    normalized = str(name).strip().lower() if name else ""
    if not normalized:
        return None

    # Exact (case-insensitive) hit wins over any fuzzy candidate.
    if normalized in lower_to_cand:
        return lower_to_cand[normalized]

    closest = difflib.get_close_matches(normalized, cand_names_lower, n=1, cutoff=cutoff)
    return lower_to_cand[closest[0]] if closest else None

def extract_skills(
    student_text: str,
    top_k_candidates: int = 60,
    min_candidate_score: float = 0.02,
    min_skill_score: float = 0.05,
    max_return_skills: int = 5,
) -> str:
    """Extract master-list skills from free text as a JSON array string.

    Pipeline: shortlist candidate skills by TF-IDF similarity, show the
    shortlist to the fine-tuned model as the allowed ``skill_name`` values,
    greedily generate up to 256 new tokens, then keep only returned skills
    that map to a candidate, carry a non-empty explanation, exist in the
    master list and reach ``min_skill_score``. Each skill is reported once.

    Args:
        student_text: Free-form text describing the student's activities.
        top_k_candidates: Maximum number of candidate skills put in the prompt.
        min_candidate_score: Minimum TF-IDF similarity for a candidate.
        min_skill_score: Minimum TF-IDF similarity for a skill to be returned.
        max_return_skills: Maximum number of skills in the result.

    Returns:
        A JSON string encoding a list of objects with ``skill_name`` and
        ``explanation`` keys, plus ``skill_id`` when the skills table has that
        column. The list is empty when nothing survives the filters.
    """
    idx, scores = get_candidate_indices(student_text, top_k=top_k_candidates, min_score=min_candidate_score)
    cand_df = skills_df.iloc[idx].reset_index(drop=True)

    cand_names = cand_df["skill_name"].tolist()
    cand_names_lower = [n.lower() for n in cand_names]
    lower_to_cand = {n.lower(): n for n in cand_names}
    cand_scores = scores[idx]

    # Best similarity per skill name (names may repeat in the master list).
    score_by_name = {}
    for name, sc in zip(cand_names, cand_scores):
        if name not in score_by_name or sc > score_by_name[name]:
            score_by_name[name] = sc

    instruction = "Extract relevant skills from the student text and return a JSON array with fields: skill_name and explanation."
    skills_list_str = "\n".join(f"- {name}" for name in cand_names)

    prompt = (
        instruction
        + "\nReturn at most "
        + str(max_return_skills)
        + " skills. Only include skills that are clearly demonstrated in the student text. You must choose skill_name values only from the allowed list.\n\nAllowed skill_name values:\n"
        + skills_list_str
        + "\n\nStudent text:\n"
        + student_text
        + "\n\nJSON: "
    )

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            # Explicit padding id so generate() does not have to guess one.
            pad_token_id=tokenizer.pad_token_id,
        )

    # Keep only the newly generated tokens (drop the echoed prompt).
    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    completion = tokenizer.decode(gen_ids, skip_special_tokens=True)
    raw_list = normalize_json_array(completion)

    if not raw_list:
        print("RAW COMPLETION (debug):", completion)

    mapped = []
    seen = set()
    for item in raw_list:
        if not isinstance(item, dict):
            continue

        raw_name = item.get("skill_name", "")
        # A JSON null explanation must count as missing; str(None) would turn
        # it into the non-empty string "None" and slip past the check below.
        raw_explanation = item.get("explanation")
        explanation = "" if raw_explanation is None else str(raw_explanation).strip()

        canonical = map_to_candidate(raw_name, cand_names_lower, lower_to_cand)
        if not canonical:
            continue
        if not explanation:
            continue

        # Keep only skills that exist in the master list
        if canonical not in master_names:
            continue

        sc = score_by_name.get(canonical, 0.0)
        if sc < min_skill_score:
            continue

        if canonical in seen:
            continue
        seen.add(canonical)

        # Attach the skill id when the skills table provides one.
        row = cand_df.loc[cand_df["skill_name"] == canonical]
        skill_id = None
        if not row.empty and "skill_id" in row.columns:
            skill_id = str(row.iloc[0]["skill_id"])

        out_obj = {"skill_name": canonical, "explanation": explanation}
        if skill_id is not None:
            out_obj["skill_id"] = skill_id

        mapped.append(out_obj)

        if len(mapped) >= max_return_skills:
            break

    return json.dumps(mapped, ensure_ascii=False)

# Smoke test: run the candidate-filtered extractor on a short sample
# paragraph and show the resulting JSON string.
test_text = (
    "Since childhood, I have enjoyed solving logic puzzles and brain teasers. "
    "I often spend weekends working through puzzle books and online logic games, "
    "trying to find patterns and efficient solutions."
)

extracted_json = extract_skills(test_text)
print(extracted_json)