In [13]:

import pandas as pd

# تحميل الملف
df = pd.read_csv(r"C:\Users\zaszs\Desktop\New folder (2)\archive\UpdatedResumeDataSet.csv")

# عرض أسماء الأعمدة
print(df.columns)


Index(['Category', 'Resume'], dtype='object')


In [14]:
import re

# تنظيف النصوص
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    return text.lower().strip()

# تنظيف عمود السيرة الذاتية
df["cleaned_resume"] = df["Resume"].apply(clean_text)
df["Category"] = df["Category"].apply(str.strip)

# استعلامات المستخدم مع الفئة المطابقة
queries = {
    "python developer with experience in machine learning": "Python Developer",
    "web designer skilled in HTML and CSS": "Web Designing",
    "expert in HR and recruitment": "HR"
}

# تجهيز بيانات التعليم (instruction → output)
instruction_data = []
for query, label in queries.items():
    matched = df[df["Category"] == label].sample(1).iloc[0]
    instruction_data.append({
        "instruction": query,
        "output": matched["cleaned_resume"]
    })

# تحويل لبيانات تدريبية
from sklearn.model_selection import train_test_split
df_instruction = pd.DataFrame(instruction_data)
train_df, val_df = train_test_split(df_instruction, test_size=0.2)


In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset

# تحميل النموذج والـ tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# تطبيق LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config)

# تحميل البيانات بتنسيق HuggingFace
dataset = load_dataset("json", data_files={"train": "train.json", "validation": "val.json"})

# ترميز البيانات
def tokenize(example):
    inputs = tokenizer(example["instruction"], padding="max_length", truncation=True, max_length=256)
    targets = tokenizer(example["output"], padding="max_length", truncation=True, max_length=256)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True)

# تجميع البيانات
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# إعدادات التدريب
training_args = Seq2SeqTrainingArguments(
    output_dir="./lora_finetuned_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=False  # اجعله False إذا لم يكن لديك GPU يدعم الـ float16
)


# تدريب النموذج
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


Generating train split: 2 examples [00:00, 20.07 examples/s]
Generating validation split: 1 examples [00:00, 99.99 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 33.64 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 39.87 examples/s]
  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


TrainOutput(global_step=5, training_loss=20.46027374267578, metrics={'train_runtime': 35.5027, 'train_samples_per_second': 0.282, 'train_steps_per_second': 0.141, 'total_flos': 3437376307200.0, 'train_loss': 20.46027374267578, 'epoch': 5.0})

In [15]:
import json

def to_lora_format(df):
    return [
        {"instruction": row["instruction"], "input": "", "output": row["output"]}
        for _, row in df.iterrows()
    ]

with open("train.json", "w") as f:
    json.dump(to_lora_format(train_df), f, indent=2)

with open("val.json", "w") as f:
    json.dump(to_lora_format(val_df), f, indent=2)


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# تحميل النموذج والـ tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, r"C:\Users\zaszs\Desktop\New folder (2)\lora_finetuned_model\checkpoint-5")

def search_resume(query, max_new_tokens=128):
    inputs = tokenizer(query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# مثال استعلام
query = "looking for a frontend developer who knows React and JavaScript"
result = search_resume(query)
print(result)


i need a frontend developer who knows React and JavaScript


In [10]:
import json

queries = [
    "python developer with Django experience",
    "data scientist skilled in machine learning and SQL",
    "marketing specialist fluent in social media"
]

results = []

for q in queries:
    response = search_resume(q)
    results.append({
        "query": q,
        "matched_resume_summary": response
    })

# حفظ النتائج
with open("search_results.json", "w") as f:
    json.dump(results, f, indent=2)
