In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# model and the tokenizer always come from the same checkpoint.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

In [3]:
# Load the causal-LM weights for `model_name`. Both the tensor dtype and the
# device placement are delegated to the library via "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Tokenizer for the same checkpoint; generate_from_data uses it to render the
# chat template, encode the prompt and decode the reply.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Field selection for the extraction prompt: generate_from_data requests only
# the keys whose value is exactly True, and inserts the key text verbatim into
# the prompt.
# NOTE(review): "adress" looks like a misspelling of "address" -- confirm
# against the real form's field names before renaming, since the key text is
# what the model sees.
formdata = {
    "name": True,
    "number": True,
    "adress": True
}


In [6]:
# Sample parsed-resume record used as the extraction context.
# NOTE(review): the keys here ("Phone", "Address") are not the same words as
# the requested form fields ("number", "adress"), so the model has to map
# between them itself; the recorded run returned null for "Number".
resumeparsed_data = {
    "Name": "John Doe",
    "Phone": "+1 (620) 130-7224",
    "Address": "447 Sutter St 3rd Floor, San Francisco, CA 94108, United States",
    "Role": "Software Developer"
}


In [7]:
def generate_from_data(formdata, userdata, max_new_tokens=512, temperature=0.1):
    """Ask the chat model to extract the requested form fields from ``userdata``.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Args:
        formdata: Mapping of field name -> flag. Only fields whose flag is
            exactly ``True`` are requested; the field names are inserted into
            the prompt verbatim.
        userdata: Context to extract from. It is interpolated into the prompt
            with an f-string, so a dict appears as its Python ``repr``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A positive value enables sampling
            at that temperature; ``None`` or a non-positive value selects
            greedy decoding instead.

    Returns:
        The decoded model reply as a string with special tokens stripped and
        the prompt tokens removed. The reply is returned as-is (no JSON
        parsing is done here). If no field is requested, ``"{}"`` is returned
        without calling the model.
    """
    selected = [key for key, value in formdata.items() if value is True]

    # Nothing to extract: skip the (expensive) generation and return an empty
    # JSON object instead of prompting the model with an empty field list.
    if not selected:
        return "{}"

    requested_fields = ', '.join(selected)

    prompt = f"""Extract the following fields from this context: {requested_fields}

    Context: "{userdata}"

    Return ONLY a simple JSON object with these fields: {requested_fields}
    If a field isn't found, leave it empty/null.

    Example format:
    {{
        "Name": "John Doe",
        "Phone": "+1234567890"
    }}"""

    messages = [
        {
            "role": "system",
            "content": " You are an AI assistant helping to extract specific data fields from a document.. Extract specific fields from the given context as JSON."
        },
        {
            "role": "user",
            "content": prompt
        }
    ]

    # Render the conversation with the model's chat template and leave the
    # assistant turn open so generation continues from there.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Make the decoding mode explicit: `temperature` only has an effect when
    # sampling is on, so do not depend on the checkpoint's default config.
    sampling = temperature is not None and temperature > 0
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": sampling}
    if sampling:
        generation_kwargs["temperature"] = temperature

    output_batch = model.generate(**model_inputs, **generation_kwargs)

    # generate() returns prompt + completion; drop the prompt prefix from each
    # sequence so only the newly generated tokens are decoded.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids
        in zip(model_inputs.input_ids, output_batch)
    ]

    response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    return response

In [8]:
# Run one extraction over the sample resume record with the sample field
# selection; the result is the model's raw text reply.
sample_response=generate_from_data(formdata, resumeparsed_data)

In [9]:
# Show the raw reply. In the recorded run it is wrapped in a Markdown ```json
# fence, so it has to be unwrapped before it can be parsed as JSON.
sample_response

'```json\n{\n    "Name": "John Doe",\n    "Number": null,\n    "Adress": "447 Sutter St 3rd Floor, San Francisco, CA 94108, United States"\n}\n```'