In [None]:
import pandas as pd
import torch
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# --- Inputs ---
# The small local inputs are read first so that a missing file or a missing
# column is reported before the (much slower) model load below.

# Load the CSV file containing sample bios
bio_df = pd.read_csv("accountant_samples.csv") #path to Bios sample 

# The generation cells read these three columns from every row; fail fast
# with a clear message if the CSV does not provide them.
required_columns = {"hard_text", "profession", "gender"}
missing_columns = required_columns - set(bio_df.columns)
if missing_columns:
    raise KeyError(
        f"accountant_samples.csv is missing required columns: {sorted(missing_columns)}"
    )

# Read the job ad prompt from a text file. The encoding is given explicitly
# (matching the UTF-8 used when the ads are written) so decoding does not
# depend on the platform default.
with open("prompt_template_jobAD.txt", "r", encoding="utf-8") as f:  #path to job ad prompt text file 
    job_ad_prompt = f.read()

# --- Model ---
# Load the Llama-2 chat model and its tokenizer
model_id = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map='auto' leaves device placement to the loader; weights are loaded as float16.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.float16)

# Wrap the loaded model and tokenizer in a text-generation pipeline.
# No `device` argument is passed: placement was already requested via
# device_map='auto' when the model was loaded.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# --- Generate Outputs ---
def format_prompt(template, bio, profession, gender):
    """Fill the prompt template and wrap it in Llama-2 chat instruction markers.

    The Llama-2-7b-chat-hf model uses a chat template, so the filled prompt is
    enclosed in ``[INST] ... [/INST]``.

    Args:
        template: Format string filled via ``str.format`` with the keyword
            fields ``bio``, ``gender`` and ``profession``.
        bio: Value substituted for ``{bio}``.
        profession: Value substituted for ``{profession}``.
        gender: Value substituted for ``{gender}``.

    Returns:
        The filled template as ``"<s>[INST] <filled template> [/INST]"``.
    """
    filled_template = template.format(bio=bio, gender=gender, profession=profession)
    return f"<s>[INST] {filled_template} [/INST]"



In [None]:
# Generate text for each bio (max_new_tokens=512) and save every job ad to
# its own file as soon as it has been generated.
max_new_tokens = 512
output_dir = "job_ads/jobADs_accountant"

# Create the subfolder up front (no-op if it already exists) so that each ad
# can be written immediately; a failure part-way through the run then keeps
# all ads completed so far instead of losing them.
os.makedirs(output_dir, exist_ok=True)

generated_texts = []
# ad_number counts rows from 1 and is used for both the progress message and
# the output file name, so the two always agree.
for ad_number, (_, row) in enumerate(bio_df.iterrows(), start=1):
    bio = row["hard_text"]      # bio text column
    profession = row["profession"]
    gender = row["gender"]
    prompt = format_prompt(job_ad_prompt, bio, profession, gender)

    # Greedy decoding (do_sample=False). `temperature` is a sampling-only
    # setting, so it is not passed here.
    # NOTE(review): the text-generation pipeline returns the prompt together
    # with the completion unless return_full_text=False is passed -- confirm
    # whether the saved ads are meant to include the prompt.
    response = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    generated_texts.append({
        "profession": profession,
        "bio": bio,
        "generated_text": response
    })
    print(f"\n--- Result for Bio #{ad_number} ---\n{response}\n")

    # Save this job ad as a separate file inside the folder
    filename = f"{output_dir}/job_ad_{ad_number}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"\n{response}\n")
        f.write("\n" + "-"*80 + "\n\n")

In [None]:
# Generate text for each bio (max_new_tokens=2048) and save every job ad to
# its own file as soon as it has been generated.
max_new_tokens = 2048
output_dir = "job_ads/jobADs_len2048"

# Create the subfolder up front (no-op if it already exists) so that each ad
# can be written immediately; a failure part-way through the run then keeps
# all ads completed so far instead of losing them.
os.makedirs(output_dir, exist_ok=True)

generated_texts = []
# ad_number counts rows from 1 and is used for both the progress message and
# the output file name, so the two always agree.
for ad_number, (_, row) in enumerate(bio_df.iterrows(), start=1):
    bio = row["hard_text"]      # bio text column
    profession = row["profession"]
    gender = row["gender"]
    prompt = format_prompt(job_ad_prompt, bio, profession, gender)

    # Greedy decoding (do_sample=False). `temperature` is a sampling-only
    # setting, so it is not passed here.
    # NOTE(review): the text-generation pipeline returns the prompt together
    # with the completion unless return_full_text=False is passed -- confirm
    # whether the saved ads are meant to include the prompt.
    response = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    generated_texts.append({
        "profession": profession,
        "bio": bio,
        "generated_text": response
    })
    print(f"\n--- Result for Bio #{ad_number} ---\n{response}\n")

    # Save this job ad as a separate file inside the folder
    filename = f"{output_dir}/job_ad_{ad_number}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"\n{response}\n")
        f.write("\n" + "-"*80 + "\n\n")