Generating modular CVs and cover letters (CLs) using the unique skills, education, country, and years of experience extracted from the original bios.

In [2]:
import os
from pathlib import Path
import random
import openai
import pandas as pd
from dotenv import load_dotenv
from transformers import pipeline


In [3]:
# -------------------
# CONFIG
# -------------------
# Professions to generate for; each name is also used to build the
# per-profession input and output paths further down.
PROFESSIONS = [
    "physicians",
    "nurses",
    "surgeons",
]
# Gender labels substituted into the prompt templates and stored with each row.
GENDERS = [
    "male",
    "female",
]
# Number of CV + cover-letter pairs generated per gender for each profession.
N_PER_GENDER = 25

In [4]:
import torch

# Report whether PyTorch can see a CUDA device; this cell only prints.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("⚠ CUDA not available. Using CPU.")
else:
    print(f"✅ CUDA is available. Number of GPUs: {torch.cuda.device_count()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")


✅ CUDA is available. Number of GPUs: 1
GPU Name: NVIDIA T500


In [11]:
# Read the variables defined in the local dotenv file into the environment.
load_dotenv(".env.job")


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of an obscure error on the first API call.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "check that .env.job exists and defines it."
        )
    return value


# Load credentials into the module-level openai settings (Azure mode).
openai.api_type = "azure"
openai.api_key = _require_env("AZURE_CHTDP_SWED_ENDPOINT_API_KEY")
openai.api_base = _require_env("AZURE_CHTDP_SWED_ENDPOINT")
openai.api_version = "2023-10-01-preview"
# Azure deployment name, passed as `engine=` when requesting completions.
deployment_gpt4o = _require_env("AZURE_CHAT_DEPLOYMENT_NAME")



In [12]:
# Model label. NOTE(review): not referenced by any cell visible here -- requests
# are routed by the Azure deployment name in `deployment_gpt4o`; confirm whether
# this constant is still needed.
MODEL = "gpt-4o"


In [13]:
# HELPER FUNCTIONS
# -------------------
def load_text(file_path):
    """Read the file at ``file_path`` as UTF-8 and return its entire contents as a string."""
    with open(file_path, encoding="utf-8") as handle:
        return handle.read()

In [15]:
def generate_text(prompt, max_tokens=1024, temperature=0.7):
    """Request one chat completion for ``prompt`` and return the reply text.

    A fixed system message instructing the model to produce strictly
    profession-specific CVs and cover letters is sent ahead of ``prompt``.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Forwarded unchanged as the request's ``max_tokens``.
        temperature: Forwarded unchanged as the request's ``temperature``.

    Returns:
        The content of the first returned choice, with surrounding whitespace
        stripped.
    """
    system_message = {
        "role": "system",
        "content": """You generate CVs and cover letters that are STRICTLY profession-specific.
                You MUST:
                - Follow the job ad EXACTLY.
                - Include all unique words.
                - Use tools, terminology, tasks, and responsibilities unique to that profession.
                - Avoid generic CV patterns.
                - Avoid repeating content from other professions.
                - Make all experience, skills, and achievements relevant ONLY to the given profession.

                Any generic, cross-profession, or irrelevant content is forbidden. 
                """,
    }
    user_message = {"role": "user", "content": prompt}

    completion = openai.ChatCompletion.create(
        engine=deployment_gpt4o,  # Azure deployment name
        messages=[system_message, user_message],
        max_tokens=max_tokens,
        temperature=temperature,
        n=1,
        stop=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [17]:

# -------------------
# MAIN GENERATION LOOP
# -------------------
# Directory holding the shared prompt templates. It defaults to the original
# machine-specific location; set CV_CL_TEMPLATE_DIR to run the notebook elsewhere.
TEMPLATE_DIR = Path(
    os.environ.get("CV_CL_TEMPLATE_DIR", r"C:\Users\cx3garg\Repo\Test\CV_CL")
)

# The templates are identical for every profession, so read them once up front.
cv_template = load_text(TEMPLATE_DIR / "cv_template.txt")
cl_template = load_text(TEMPLATE_DIR / "cl_template.txt")

for profession in PROFESSIONS:
    base_dir = f"professions/{profession}/"

    # Per-profession inputs substituted into both templates.
    job_ad = load_text(f"{base_dir}job_ad.txt")
    unique_words = load_text(f"{base_dir}unique_words.txt")

    out_dir = Path(f"modular_generated/{profession}")
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = []  # one dict per generated CV + cover-letter pair for this profession
    for gender in GENDERS:
        # The prompts depend only on (gender, profession), not on the sample
        # index, so they are built once per gender instead of once per sample.
        prompt_fields = {
            "gender": gender,
            "profession": profession,
            "job_ad": job_ad,
            "unique_words": unique_words,
        }
        cv_prompt = cv_template.format(**prompt_fields)
        cl_prompt = cl_template.format(**prompt_fields)

        for i in range(N_PER_GENDER):
            # Two separate requests per sample: one for the CV, one for the letter.
            cv_text = generate_text(cv_prompt, temperature=0.4)
            cl_text = generate_text(cl_prompt, temperature=0.4)

            # Combine CV and cover letter into one field.
            combined_text = f"CV:\n{cv_text}\n\nCover Letter:\n{cl_text}"

            # `id` restarts at 1 for each gender, so a row is identified by
            # the (id, gender) pair rather than by id alone.
            rows.append({
                "id": f"{i+1}",
                "gender": gender,
                "cv_cover_letter": combined_text
            })

            print(f"Generated: {profession} - {gender} #{i+1}")

    # Save one CSV per profession once both genders have been generated.
    df = pd.DataFrame(rows)
    df.to_csv(out_dir/f"{profession}.csv", index=False, encoding="utf-8")
    print(f"Saved CS for profession: {profession}")



Generated: physicians - male #1
Generated: physicians - male #2
Generated: physicians - male #3
Generated: physicians - male #4
Generated: physicians - male #5
Generated: physicians - male #6
Generated: physicians - male #7
Generated: physicians - male #8
Generated: physicians - male #9
Generated: physicians - male #10
Generated: physicians - male #11
Generated: physicians - male #12
Generated: physicians - male #13
Generated: physicians - male #14
Generated: physicians - male #15
Generated: physicians - male #16
Generated: physicians - male #17
Generated: physicians - male #18
Generated: physicians - male #19
Generated: physicians - male #20
Generated: physicians - male #21
Generated: physicians - male #22
Generated: physicians - male #23
Generated: physicians - male #24
Generated: physicians - male #25
Generated: physicians - female #1
Generated: physicians - female #2
Generated: physicians - female #3
Generated: physicians - female #4
Generated: physicians - female #5
Generated: phy

In [10]:
# The generation loop already writes one CSV per profession. After that loop,
# `rows` and `out_dir` hold only the LAST profession's data and directory, so
# re-saving them under every profession name would create mislabelled files.
# This cell therefore only checks each profession's own CSV and reports on it.
for profession in PROFESSIONS:
    csv_path = Path(f"modular_generated/{profession}") / f"{profession}.csv"
    if csv_path.exists():
        n_rows = len(pd.read_csv(csv_path))
        print(f"Saved CSV for profession: {profession} ({n_rows} rows)")
    else:
        print(f"Missing CSV for profession: {profession} (expected at {csv_path})")

Saved CS for profession: physicians
Saved CS for profession: nurses
Saved CS for profession: surgeons


In [22]:
# Spot-check the generated physicians file. Only the final expression of a
# cell is rendered, so the head() result is not shown; the shape is.
physicians_csv = "modular_generated/physicians/physicians.csv"
df_cc = pd.read_csv(physicians_csv)
df_cc.head()
df_cc.shape


(50, 3)