In [1]:
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel

#similarity
import regex as re
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment
import numpy as np

#counting
from tqdm import tqdm

from datetime import datetime
import os
from concurrent.futures import ThreadPoolExecutor

In [8]:
# Generate the folder name with current date and time
folder_name = 'results/task_match_'+datetime.now().strftime("%d%m_%H%M")+"/"

# Create the folder if it does not exist
os.makedirs(folder_name, exist_ok=True)

### Preprocess data and sampling

In [2]:
# read dataset and drop columns
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()
job_statements = job_statements.drop(labels=["incumbents responding","date","domain source"], axis=1).rename(columns={"o*net-soc code":"code", "task type":"type", "task id": "id", "task":"ref_task"})
job_statements = job_statements[job_statements["type"].notna()]
job_statements["ind"] = job_statements["code"].str[:2]
job_statements = job_statements.groupby("title").agg({"ref_task":list, "ind": "first"}).reset_index().sort_values("ind")
job_statements
print(len(job_statements))
sampled_occupation = job_statements.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1) #43 samples
sampled_occupation
print(len(sampled_occupation))


879
43


In [5]:
# Save 3 rows of the sampled_occupation DataFrame as JSON
sampled_occupation.head(3).to_json("sampled_occupation_sample.json", orient="records", indent=4)

In [7]:

sampled_occupation

Unnamed: 0,title,ref_task,ind
478,Lodging Managers,[Answer inquiries pertaining to hotel policies...,11
778,Spa Managers,"[Respond to customer inquiries or complaints.,...",11
834,Training and Development Managers,[Analyze training needs to develop new trainin...,11
203,Customs Brokers,[Prepare and process import and export documen...,13
386,Government Property Inspectors and Investigators,"[Prepare correspondence, reports of inspection...",13
857,Video Game Designers,[Balance and adjust gameplay experiences to en...,15
559,Nanosystems Engineers,[Provide scientific or technical guidance or e...,17
430,Industrial Engineers,"[Estimate production costs, cost saving method...",17
800,Surveying and Mapping Technicians,"[Position and hold the vertical rods, or targe...",17
75,Biochemists and Biophysicists,[Share research findings by writing scientific...,19


In [5]:
#for trial
trial_df = sampled_occupation.sample(1, random_state= 1)
test_sample_list =[trial_df.iloc[x]["title"] for x in range(len(trial_df))]
test_sample_list

['Customs Brokers']

### Set up functions

In [6]:
#get reference description
def get_des (title):
    task_list = sampled_occupation.query("title == @title")["ref_task"].iloc[0]
    return task_list

In [7]:
#invoke llm to generate tasks
def task_gen(title, model, system = None):
    json_schema = {
        "type": "object",
        "properties": {
            "occupation": {
                "type": "string"
            },
            "tasks": {
                "type": "array",
                "items": {
                    "type": "string"
                },
                "minItems": len(get_des(title)),
                "maxItems": len(get_des(title))
            }
        },
        "required": ["occupation", "tasks"]
    }

    #initialize model

    query = "List out exactly "+str(len(get_des(title)))+" task statements that the occupation \""+ title +"\" would perform at work.Make sure each statement is unique and different from one another."

    if system == None:
        prompt_template = ChatPromptTemplate([
            ("human","{input}")
            ]
        )
    else:
        prompt_template = ChatPromptTemplate([
            ("system", system),
            ("human","{input}")
            ]
        )

    llm = model.with_structured_output(schema=json_schema, method="json_schema")

    prompt = prompt_template.invoke({"input": query, "title": title})
    # keep running until the number of parsed tasks is equal to the number of reference tasks
    for i in range (3):
        response = llm.invoke(prompt)
        #parse response
        try:
            parsed = json.loads(response["tasks"])
            print('parsed json')
        except:
            print('not json')
            try:
               parsed = response["tasks"]
               print('parsed string')
            except:
                print('not string')
                continue
        try:
            if len(parsed) == len(get_des(title)):
                return parsed
            else:
                print('not equal, parsed:', len(parsed), 'ref:', len(get_des(title)))
        except Exception as e:
            #try 3 more times, and if it still fails, return the parsed
            print(e)
            continue
        
    

In [46]:
#pre process text
def preProcessText(text=list):
	processed = []
	for doc in text:
		doc = re.sub(r"\\n", "", doc)
		doc = re.sub(r"\W", " ", doc) #remove non words char
		doc = re.sub(r"\d"," ", doc) #remove digits char
		doc = re.sub(r'\s+[a-z]\s+', " ", doc) # remove a single char
		doc = re.sub(r'^[a-z]\s+', "", doc) #remove a single character at the start of a document
		doc = re.sub(r'\s+', " ", doc)  #replace an extra space with a single space
		doc = re.sub(r'^\s', "", doc) # remove space at the start of a doc
		doc = re.sub(r'\s$', "", doc) # remove space at the end of a document
		processed.append(doc.lower())
	return processed

In [47]:
#get similarity score
def sbert(ref, gen):
    sim_model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")

    # Compute embeddings for both lists
    embeddings_ref = sim_model.encode(ref)
    embeddings_gen = sim_model.encode(gen)


    # Compute cosine similarities
    similarities = sim_model.similarity(embeddings_ref, embeddings_gen).numpy()
    return similarities

In [48]:
#correlation matrix and reorder them based on the hungarian algorithm
def match(ref, gen):
    try:
        ref_clean = preProcessText(ref)
        gen_clean = preProcessText(gen)
        matrix = sbert(ref_clean, gen_clean)
        row_ind, col_ind = linear_sum_assignment(1 - matrix)  # Minimize cost (1 - similarity)
        assigned_similarities = matrix[row_ind, col_ind]
        return np.mean(assigned_similarities), matrix, row_ind.tolist(), col_ind.tolist()
    except:
        print('error in matching' + ref[0])
        return np.nan

### packaging things for repeated excution

In [None]:
# start the process
llama = ChatOllama(model="llama3.2", temperature=1, base_url="http://127.0.0.1:11434")
mistral = ChatOllama(model="llama3.2", temperature=1, base_url="http://127.0.0.1:11434")
model_list = [llama, mistral]
prompts = {"no_prompt": None, 
           "prompt1": "You are an expert of this occupation: \"{title}\". Your task is to generate clear and concise task descriptions that reflect common responsibilities in this profession. Each description should be specific, action-oriented, and use professional language. Avoid unnecessary details—focus on the core action and purpose of the task.", 
          }

In [None]:
#save results
with open(folder_name + '/no_prompt.json', 'w') as f:
    f.write(result_df.to_json(index=True))

with open(folder_name + '/sys_prompt.txt', 'w') as f:
    f.write(system_prompt)


In [18]:
import pandas as pd
import time
from tqdm import tqdm
import json

times = {}

for name, prompt in prompts.items():
    # Step 1: Writing system prompt to file
    start = time.perf_counter()
    if prompt != None:
        with open(folder_name + '/sys_prompt.txt', 'a') as f:
            f.write(prompt + '\n')
        print(prompt)
    else:
        print("no prompt")
    times[f'{name}_write_prompt'] = time.perf_counter() - start

    for model in model_list:
        for i in range(2):
            # Step 2: LLM invocation and DataFrame update
            start = time.perf_counter()
            for title in tqdm(test_sample_list):
                print(title + str(i))
                generated_statements = task_gen(title, model, prompt)
                trial_df.loc[trial_df["title"] == title, "gen_task"] = pd.Series([generated_statements]).values
            result_df = trial_df.reset_index(drop=True)
            times[f'{name}_{model}_{i}_llm_invoke'] = time.perf_counter() - start

            # Step 3: Save initial result DataFrame to JSON
            start = time.perf_counter()
            with open(folder_name + '/' + name + '_'+str(i)+'_result.json', 'w') as f:
                f.write(result_df.to_json(index=True))
            times[f'{name}_{model}_{i}_save_result_json'] = time.perf_counter() - start

            # Step 4 & 5: Compute similarity scores and save to JSON
            try:
                # Step 4: Similarity computation (assuming match includes Hungarian algorithm)
                start = time.perf_counter()
                result_df[["score", "matrix", "ref_order", "gen_order"]] = result_df.apply(
                    lambda row: match(row["ref_task"], row["gen_task"]), axis=1
                ).apply(pd.Series)
                times[f'{name}_{model}_{i}_similarity_computation'] = time.perf_counter() - start

                # Step 5: Save similarity result to JSON
                start = time.perf_counter()
                with open(folder_name + '/' + name + '_'+str(i)+'_sim.json', 'w') as f:
                    f.write(result_df.to_json(index=True))
                times[f'{name}_{model}_{i}_save_sim_json'] = time.perf_counter() - start

            except Exception as e:
                print(e)
                continue

# Print timing results
for step, duration in times.items():
    print(f"{step}: {duration:.4f} seconds")

# Optional: Find the slowest step
slowest_step = max(times, key=times.get)
print(f"\nSlowest step: {slowest_step} took {times[slowest_step]:.4f} seconds")

no prompt


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:23<00:00, 23.68s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:18<00:00, 18.14s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:17<00:00, 17.37s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:16<00:00, 16.12s/it]

not json
parsed string





You are an expert of this occupation: "{title}". Your task is to generate clear and concise task descriptions that reflect common responsibilities in this profession. Each description should be specific, action-oriented, and use professional language. Avoid unnecessary details—focus on the core action and purpose of the task.


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:19<00:00, 19.23s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:24<00:00, 24.38s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:31<00:00, 31.22s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:18<00:00, 18.85s/it]

not json
parsed string





no_prompt_write_prompt: 0.0003 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_llm_invoke: 17.3758 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_save_result_json: 0.0008 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_similarity_computation: 2.8513 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_save_sim_json: 0.0022 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_llm_invoke: 16.1228 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_save_result_json: 0.0006 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_similarity_computation: 2.8331 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_save_sim_json: 0.0029 seconds
prompt1_write_prompt: 0.0023 seconds
prompt1_model='llama3.2' tempera

In [50]:
# %%
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel

#similarity
import regex as re
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment
import numpy as np

#counting
from tqdm import tqdm
import time

from datetime import datetime
import os

# computation
from concurrent.futures import ThreadPoolExecutor
import logging  # For logging to file and console

In [51]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    handlers=[
        logging.FileHandler("execution_log.log"),  # Log to file
        logging.StreamHandler()  # Log to console
    ]
)

# Generate the folder name with current date and time
folder_name = 'results/task_match_'+datetime.now().strftime("%d%m_%H%M")+"/"


# Create the folder if it does not exist
os.makedirs(folder_name, exist_ok=True)

In [54]:
# read dataset and drop columns
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()
job_statements = job_statements.drop(labels=["incumbents responding","date","domain source"], axis=1).rename(columns={"o*net-soc code":"code", "task type":"type", "task id": "id", "task":"ref_task"})
job_statements = job_statements[job_statements["type"].notna()]
job_statements["ind"] = job_statements["code"].str[:2]
job_statements = job_statements.groupby("title").agg({"ref_task":list, "ind": "first"}).reset_index().sort_values("ind")
sampled_occupation = job_statements.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1) #43 samples


# %%
#for trial
trial_df = sampled_occupation.sample(1, random_state= 1)
test_sample_list =[trial_df.iloc[x]["title"] for x in range(len(trial_df))]
test_sample_list


['Customs Brokers']

In [56]:
def process_title(args):
    title, model, prompt = args
    start_time = datetime.now()
    tasks = task_gen(title, model, system=prompt)  # prompt as system
    logging.info(f"Single inference for {title}, duration: {datetime.now() - start_time}")
    return title, tasks  # Returns list of tasks

# %%
#get reference description
def get_des (title):
    task_list = sampled_occupation.query("title == @title")["ref_task"].iloc[0]
    return task_list

# %%
#invoke llm to generate tasks
def task_gen(title, model, system=None):
    """Generate exactly the required number of unique task statements for a title."""
    # Get reference task count (assumed function)
    ref_task_count = len(get_des(title))
    
    # Define JSON schema
    json_schema = {
        "type": "object",
        "properties": {
            "occupation": {"type": "string"},
            "tasks": {
                "type": "array",
                "items": {"type": "string"},
                "minItems": ref_task_count,
                "maxItems": ref_task_count
            }
        },
        "required": ["occupation", "tasks"]
    }

    # Construct prompt
    query = (
        f"List exactly {ref_task_count} unique task statements that the occupation '{title}' "
        "would perform at work. Ensure each statement is distinct, concise, and relevant."
    )
    
    # Use system prompt if provided, otherwise just human query
    if system:
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", system),
            ("human", "{input}")
        ])
    else:
        prompt_template = ChatPromptTemplate.from_messages([
            ("human", "{input}")
        ])

    # Configure LLM with structured output
    llm = model.with_structured_output(schema=json_schema, method="json_schema")
    prompt = prompt_template.invoke({"input": query})

    # Invoke and parse response
    try:
        response = llm.invoke(prompt)
        # Expecting dict from with_structured_output
        tasks = response["tasks"]
        
        # Validate task count and uniqueness
        if len(tasks) != ref_task_count:
            logging.warning(f"Task count mismatch for {title}: got {len(tasks)}, expected {ref_task_count}")
            return tasks  # Return anyway, handle downstream
        if len(set(tasks)) < len(tasks):
            logging.warning(f"Duplicate tasks detected for {title}")
        
        return tasks
    except Exception as e:
        logging.error(f"Failed to generate tasks for {title}: {e}")
        # Return dummy tasks to avoid breaking pipeline
        return [f"Error: Task {i+1} for {title}" for i in range(ref_task_count)]
        
    
def preProcessText(text):
    """Preprocess a list of text strings."""
    processed = []
    for doc in text:
        if not isinstance(doc, str):  # Handle non-string (e.g., list or NaN)
            doc = str(doc)
        doc = re.sub(r"\\n", "", doc)
        doc = re.sub(r"\W", " ", doc)
        doc = re.sub(r"\d", " ", doc)
        doc = re.sub(r'\s+[a-z]\s+', " ", doc)
        doc = re.sub(r'^[a-z]\s+', "", doc)
        doc = re.sub(r'\s+', " ", doc)
        doc = re.sub(r'^\s', "", doc)
        doc = re.sub(r'\s$', "", doc)
        processed.append(doc.lower())
    return processed

def sbert_batch(ref_list, gen_list):
    """Compute similarity scores for all ref and gen texts in one batch."""
    sim_model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")
    embeddings_ref = sim_model.encode(ref_list, batch_size=32)  # Batch embeddings
    embeddings_gen = sim_model.encode(gen_list, batch_size=32)
    similarities = sim_model.similarity(embeddings_ref, embeddings_gen).numpy()
    return similarities

def match_batch(ref_list, gen_list):
    """Batch process matching for multiple ref-gen pairs."""
    ref_clean = preProcessText(ref_list)
    gen_clean = preProcessText(gen_list)
    matrix = sbert_batch(ref_clean, gen_clean)
    
    # Process Hungarian algorithm per pair
    results = []
    for i in range(len(ref_list)):
        row_matrix = matrix[i:i+1, i:i+1] if len(ref_list) == 1 else matrix[i, i].reshape(1, 1)  # Handle single pair
        row_ind, col_ind = linear_sum_assignment(1 - row_matrix)  # Minimize cost
        score = matrix[i, i]  # Diagonal score for single pair
        results.append((score, row_matrix.tolist(), row_ind.tolist(), col_ind.tolist()))
    return results

def apply_match_batch(df):
    """Apply batched matching to the entire DataFrame."""
    ref_list = df["ref_task"].tolist()
    gen_list = df["gen_task"].tolist()
    results = match_batch(ref_list, gen_list)
    scores, matrices, ref_orders, gen_orders = zip(*results)
    df["score"] = scores
    df["matrix"] = matrices
    df["ref_order"] = ref_orders
    df["gen_order"] = gen_orders
    return df

In [57]:
llama3_2 = ChatOllama(model="llama3.2", temperature=1, base_url="http://127.0.0.1:11434")
llama3_1 = ChatOllama(model="llama3.1", temperature=1, base_url="http://127.0.0.1:11434")

model_list = [llama3_2, llama3_1]

prompts = {"no_prompt": None, 
           "prompt1": "You are an expert of this occupation: \"{title}\". Your task is to generate clear and concise task descriptions that reflect common responsibilities in this profession. Each description should be specific, action-oriented, and use professional language. Avoid unnecessary details—focus on the core action and purpose of the task.",}


In [60]:
logging.info("Script started")
for model in model_list:
    model_name = model.model
    logging.info(f"Processing model: {model_name}")
    model.invoke("Warm-up prompt")

    for name, prompt in prompts.items():
        if prompt:
            start_time = datetime.now()
            with open(f"{folder_name}/sys_prompt.txt", "a") as f:
                f.write(prompt + "\n")
            logging.info(f"Wrote prompt {name}, duration: {datetime.now() - start_time}")

        all_results_df = trial_df.copy()
        all_results_df["gen_task"] = None  # Now a list of tasks
        all_results_df["iteration"] = None

        for i in range(5):
            start_time = datetime.now()
            with ThreadPoolExecutor(max_workers=8) as executor:
                results = list(tqdm(
                    executor.map(process_title, [(title, model, prompt) for title in test_sample_list]),
                    total=len(test_sample_list),
                    desc=f"{model_name}-{name}-{i}"
                ))
            logging.info(f"ThreadPoolExecutor for {model_name}-{name}-{i}, duration: {datetime.now() - start_time}")

            temp_df = trial_df.copy()
            for title, tasks in results:
                temp_df.loc[temp_df["title"] == title, "gen_task"] = pd.Series([tasks]).values  # Store as list
            temp_df["iteration"] = i
            all_results_df = pd.concat([all_results_df, temp_df], ignore_index=True)

        # Batch match (adjust for list of tasks)
        start_time = datetime.now()
        # Flatten gen_task lists to strings for matching (assuming one task per ref_task for simplicity)
        all_results_df["gen_task_str"] = all_results_df["gen_task"].apply(lambda x: x[0] if x and len(x) > 0 else "")
        all_results_df = apply_match_batch(all_results_df.rename(columns={"gen_task_str": "gen_task"}))
        logging.info(f"Batch matching for {model_name}-{name}, duration: {datetime.now() - start_time}")

        # Save once
        start_time = datetime.now()
        all_results_df = all_results_df.reset_index(drop=True)
        with open(f"{folder_name}/{model_name}_{name}_results.json", "w") as f:
            f.write(all_results_df.to_json(index=True))
        logging.info(f"Wrote results JSON for {model_name}-{name}, duration: {datetime.now() - start_time}")

logging.info("Script completed")

2025-03-19 17:50:41,921 - Script started
2025-03-19 17:50:41,921 - Processing model: llama3.2
2025-03-19 17:50:42,271 - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
llama3.2-no_prompt-0:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-19 17:50:49,152 - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-03-19 17:51:21,044 - Single inference for Customs Brokers, duration: 0:00:32.083894
llama3.2-no_prompt-0: 100%|██████████| 1/1 [00:32<00:00, 32.08s/it]
2025-03-19 17:51:21,048 - ThreadPoolExecutor for llama3.2-no_prompt-0, duration: 0:00:32.087786
llama3.2-no_prompt-1:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-19 17:51:21,177 - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-03-19 17:51:38,689 - Single inference for Customs Brokers, duration: 0:00:17.637842
llama3.2-no_prompt-1:   0%|          | 0/1 [00:17<?, ?it/s]


KeyboardInterrupt: 