In [6]:
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel

#similarity
import regex as re
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment
import numpy as np

#counting
from tqdm import tqdm

from datetime import datetime
import os

In [7]:
# Generate the folder name with current date and time
folder_name = 'results/task_match_'+datetime.now().strftime("%d%m_%H%M")+"/"

# Create the folder if it does not exist
os.makedirs(folder_name, exist_ok=True)

### Preprocess data and sampling

In [8]:
# read dataset and drop columns
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()
job_statements = job_statements.drop(labels=["incumbents responding","date","domain source"], axis=1).rename(columns={"o*net-soc code":"code", "task type":"type", "task id": "id", "task":"ref_task"})
job_statements = job_statements[~job_statements["type"].str.contains("Supplemental", case=False, na=True)]
job_statements["ind"] = job_statements["code"].str[:2]
job_statements = job_statements.groupby("title").agg({"ref_task":list, "ind": "first"}).reset_index().sort_values("ind")
sampled_occupation = job_statements.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1) #43 samples


In [9]:
#for trial
trial_df = sampled_occupation.sample(1, random_state= 1)
test_sample_list =[trial_df.iloc[x]["title"] for x in range(len(trial_df))]
test_sample_list

['Customs Brokers']

### Set up functions

In [10]:
#get reference description
def get_des (title):
    task_list = sampled_occupation.query("title == @title")["ref_task"].iloc[0]
    return task_list

In [11]:
#invoke llm to generate tasks
def task_gen(title, model, system = None):
    json_schema = {
        "type": "object",
        "properties": {
            "occupation": {
                "type": "string"
            },
            "tasks": {
                "type": "array",
                "items": {
                    "type": "string"
                },
                "minItems": len(get_des(title)),
                "maxItems": len(get_des(title))
            }
        },
        "required": ["occupation", "tasks"]
    }

    #initialize model

    query = "List out exactly "+str(len(get_des(title)))+" task statements that the occupation \""+ title +"\" would perform at work.Make sure each statement is unique and different from one another."

    if system == None:
        prompt_template = ChatPromptTemplate([
            ("human","{input}")
            ]
        )
    else:
        prompt_template = ChatPromptTemplate([
            ("system", system),
            ("human","{input}")
            ]
        )

    llm = model.with_structured_output(schema=json_schema, method="json_schema")

    prompt = prompt_template.invoke({"input": query, "title": title})
    # keep running until the number of parsed tasks is equal to the number of reference tasks
    for i in range (3):
        response = llm.invoke(prompt)
        #parse response
        try:
            parsed = json.loads(response["tasks"])
            print('parsed json')
        except:
            print('not json')
            try:
               parsed = response["tasks"]
               print('parsed string')
            except:
                print('not string')
                continue
        try:
            if len(parsed) == len(get_des(title)):
                return parsed
            else:
                print('not equal, parsed:', len(parsed), 'ref:', len(get_des(title)))
        except Exception as e:
            #try 3 more times, and if it still fails, return the parsed
            print(e)
            continue
        
    

In [12]:
#pre process text
def preProcessText(text=list):
	processed = []
	for doc in text:
		doc = re.sub(r"\\n", "", doc)
		doc = re.sub(r"\W", " ", doc) #remove non words char
		doc = re.sub(r"\d"," ", doc) #remove digits char
		doc = re.sub(r'\s+[a-z]\s+', " ", doc) # remove a single char
		doc = re.sub(r'^[a-z]\s+', "", doc) #remove a single character at the start of a document
		doc = re.sub(r'\s+', " ", doc)  #replace an extra space with a single space
		doc = re.sub(r'^\s', "", doc) # remove space at the start of a doc
		doc = re.sub(r'\s$', "", doc) # remove space at the end of a document
		processed.append(doc.lower())
	return processed

In [13]:
#get similarity score
def sbert(ref, gen):
    sim_model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")

    # Compute embeddings for both lists
    embeddings_ref = sim_model.encode(ref)
    embeddings_gen = sim_model.encode(gen)


    # Compute cosine similarities
    similarities = sim_model.similarity(embeddings_ref, embeddings_gen).numpy()
    return similarities

In [14]:
#correlation matrix and reorder them based on the hungarian algorithm
def match(ref, gen):
    try:
        ref_clean = preProcessText(ref)
        gen_clean = preProcessText(gen)
        matrix = sbert(ref_clean, gen_clean)
        row_ind, col_ind = linear_sum_assignment(1 - matrix)  # Minimize cost (1 - similarity)
        assigned_similarities = matrix[row_ind, col_ind]
        return np.mean(assigned_similarities), matrix, row_ind.tolist(), col_ind.tolist()
    except:
        print('error in matching' + ref[0])
        return np.nan

### packaging things for repeated excution

In [17]:
# start the process
llama = ChatOllama(model="llama3.2", temperature=1, base_url="http://127.0.0.1:11434")
mistral = ChatOllama(model="llama3.2", temperature=1, base_url="http://127.0.0.1:11434")
model_list = [llama, mistral]
prompts = {"no_prompt": None, 
           "prompt1": "You are an expert of this occupation: \"{title}\". Your task is to generate clear and concise task descriptions that reflect common responsibilities in this profession. Each description should be specific, action-oriented, and use professional language. Avoid unnecessary details—focus on the core action and purpose of the task.", 
          }

In [15]:
import time

times = {}
# run 5 times each for each prompt
for name, prompt in prompts.items():
    if prompt != None:
        with open(folder_name + '/sys_prompt.txt', 'a') as f:
            f.write(prompt + '\n')
        print(prompt)
    else:
        print("no prompt")

    for model in model_list:
        for i in range (2):
            # invoke llm for each title
            for title in tqdm(test_sample_list):
                print(title + str(i))
                generated_statements = task_gen(title, model, prompt)
                trial_df.loc[trial_df["title"] == title, "gen_task"] = pd.Series([generated_statements]).values
            result_df = trial_df.reset_index(drop=True)
            with open(folder_name + '/' + name + '_'+str(i)+'_result.json', 'w') as f:
                f.write(result_df.to_json(index=True))

            try:
                result_df[["score", "matrix", "ref_order", "gen_order"]] = result_df.apply(lambda row: match(row["ref_task"], row["gen_task"]), axis=1).apply(pd.Series)

                with open(folder_name + '/' + name + '_'+str(i)+'_sim.json', 'w') as f:
                    f.write(result_df.to_json(index=True))
            except Exception as e:
                print(e)
                continue

no prompt


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:11<00:00, 11.07s/it]

not json
parsed string





error in matchingPrepare and process import and export documentation according to customs regulations, laws, or procedures.
Columns must be same length as key


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:12<00:00, 12.78s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


  0%|          | 0/1 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
#save results
with open(folder_name + '/no_prompt.json', 'w') as f:
    f.write(result_df.to_json(index=True))

with open(folder_name + '/sys_prompt.txt', 'w') as f:
    f.write(system_prompt)


In [18]:
import pandas as pd
import time
from tqdm import tqdm
import json

times = {}

for name, prompt in prompts.items():
    # Step 1: Writing system prompt to file
    start = time.perf_counter()
    if prompt != None:
        with open(folder_name + '/sys_prompt.txt', 'a') as f:
            f.write(prompt + '\n')
        print(prompt)
    else:
        print("no prompt")
    times[f'{name}_write_prompt'] = time.perf_counter() - start

    for model in model_list:
        for i in range(2):
            # Step 2: LLM invocation and DataFrame update
            start = time.perf_counter()
            for title in tqdm(test_sample_list):
                print(title + str(i))
                generated_statements = task_gen(title, model, prompt)
                trial_df.loc[trial_df["title"] == title, "gen_task"] = pd.Series([generated_statements]).values
            result_df = trial_df.reset_index(drop=True)
            times[f'{name}_{model}_{i}_llm_invoke'] = time.perf_counter() - start

            # Step 3: Save initial result DataFrame to JSON
            start = time.perf_counter()
            with open(folder_name + '/' + name + '_'+str(i)+'_result.json', 'w') as f:
                f.write(result_df.to_json(index=True))
            times[f'{name}_{model}_{i}_save_result_json'] = time.perf_counter() - start

            # Step 4 & 5: Compute similarity scores and save to JSON
            try:
                # Step 4: Similarity computation (assuming match includes Hungarian algorithm)
                start = time.perf_counter()
                result_df[["score", "matrix", "ref_order", "gen_order"]] = result_df.apply(
                    lambda row: match(row["ref_task"], row["gen_task"]), axis=1
                ).apply(pd.Series)
                times[f'{name}_{model}_{i}_similarity_computation'] = time.perf_counter() - start

                # Step 5: Save similarity result to JSON
                start = time.perf_counter()
                with open(folder_name + '/' + name + '_'+str(i)+'_sim.json', 'w') as f:
                    f.write(result_df.to_json(index=True))
                times[f'{name}_{model}_{i}_save_sim_json'] = time.perf_counter() - start

            except Exception as e:
                print(e)
                continue

# Print timing results
for step, duration in times.items():
    print(f"{step}: {duration:.4f} seconds")

# Optional: Find the slowest step
slowest_step = max(times, key=times.get)
print(f"\nSlowest step: {slowest_step} took {times[slowest_step]:.4f} seconds")

no prompt


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:23<00:00, 23.68s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:18<00:00, 18.14s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:17<00:00, 17.37s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:16<00:00, 16.12s/it]

not json
parsed string





You are an expert of this occupation: "{title}". Your task is to generate clear and concise task descriptions that reflect common responsibilities in this profession. Each description should be specific, action-oriented, and use professional language. Avoid unnecessary details—focus on the core action and purpose of the task.


  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:19<00:00, 19.23s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:24<00:00, 24.38s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers0


100%|██████████| 1/1 [00:31<00:00, 31.22s/it]

not json
parsed string



  0%|          | 0/1 [00:00<?, ?it/s]

Customs Brokers1


100%|██████████| 1/1 [00:18<00:00, 18.85s/it]

not json
parsed string





no_prompt_write_prompt: 0.0003 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_llm_invoke: 17.3758 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_save_result_json: 0.0008 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_similarity_computation: 2.8513 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_0_save_sim_json: 0.0022 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_llm_invoke: 16.1228 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_save_result_json: 0.0006 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_similarity_computation: 2.8331 seconds
no_prompt_model='llama3.2' temperature=1.0 base_url='http://127.0.0.1:11434'_1_save_sim_json: 0.0029 seconds
prompt1_write_prompt: 0.0023 seconds
prompt1_model='llama3.2' tempera