In [1]:
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel

#logging
import pickle

#similarity
import regex as re
from sentence_transformers import SentenceTransformer

#visualization
#plot matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load data

In [2]:
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()
job_statements = job_statements.drop(labels=["incumbents responding","date","domain source"], axis=1).rename(columns={"o*net-soc code":"code", "task type":"type", "task id": "id", "task":"ref_task"})
job_statements = job_statements[~job_statements["type"].str.contains("Supplemental", case=False, na=True)]
job_statements["ind"] = job_statements["code"].str[:2]
job_statements = job_statements.groupby("title").agg({"ref_task":list, "ind": "first"}).reset_index().sort_values("ind")
sampled_occupation = job_statements.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1) #43 samples
sampled_occupation

Unnamed: 0,title,ref_task,ind
478,Lodging Managers,[Answer inquiries pertaining to hotel policies...,11
777,Spa Managers,"[Respond to customer inquiries or complaints.,...",11
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11
203,Customs Brokers,[Prepare and process import and export documen...,13
386,Government Property Inspectors and Investigators,"[Prepare correspondence, reports of inspection...",13
856,Video Game Designers,[Balance and adjust gameplay experiences to en...,15
559,Nanosystems Engineers,[Provide scientific or technical guidance or e...,17
430,Industrial Engineers,"[Estimate production costs, cost saving method...",17
799,Surveying and Mapping Technicians,"[Position and hold the vertical rods, or targe...",17
170,Conservation Scientists,[Apply principles of specialized fields of sci...,19


In [40]:
#for trial
trial_df = sampled_occupation.sample(5, random_state= 1)
test_sample_list =[trial_df.iloc[x]["title"] for x in range(5)]

test_sample_list

['Customs Brokers',
 'Training and Development Managers',
 'Cooks, Institution and Cafeteria',
 'Painting, Coating, and Decorating Workers',
 'Graphic Designers']

### set up matching function

In [4]:
#get reference description
def get_des (occupation):
    task_list = sampled_occupation.query("title == @occupation")["ref_task"].iloc[0]
    return task_list

In [5]:
def task_gen(occupation,model):
    class task_description(BaseModel):
        occupation: str
        statements: list[str]


    #initialize model
    model= model

    query = "Generate "+str(len(get_des(occupation)))+" task statements that "+ occupation +" would perform at work."

    prompt_template = ChatPromptTemplate([
        # ("system", "your role is a {name}. Respond like a {name}."),
        ("human","{input}")
        ]
    )
    structured_llm = model.with_structured_output(schema=task_description.model_json_schema())

    prompt = prompt_template.invoke({"occupation": occupation, "input": query})
    response = structured_llm.invoke(prompt)

    return response
    

In [6]:
# parse response
def parse_response(response):
    try:
        parsed = json.loads(response["statements"])
        return parsed
    except:
        try:
            parsed = response["statements"]
            return parsed
        except:
            return np.nan

In [7]:
#pre process text
def preProcessText(text=list):
	processed = []
	for doc in text:
		doc = re.sub(r"\\n", "", doc)
		doc = re.sub(r"\W", " ", doc) #remove non words char
		doc = re.sub(r"\d"," ", doc) #remove digits char
		doc = re.sub(r'\s+[a-z]\s+', "", doc) # remove a single char
		doc = re.sub(r'^[a-z]\s+', "", doc) #remove a single character at the start of a document
		doc = re.sub(r'\s+', " ", doc)  #replace an extra space with a single space
		doc = re.sub(r'^\s', "", doc) # remove space at the start of a doc
		doc = re.sub(r'\s$', "", doc) # remove space at the end of a document
		processed.append(doc.lower())
	return processed

In [23]:
#get similarity score
def sbert(gen, ref):
    sim_model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")

    # Compute embeddings for both lists
    embeddings_gen = sim_model.encode(gen)
    embeddings_ref = sim_model.encode(ref)

    # Compute cosine similarities
    similarities = float(sim_model.similarity(embeddings_gen, embeddings_ref))
    return similarities

In [9]:
def add_record(occupation, ref, gen, df):
    new_row = pd.DataFrame({'occupation': occupation, 'ref_statements': [ref], "gen_statements":[gen]})
    df = pd.concat([df, new_row], ignore_index=True)
    return df

In [10]:
#plot graph
def plot(similarities):
    x_label = [x+1 for x in range(len(similarities))]
    y_label = [chr(num + 96) for num in x_label]
    plt.figure(figsize=(12, 12))
    sns.heatmap(similarities, annot=True, fmt=".3f", cmap="crest", xticklabels=x_label, yticklabels=y_label)
    plt.title("Sentence Cosine Similarity for "+ test_occ)
    plt.xlabel("O*NET Data")
    plt.ylabel("Generated Sentence")
    plt.savefig("results/"+test_occ+".png")
    plt.show()
    return plt

### packaging things for repeated excution

In [42]:
model = ChatOllama(model="llama3.2", temperature=0.8)

In [12]:
test_sample_list

['Customs Brokers',
 'Training and Development Managers',
 'Cooks, Institution and Cafeteria',
 'Painting, Coating, and Decorating Workers',
 'Graphic Designers']

In [43]:
for occu in test_sample_list:
    generated_statements = task_gen(occu, model)
    trial_df.loc[trial_df["title"] == occu, "gen_task"] = [generated_statements]

trial_df


Unnamed: 0,title,ref_task,ind,gen_task
203,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'statements':..."
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...
178,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cooks', 'statements': '[""Prepa..."
607,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'statements': '[""Ap..."
388,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'statements..."


In [44]:
trial_df["parsed_response"] = trial_df["gen_task"].apply(parse_response)
trial_df = trial_df.dropna()
trial_df

# trial_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response
203,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'statements':...",[Release cargo upon presentation of proper cus...
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Create a training program for new hires, Deve..."
178,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cooks', 'statements': '[""Prepa...","[Prepares meals for the cafeteria staff, Ensur..."
607,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'statements': '[""Ap...","[Apply paint to walls, Clean brushes, Maintain..."
388,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'statements...","[Create a new logo design, Develop a marketing..."


In [45]:
def match(gen, ref):
    try:
        gen_clean = " ".join(preProcessText(gen))
        ref_clean = " ".join(preProcessText(ref))
        return sbert(gen_clean, ref_clean)
    except:
        return np.nan


In [47]:
result_df = trial_df
result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response
203,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'statements':...",[Release cargo upon presentation of proper cus...
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Create a training program for new hires, Deve..."
178,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cooks', 'statements': '[""Prepa...","[Prepares meals for the cafeteria staff, Ensur..."
607,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'statements': '[""Ap...","[Apply paint to walls, Clean brushes, Maintain..."
388,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'statements...","[Create a new logo design, Develop a marketing..."


In [58]:
result_df = result_df.reset_index(drop=True)
result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response,score
0,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'statements':...",[Release cargo upon presentation of proper cus...,
1,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Create a training program for new hires, Deve...",
2,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cooks', 'statements': '[""Prepa...","[Prepares meals for the cafeteria staff, Ensur...",
3,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'statements': '[""Ap...","[Apply paint to walls, Clean brushes, Maintain...",
4,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'statements...","[Create a new logo design, Develop a marketing...",


In [59]:
result_df["score"] = result_df.apply(lambda row: match(row["ref_task"], row["parsed_response"]), axis=1)
result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response,score
0,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'statements':...",[Release cargo upon presentation of proper cus...,0.913035
1,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Create a training program for new hires, Deve...",0.724239
2,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cooks', 'statements': '[""Prepa...","[Prepares meals for the cafeteria staff, Ensur...",0.81614
3,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'statements': '[""Ap...","[Apply paint to walls, Clean brushes, Maintain...",0.617477
4,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'statements...","[Create a new logo design, Develop a marketing...",0.492896


In [76]:
highest = result_df.loc[result_df["score"].idxmax()]
highest["parsed_response"]

['Release cargo upon presentation of proper customs documents',
 'Verify the accuracy and completeness of customs forms',
 'Clear goods for export or import by making necessary entries in relevant records',
 'Conduct inspections to ensure compliance with regulations',
 'Respond to inquiries from regulatory authorities regarding shipments',
 'Negotiate with suppliers to resolve discrepancies or claims',
 'Maintain accurate records of all transactions and correspondence',
 'Ensure compliance with all applicable laws and regulations',
 'Provide clients with regular updates on shipment status',
 'Coordinate logistics and transportation arrangements',
 'Prepare and submit customs declarations for release of goods',
 'Verify the authenticity of shipping documents',
 'Monitor and respond to any changes in regulations or policies']

In [72]:
mean = result_df["score"].mean()
mean

0.7127574265003205