In [3]:
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel

#similarity
import regex as re
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment

#visualization
#plot matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Preprocess data and sampling

In [5]:
# Load the task statements and normalise the header to lowercase.
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()

# Drop unused columns, shorten the remaining names, remove every task whose type
# mentions "Supplemental" (rows with a missing type are removed as well because
# na=True), and derive `ind` from the first two characters of the code.
job_statements = (
    job_statements
    .drop(columns=["incumbents responding", "date", "domain source"])
    .rename(columns={"o*net-soc code": "code", "task type": "type", "task id": "id", "task": "ref_task"})
    .loc[lambda frame: ~frame["type"].str.contains("Supplemental", case=False, na=True)]
    .assign(ind=lambda frame: frame["code"].str[:2])
)

# Collapse to one row per title: all of its task statements as a list plus its `ind`.
job_statements = (
    job_statements
    .groupby("title")
    .agg({"ref_task": list, "ind": "first"})
    .reset_index()
    .sort_values("ind")
)

# Sample 5% of the titles inside every `ind` group (seeded).
sampled_occupation = job_statements.groupby("ind", group_keys=False).sample(frac=0.05, random_state=1)  # 43 samples


### Alternatively, filter to occupations with core tasks only

In [None]:
# Load the task statement file and normalise the header (lowercase, underscores).
ts = pd.read_excel('datasets/task_statements.xlsx')
ts.columns = ts.columns.str.lower().str.replace(" ","_")
ts = (
    ts
    .drop(columns=['date', 'domain_source', 'incumbents_responding'])
    .rename(columns={'o*net-soc_code': 'code', 'task': 'statement'})
)
print(ts["code"].nunique())

# Keep rows whose task type is present ...
cts = ts[ts["task_type"].notna()]

# ... and is not exactly "Supplemental".
cts = cts[cts["task_type"].ne("Supplemental")]
cts["code"].nunique()


In [23]:
# Trial run: draw five sampled occupations (seeded) and list their titles.
trial_df = sampled_occupation.sample(n=5, random_state=1)
test_sample_list = trial_df["title"].iloc[:5].tolist()
test_sample_list

['Customs Brokers',
 'Training and Development Managers',
 'Cooks, Institution and Cafeteria',
 'Painting, Coating, and Decorating Workers',
 'Graphic Designers']

### Set up functions

In [6]:
# Reference task lookup
def get_des (title):
    """Return the `ref_task` value of the first `sampled_occupation` row whose title equals `title`.

    Reads the notebook-level `sampled_occupation` frame. Raises IndexError when
    no row matches, because `.iloc[0]` is applied to an empty selection.
    """
    rows_for_title = sampled_occupation.query("title == @title")
    reference_tasks = rows_for_title["ref_task"]
    return reference_tasks.iloc[0]

In [None]:
def _extract_task_list(response):
    """Return the list of tasks carried by an LLM response, or None when there is none.

    Handles the shapes a response can take here: a plain dict with a "tasks"
    key, an object exposing a `tasks` attribute (e.g. a Pydantic instance from
    structured output), or a chat message / string whose text is JSON.
    A "tasks" value that is itself a JSON-encoded string is decoded.
    """
    if isinstance(response, dict):
        payload = response.get("tasks")
    elif hasattr(response, "tasks"):
        payload = response.tasks
    else:
        # Chat messages expose their text as `.content`; fall back to the object itself.
        payload = getattr(response, "content", response)

    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except ValueError:
            # Free text that is not JSON: nothing usable (never compare its character count).
            return None
        if isinstance(payload, dict):
            payload = payload.get("tasks")

    return list(payload) if isinstance(payload, (list, tuple)) else None


def task_gen(title,model, system = None, structure = False, max_retries = None):
    """Ask `model` for as many task statements as `title` has reference tasks.

    The model is re-invoked until the parsed answer contains exactly
    `len(get_des(title))` tasks.

    Parameters:
        title: occupation title; must be known to `get_des`.
        model: chat model exposing `invoke` (and `with_structured_output` when
            `structure` is truthy).
        system: optional system prompt placed before the human message.
        structure: when truthy, request structured output that follows the
            `occupation` schema.
        max_retries: maximum number of model calls. `None` (the default) keeps
            retrying without limit, as before.

    Returns:
        list: the generated task statements.

    Raises:
        RuntimeError: if `max_retries` calls were made without a usable answer.
    """
    class occupation(BaseModel):
        occupation: str
        tasks: list[str]

    # Look the reference size up once; it does not change between retries.
    expected = len(get_des(title))
    query = "Generate "+str(expected)+" task statements that "+ title +" would perform at work."

    messages = [("human","{input}")]
    if system is not None:
        messages.insert(0, ("system", system))
    prompt_template = ChatPromptTemplate(messages)

    if structure:
        llm = model.with_structured_output(schema=occupation, method="json_schema")
    else:
        llm = model

    # "title" is supplied as well, presumably so a `system` template can reference {title}.
    prompt = prompt_template.invoke({"input": query, "title": title})

    # keep running until the number of parsed tasks is equal to the number of reference tasks
    attempts = 0
    while max_retries is None or attempts < max_retries:
        attempts += 1
        response = llm.invoke(prompt)
        parsed = _extract_task_list(response)
        if parsed is None:
            print('not json')
            continue
        if len(parsed) == expected:
            return parsed
        print('not equal, parsed:', len(parsed), 'ref:', expected)

    raise RuntimeError(
        "task_gen: no response with " + str(expected) + " tasks for '" + title
        + "' after " + str(attempts) + " attempts"
    )
        
    

In [10]:
#pre process text
def preProcessText(text=None):
    """Normalise a list of sentences before they are embedded.

    Each document is lower-cased, literal backslash-n sequences, punctuation
    and digits are replaced by spaces, stand-alone single ASCII letters are
    removed, and whitespace is collapsed and trimmed.

    Parameters:
        text: iterable of strings. `None` (the default) is treated as empty.

    Returns:
        list[str]: the cleaned documents, in input order.
    """
    if text is None:
        return []

    processed = []
    for doc in text:
        # Lower-case first so the single-letter rule below also catches "A" / "B".
        doc = doc.lower()
        # A literal backslash followed by "n" becomes a space so neighbouring words stay apart.
        doc = re.sub(r"\\n", " ", doc)
        doc = re.sub(r"[\W\d]", " ", doc)  # non-word characters and digits
        # Stand-alone single letters anywhere (start, middle, end, or adjacent to each other).
        doc = re.sub(r"(?<!\S)[a-z](?!\S)", " ", doc)
        doc = re.sub(r"\s+", " ", doc).strip()  # collapse runs of whitespace, trim both ends
        processed.append(doc)
    return processed

In [11]:
#get similarity score
_SBERT_MODELS = {}


def _load_sim_model(model_name):
    """Return the cached SentenceTransformer for `model_name`, loading it on first use."""
    if model_name not in _SBERT_MODELS:
        _SBERT_MODELS[model_name] = SentenceTransformer(model_name, similarity_fn_name="cosine")
    return _SBERT_MODELS[model_name]


def sbert(ref, gen, model_name="all-mpnet-base-v2"):
    """Compute the cosine-similarity matrix between two lists of sentences.

    The embedding model is loaded once per `model_name` and reused on later
    calls; loading it on every call is expensive when this runs once per row.

    Parameters:
        ref: list of reference sentences.
        gen: list of generated sentences.
        model_name: SentenceTransformer model to use.

    Returns:
        The similarity matrix converted with `.numpy()`; rows follow `ref`,
        columns follow `gen`.
    """
    sim_model = _load_sim_model(model_name)

    # Compute embeddings for both lists
    embeddings_ref = sim_model.encode(ref)
    embeddings_gen = sim_model.encode(gen)

    # Compute cosine similarities
    return sim_model.similarity(embeddings_ref, embeddings_gen).numpy()

In [12]:
#plot graph
def _letter_label(position):
    """Spreadsheet-style label for a 1-based position: 1 -> 'a', 26 -> 'z', 27 -> 'aa'."""
    letters = ""
    while position > 0:
        position, remainder = divmod(position - 1, 26)
        letters = chr(97 + remainder) + letters
    return letters


def plot(similarities, title=None):
    """Draw, save and show a heatmap of a similarity matrix.

    Parameters:
        similarities: 2-D matrix of similarity scores.
        title: occupation name used in the figure title and in the file name
            "results/<title>.png". When omitted, the notebook-level `test_occ`
            variable is used, as before.

    Returns:
        The `matplotlib.pyplot` module (kept for backward compatibility).

    NOTE(review): the axis captions label columns as O*NET data and rows as
    generated sentences — confirm the matrix is passed in that orientation.
    """
    if title is None:
        title = test_occ  # backward-compatible fallback to the notebook global

    # Size each axis from its own dimension so non-square matrices get correct ticks.
    n_rows, n_cols = np.shape(similarities)
    x_label = list(range(1, n_cols + 1))
    y_label = [_letter_label(position) for position in range(1, n_rows + 1)]

    plt.figure(figsize=(12, 12))
    sns.heatmap(similarities, annot=True, fmt=".3f", cmap="crest", xticklabels=x_label, yticklabels=y_label)
    plt.title("Sentence Cosine Similarity for "+ title)
    plt.xlabel("O*NET Data")
    plt.ylabel("Generated Sentence")
    plt.savefig("results/"+title+".png")
    plt.show()
    return plt

In [13]:
def match(ref, gen):
    """Pair reference and generated sentences one-to-one and score the pairing.

    Both lists are cleaned with `preProcessText`, embedded and compared with
    `sbert`, and then paired by `linear_sum_assignment` so that the total
    similarity of the chosen pairs is maximal.

    Returns:
        tuple: (mean similarity of the assigned pairs, full similarity matrix,
        assigned row indices, assigned column indices).
        On failure a bare `np.nan` is returned instead (unchanged contract)
        and the reason is printed.
    """
    try:
        ref_clean = preProcessText(ref)
        gen_clean = preProcessText(gen)
        matrix = sbert(ref_clean, gen_clean)
        row_ind, col_ind = linear_sum_assignment(1 - matrix)  # Minimize cost (1 - similarity)
        assigned_similarities = matrix[row_ind, col_ind]
        return np.mean(assigned_similarities), matrix, row_ind.tolist(), col_ind.tolist()
    except Exception as error:
        # Only ordinary errors are absorbed; KeyboardInterrupt / SystemExit still propagate.
        print("match failed:", repr(error))
        return np.nan

### Packaging things for repeated execution

In [22]:
# Chat model used for the trial generation run below (Ollama "llama3.1", temperature 1).
model = ChatOllama(model="llama3.1", temperature=1)

In [15]:
# Generate the task list for every trial title, then attach it by title.
# Going through a lookup stores each list as a single cell value; assigning a
# list through `.loc[mask, col] = [list]` depends on pandas broadcasting rules
# and can fail when the list length differs from the number of selected rows.
generated_by_title = {title: task_gen(title, model) for title in test_sample_list}
trial_df["gen_task"] = trial_df["title"].map(generated_by_title.get)

trial_df

Unnamed: 0,title,ref_task,ind,gen_task
203,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'tasks': '[""C..."
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...
178,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cook', 'tasks': '[""Prepare ing..."
607,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'tasks': '[""Applyin..."
388,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'tasks': '[..."


In [16]:
# Re-index the trial frame, parse every generated response, and drop rows with missing values.
result_df = (
    trial_df
    .reset_index(drop=True)
    .assign(parsed_response=lambda frame: frame["gen_task"].apply(parse_response))
    .dropna()
)
result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response
0,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'tasks': '[""C...","[Clearing shipments of imported goods, Ensurin..."
1,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Developing employee training programs, Creati..."
2,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cook', 'tasks': '[""Prepare ing...","[Prepare ingredients, Cook meals, Clean kitchen]"
3,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'tasks': '[""Applyin...","[Applying paint to walls, Applying paint to ce..."
4,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'tasks': '[...","[Create visual elements for designs, Develop a..."


In [17]:
# Score every row with match() and spread the returned tuple over four columns.
# NOTE(review): match() returns a bare NaN on failure, so such a row expands to a
# single value — confirm the remaining three columns end up NaN as intended.
result_df[["score", "matrix", "ref_order", "gen_order"]] = result_df.apply(lambda row: match(row["ref_task"], row["parsed_response"]), axis=1).apply(pd.Series)
result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response,score,matrix,ref_order,gen_order
0,Customs Brokers,[Prepare and process import and export documen...,13,"{'occupation': 'Customs Broker', 'tasks': '[""C...","[Clearing shipments of imported goods, Ensurin...",0.437229,"[[0.5195479, 0.7806119, 0.49435085, 0.29045552...","[0, 1, 2, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[1, 14, 10, 2, 16, 17, 11, 7, 0, 3, 15, 13, 8,..."
1,Training and Development Managers,[Analyze training needs to develop new trainin...,11,{'occupation': 'Training and Development Manag...,"[Developing employee training programs, Creati...",0.501708,"[[0.69682044, 0.3042237, 0.3240591, 0.39522266...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[8, 10, 0, 9, 4, 5, 1, 3, 7, 2, 6]"
2,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,"{'occupation': 'Cook', 'tasks': '[""Prepare ing...","[Prepare ingredients, Cook meals, Clean kitchen]",0.54862,"[[0.32628417, 0.39149806, 0.23261246], [0.4852...","[1, 3, 5]","[1, 0, 2]"
3,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,"{'occupation': 'Painting', 'tasks': '[""Applyin...","[Applying paint to walls, Applying paint to ce...",0.312866,"[[0.458075, 0.44365197], [0.09448831, 0.123445...","[0, 2]","[0, 1]"
4,Graphic Designers,[Key information into computer equipment to cr...,27,"{'occupation': 'Graphic Designer', 'tasks': '[...","[Create visual elements for designs, Develop a...",0.431067,"[[0.465727, 0.17119342, 0.43691003, 0.3306257,...","[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[12, 13, 0, 1, 4, 7, 5, 3, 8, 10, 11, 2, 14, 9..."


In [47]:
# Serialise the scored results (row index included) and write them to result.json.
serialized_results = result_df.to_json(index=True)
with open('result.json', 'w') as out_file:
    out_file.write(serialized_results)

In [51]:
# Summary statistics (transposed), annotated with the titles holding the highest and lowest score.
analysis = result_df.describe().T.assign(
    max_title=result_df.loc[result_df["score"].idxmax(), "title"],
    min_title=result_df.loc[result_df["score"].idxmin(), "title"],
)
analysis

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,max_title,min_title
score,5.0,0.516068,0.083808,0.376314,0.520146,0.534684,0.54862,0.600575,Training and Development Managers,"Painting, Coating, and Decorating Workers"


In [4]:
def test_gen(title,model, system = None, structure = False):
    class occupation(BaseModel):
        occupation: str
        tasks: list[str]

    #initialize model

    query = "Generate "+str(len(get_des(title)))+" task statements that "+ title +" would perform at work."

    if system == None:
        prompt_template = ChatPromptTemplate([
            ("human","{input}")
            ]
        )
    else:
        prompt_template = ChatPromptTemplate([
            ("system", system),
            ("human","{input}")
            ]
        )

    if structure == False:
        llm = model

    if structure == True:
        llm = model.with_structured_output(schema=occupation, method="json_schema")

    prompt = prompt_template.invoke({"input": query, "title": title})
    response = llm.invoke(prompt)
    return response

In [None]:
# Rebind `model` to the "llama3.2" Ollama chat model for the experiments that follow.
model = ChatOllama(model="llama3.2", temperature=1)

In [11]:
# Request unstructured output for one occupation and display the raw response object.
test = test_gen(model=model, title="Spa Managers", structure=False)
test

AIMessage(content="Here are 21 task statements that a Spa Manager might perform at work:\n\n1. Develop and implement spa membership programs to increase customer loyalty and retention.\n2. Collaborate with the sales team to create effective marketing campaigns to attract new clients.\n3. Manage spa staff scheduling, including holiday scheduling, shifts, and overtime.\n4. Conduct regular staff performance reviews to identify areas for improvement and provide feedback.\n5. Implement and enforce a customer service policy to ensure high standards of hospitality are met.\n6. Plan and execute special events such as bridal showers, birthday parties, and corporate retreats.\n7. Manage the spa's budget, including forecasting income, managing expenses, and tracking financial reports.\n8. Research and stay up-to-date on industry trends, products, and services to maintain a competitive edge.\n9. Develop and maintain relationships with suppliers, including product vendors and service providers.\n10

In [12]:
# Show only the `content` attribute of the response.
test.content

"Here are 21 task statements that a Spa Manager might perform at work:\n\n1. Develop and implement spa membership programs to increase customer loyalty and retention.\n2. Collaborate with the sales team to create effective marketing campaigns to attract new clients.\n3. Manage spa staff scheduling, including holiday scheduling, shifts, and overtime.\n4. Conduct regular staff performance reviews to identify areas for improvement and provide feedback.\n5. Implement and enforce a customer service policy to ensure high standards of hospitality are met.\n6. Plan and execute special events such as bridal showers, birthday parties, and corporate retreats.\n7. Manage the spa's budget, including forecasting income, managing expenses, and tracking financial reports.\n8. Research and stay up-to-date on industry trends, products, and services to maintain a competitive edge.\n9. Develop and maintain relationships with suppliers, including product vendors and service providers.\n10. Conduct regular 

In [20]:
# Target schema for structured output; only used by the commented-out variant below.
class occupation(BaseModel):
    occupation: str
    tasks: list[str]


# Feed the earlier free-text answer back as an AI turn after asking for JSON.
prompt_template = ChatPromptTemplate(
    [
        ("human", "can you make the response in a structured json format"),
        ("ai", " {input} "),
    ]
)

llm = model
# llm = model.with_structured_output(schema=occupation, method="json_schema")
prompt = prompt_template.invoke({"input": test.content})
answer = llm.invoke(prompt)
answer

AIMessage(content=' Here is an example JSON response based on task 7:\n\n```\n{\n  "task": 7,\n  "description": "Manage spa budget",\n  "tasks": [\n    {\n      "task_id": 1,\n      "description": "Forecast income"\n    },\n    {\n      "task_id": 2,\n      "description": "Manage expenses"\n    },\n    {\n      "task_id": 3,\n      "description": "Track financial reports"\n    }\n  ],\n  "status": null\n}\n```\n\nIn this example, the JSON response includes a summary of task 7 (managing spa budget), as well as two subtasks that are part of managing that budget. The `status` field is blank because it hasn\'t been completed yet.\n\nPlease let me know if you want me to generate anything else!', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2025-03-12T13:54:39.782009Z', 'done': True, 'done_reason': 'stop', 'total_duration': 5048475200, 'load_duration': 22388300, 'prompt_eval_count': 437, 'prompt_eval_duration': 263000000, 'eval_count': 171, 'eval_duration': 47

In [1]:
# Build a JSON schema whose task list has a fixed length
def create_json_schema(num_sentences):
    """Return a JSON-schema dict for an object with "occupation" and "tasks".

    "occupation" is a string; "tasks" is an array of strings whose minimum and
    maximum length are both `num_sentences`. Both properties are required.
    """
    tasks_spec = {
        "type": "array",
        "items": {"type": "string"},
        "minItems": num_sentences,
        "maxItems": num_sentences,
    }
    properties = {
        "occupation": {"type": "string"},
        "tasks": tasks_spec,
    }
    return {
        "type": "object",
        "properties": properties,
        "required": ["occupation", "tasks"],
    }


In [2]:
# Example: schema constrained to exactly five task strings.
create_json_schema(5)

{'type': 'object',
 'properties': {'occupation': {'type': 'string'},
  'tasks': {'type': 'array',
   'items': {'type': 'string'},
   'minItems': 5,
   'maxItems': 5}},
 'required': ['occupation', 'tasks']}

In [14]:
def test_gen(title, system = None, structure = False):
    jsc = create_json_schema(len(get_des(title)))

    model = ChatOllama(model="llama3.2", temperature=1)

    #initialize model

    query = "Generate "+str(len(get_des(title)))+" task statements that "+ title +" would perform at work."

    if system == None:
        prompt_template = ChatPromptTemplate([
            ("human","{input}")
            ]
        )
    else:
        prompt_template = ChatPromptTemplate([
            ("system", system),
            ("human","{input}")
            ]
        )

    if structure == False:
        llm = model

    if structure == True:
        llm = model.with_structured_output(schema=jsc, method="json_schema")

    prompt = prompt_template.invoke({"input": query, "title": title})
    response = llm.invoke(prompt)
    return response

In [17]:
# Structured generation for one occupation; the result is displayed in the next cell.
t = test_gen("Conservation Scientists", structure=True)

In [18]:
# Display the structured response.
t

{'occupation': 'Conservation Scientist',
 'tasks': ['1. Develop and implement habitat restoration plans to enhance biodiversity in degraded ecosystems.',
  '2. Collaborate with stakeholders to identify conservation priorities and develop strategic plans for species reintroduction programs.',
  '3. Conduct field surveys to monitor population trends, behavior, and habitat use of target species.',
  '4. Analyze and interpret data from remote sensing, GIS, and other technologies to assess ecosystem health and track changes over time.',
  '5. Develop and implement invertebrate conservation strategies, including insect sampling protocols and habitat management plans.',
  '6. Work with landowners, farmers, and policymakers to promote sustainable agriculture practices and reduce pesticide use.',
  '7. Design and conduct experiments to test the effectiveness of conservation techniques, such as species removal or relocation programs.',
  '8. Develop and maintain spatial models of ecosystem proce