In [23]:
#for loading data
import pandas as pd
import json

#for llm
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel

#logging
import pickle

#similarity
import regex as re
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment

#visualization
#plot matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load data

In [24]:
job_statements = pd.read_excel("datasets/task_statements.xlsx")
job_statements.columns = job_statements.columns.str.lower()
job_statements = job_statements.drop(labels=["incumbents responding","date","domain source"], axis=1).rename(columns={"o*net-soc code":"code", "task type":"type", "task id": "id", "task":"ref_task"})
job_statements = job_statements[~job_statements["type"].str.contains("Supplemental", case=False, na=True)]
job_statements["ind"] = job_statements["code"].str[:2]
job_statements = job_statements.groupby("title").agg({"ref_task":list, "ind": "first"}).reset_index().sort_values("ind")
sampled_occupation = job_statements.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1) #43 samples
sampled_occupation

Unnamed: 0,title,ref_task,ind
478,Lodging Managers,[Answer inquiries pertaining to hotel policies...,11
777,Spa Managers,"[Respond to customer inquiries or complaints.,...",11
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11
203,Customs Brokers,[Prepare and process import and export documen...,13
386,Government Property Inspectors and Investigators,"[Prepare correspondence, reports of inspection...",13
856,Video Game Designers,[Balance and adjust gameplay experiences to en...,15
559,Nanosystems Engineers,[Provide scientific or technical guidance or e...,17
430,Industrial Engineers,"[Estimate production costs, cost saving method...",17
799,Surveying and Mapping Technicians,"[Position and hold the vertical rods, or targe...",17
170,Conservation Scientists,[Apply principles of specialized fields of sci...,19


In [25]:
#for trial
trial_df = sampled_occupation.sample(5, random_state= 1)
test_sample_list =[trial_df.iloc[x]["title"] for x in range(5)]

### set up matching function

In [26]:
#get reference description
def get_des (occupation):
    task_list = sampled_occupation.query("title == @occupation")["ref_task"].iloc[0]
    return task_list

In [27]:
def task_gen(occupation,model):
    class task_description(BaseModel):
        count_statements: int
        statements: list[str]


    #initialize model
    model= model

    query = "Generate "+str(len(get_des(occupation)))+" task statements that "+ occupation +" would perform at work."

    prompt_template = ChatPromptTemplate([
        # ("system", "your role is a {name}. Respond like a {name}."),
        ("human","{input}")
        ]
    )
    structured_llm = model.with_structured_output(schema=task_description.model_json_schema())

    prompt = prompt_template.invoke({"occupation": occupation, "input": query})
    response = structured_llm.invoke(prompt)

    return response
    

In [29]:
# parse response
def parse_response(response):
    try:
        parsed = json.loads(response["statements"])
        return parsed
    except:
        try:
            parsed = response["statements"]
            return parsed
        except:
            return np.nan

In [30]:
#pre process text
def preProcessText(text=list):
	processed = []
	for doc in text:
		doc = re.sub(r"\\n", "", doc)
		doc = re.sub(r"\W", " ", doc) #remove non words char
		doc = re.sub(r"\d"," ", doc) #remove digits char
		doc = re.sub(r'\s+[a-z]\s+', " ", doc) # remove a single char
		doc = re.sub(r'^[a-z]\s+', "", doc) #remove a single character at the start of a document
		doc = re.sub(r'\s+', " ", doc)  #replace an extra space with a single space
		doc = re.sub(r'^\s', "", doc) # remove space at the start of a doc
		doc = re.sub(r'\s$', "", doc) # remove space at the end of a document
		processed.append(doc.lower())
	return processed

In [31]:
#get similarity score
def sbert(ref, gen):
    sim_model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")

    # Compute embeddings for both lists
    embeddings_ref = sim_model.encode(ref)
    embeddings_gen = sim_model.encode(gen)


    # Compute cosine similarities
    similarities = sim_model.similarity(embeddings_ref, embeddings_gen).numpy()
    return similarities

In [32]:
#plot graph
def plot(similarities):
    x_label = [x+1 for x in range(len(similarities))]
    y_label = [chr(num + 96) for num in x_label]
    plt.figure(figsize=(12, 12))
    sns.heatmap(similarities, annot=True, fmt=".3f", cmap="crest", xticklabels=x_label, yticklabels=y_label)
    plt.title("Sentence Cosine Similarity for "+ test_occ)
    plt.xlabel("O*NET Data")
    plt.ylabel("Generated Sentence")
    plt.savefig("results/"+test_occ+".png")
    plt.show()
    return plt

In [33]:
def match(ref, gen):
    try:
        ref_clean = preProcessText(ref)
        gen_clean = preProcessText(gen)
        matrix = sbert(ref_clean, gen_clean)
        row_ind, col_ind = linear_sum_assignment(1 - matrix)  # Minimize cost (1 - similarity)
        assigned_similarities = matrix[row_ind, col_ind]
        return np.mean(assigned_similarities), matrix, row_ind.tolist(), col_ind.tolist()
    except:
        return np.nan

### packaging things for repeated excution

In [42]:
model = ChatOllama(model="mixtral", temperature=0.8)

In [43]:
shit = task_gen("Graphic Designers", model)
shit

ResponseError: registry.ollama.ai/library/mixtral:latest does not support tools (status code: 400)

In [35]:
for occu in test_sample_list:
    generated_statements = task_gen(occu, model)
    trial_df.loc[trial_df["title"] == occu, "gen_task"] = [generated_statements]

trial_df


Unnamed: 0,title,ref_task,ind,gen_task
203,Customs Brokers,[Prepare and process import and export documen...,13,
833,Training and Development Managers,[Analyze training needs to develop new trainin...,11,
178,"Cooks, Institution and Cafeteria",[Monitor and record food temperatures to ensur...,35,
607,"Painting, Coating, and Decorating Workers","[Apply coatings, such as paint, ink, or lacque...",51,
388,Graphic Designers,[Key information into computer equipment to cr...,27,


In [36]:
result_df = trial_df.reset_index(drop=True)
result_df["parsed_response"] = result_df["gen_task"].apply(parse_response)
result_df = result_df.dropna()
result_df

# result_df

Unnamed: 0,title,ref_task,ind,gen_task,parsed_response


In [37]:
result_df[["score", "matrix", "ref_order", "gen_order"]] = result_df.apply(lambda row: match(row["ref_task"], row["parsed_response"]), axis=1).apply(pd.Series)
result_df

ValueError: Columns must be same length as key

In [None]:
with open('/results/result.json', 'w') as f:
    f.write(result_df.to_json(index=True))

FileNotFoundError: [Errno 2] No such file or directory: '/results/result.json'

In [None]:
savejson = result_df.describe()
with open('/results/score.txt', 'w') as f:
    f.write(savejson.to_json(index=True))

NameError: name 'result_df' is not defined

In [None]:
def combine(doc):
    with open(doc) as file:
        df = pd.read_json(file).T
        return df

In [None]:
df_merged = pd.DataFrame()
for x in range(4):
    df = combine("output"+str(x)+".txt")
    df_merged = pd.concat([df_merged, df])
df_merged

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,5.0,0.437824,0.075287,0.380565,0.380905,0.391284,0.49502,0.541346
score,5.0,0.437824,0.075287,0.380565,0.380905,0.391284,0.49502,0.541346
score,5.0,0.437824,0.075287,0.380565,0.380905,0.391284,0.49502,0.541346
score,5.0,0.437824,0.075287,0.380565,0.380905,0.391284,0.49502,0.541346
