In [1]:
#for loading data
import pandas as pd
import json
import requests

#for llm
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel

#counting
from tqdm import tqdm

#logging
import pickle
import regex as re
import os
from datetime import datetime

In [2]:
# Generate the folder name with current date and time
folder_name = 'results/job_match_'+datetime.now().strftime("%d%m_%H%M")

# Create the folder if it does not exist
os.makedirs(folder_name, exist_ok=True)

### Set up occupation data sample

In [57]:
# set up occupation data
occupations = pd.read_excel('datasets/occupation_data.xlsx').dropna()

occupations.columns = occupations.columns.str.lower()
#rename the column 
occupations = occupations.rename(columns={'o*net-soc code':'code'})   
#drop rows with "all other" in the content
occupations = occupations[~occupations['title'].str.contains("All Other")]
#change data type
occupations['code'] = occupations['code'].astype(str)
occupations['title'] = occupations['title'].astype(str)
occupations['description'] = occupations['description'].astype(str)

occupations["ind"] = occupations["code"].str[:2]
# discard rows with ind = 55
occupations = occupations[occupations['ind'] != '55']


# #get jobzone df
# jz = pd.read_excel('datasets/job_zone.xlsx')
# jz.columns = jz.columns.str.lower().str.replace(" ","_")
# jz = jz.drop(labels=['date', 'domain_source','o*net-soc_code'], axis=1)   
# jz = jz.rename(columns={'job_zone':'zone'})

#merge occupation and jobzone
# occupations = pd.merge(occupations, jz, on='title', how='left')
sampled_occupation = occupations.groupby('ind', group_keys=False).sample(frac=0.05, random_state=1).reset_index(drop=True) #47 samples
sampled_occupation

Unnamed: 0,code,title,description,ind
0,11-9141.00,"Property, Real Estate, and Community Associati...","Plan, direct, or coordinate the selling, buyin...",11
1,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",11
2,11-9161.00,Emergency Management Directors,Plan and direct disaster response or crisis ma...,11
3,13-1199.05,Sustainability Specialists,"Address organizational sustainability issues, ...",13
4,13-1082.00,Project Management Specialists,"Analyze and coordinate the schedule, timeline,...",13
5,15-2051.01,Business Intelligence Analysts,Produce financial and market intelligence by q...,15
6,15-1255.00,Web and Digital Interface Designers,Design digital user interfaces or websites. De...,15
7,17-2051.00,Civil Engineers,"Perform engineering duties in planning, design...",17
8,17-1022.00,Surveyors,Make exact measurements and determine property...,17
9,17-2141.00,Mechanical Engineers,Perform engineering duties in planning and des...,17


In [38]:
#count the number of samples per zone
sampled_occupation['zone'].value_counts()

zone
4    15
2    11
3    10
5     7
1     3
Name: count, dtype: int64

In [7]:
test_sample = sampled_occupation.sample(5, random_state=1)


In [8]:
occupation_dict = {sampled_occupation["title"].iloc[x]: sampled_occupation["code"].iloc[x]for x in range(len(sampled_occupation))}
test_dict = {test_sample["title"].iloc[x]: test_sample["code"].iloc[x]for x in range(len(test_sample))}
test_dict

{'First-Line Supervisors of Police and Detectives': '33-1012.00',
 'Cytotechnologists': '29-2011.02',
 'Butchers and Meat Cutters': '51-3021.00',
 'Elevator and Escalator Installers and Repairers': '47-4021.00',
 'Emergency Management Directors': '11-9161.00'}

Load Question bank

In [40]:
#get the questions into a list
with open("datasets/60qs.json") as f:
    qs = json.load(f)
test = qs["questions"]["question"]
df = pd.DataFrame(test)[['text', 'area', '_index']]
df.columns = ['question', 'area', 'index']
qlist = list(df["question"])

test_q = qlist[:5]
test_q

['Build kitchen cabinets',
 'Lay brick or tile',
 'Develop a new medicine',
 'Study ways to reduce water pollution',
 'Write books or plays']

### Set up related occupation data

In [44]:
#whole dataset
related = pd.read_excel('datasets/related_occupations.xlsx').astype(str)
related.columns = related.columns.str.lower().str.replace(" ","_").str.replace("o*net-soc_", "")
related = related[related["relatedness_tier"] == "Primary-Short"]
related

Unnamed: 0,code,title,related_code,related_title,relatedness_tier,index
0,11-1011.00,Chief Executives,11-1021.00,General and Operations Managers,Primary-Short,1
1,11-1011.00,Chief Executives,11-2032.00,Public Relations Managers,Primary-Short,2
2,11-1011.00,Chief Executives,11-9151.00,Social and Community Service Managers,Primary-Short,3
3,11-1011.00,Chief Executives,11-3031.01,Treasurers and Controllers,Primary-Short,4
4,11-1011.00,Chief Executives,11-9199.02,Compliance Managers,Primary-Short,5
...,...,...,...,...,...,...
18440,53-7121.00,"Tank Car, Truck, and Ship Loaders",53-7051.00,Industrial Truck and Tractor Operators,Primary-Short,1
18441,53-7121.00,"Tank Car, Truck, and Ship Loaders",53-7062.00,"Laborers and Freight, Stock, and Material Move...",Primary-Short,2
18442,53-7121.00,"Tank Car, Truck, and Ship Loaders",53-7011.00,Conveyor Operators and Tenders,Primary-Short,3
18443,53-7121.00,"Tank Car, Truck, and Ship Loaders",47-5044.00,"Loading and Moving Machine Operators, Undergro...",Primary-Short,4


### API call score function

In [2]:
#access the api to get the job titles
def get_career(answer):

    url = 'https://services.onetcenter.org/ws/mnm/interestprofiler/careers?answers='+answer+'&start=1&end=1000'
    cookies = {
        'developer_login': 'dW5pX21hbm5oZWltX2RlMTowMDU1ODEyOTFiYzRjYTYxNGE5YmJlM2E4ZjgyNjk2NWQxNzFiY2Y0',
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.5',
        'Authorization': 'Basic dW5pX21hbm5oZWltX2RlMTo3MzM5Y3R1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Priority': 'u=0, i',
    }

    params = {
        'start': '1',
        'end': '60',
    }

    response = requests.get(
        url,
        params=params,
        cookies=cookies,
        headers=headers,
    )
    #search for the the target occupation in the response
    data = json.loads(response.text)["career"]
    #select only title and fit
    career = pd.DataFrame(data).drop(["href", "tags"], axis=1)
    return career 

In [42]:
#match suggested career to the related career
def matching_score(code, career):
    #get direct match
    best_fit = career[career['fit'] == 'Best']
    if sum(code == best_fit["code"]) == 1:
        print(1)
    else:
        #get related match
        related_occ = related.query('code == @code & (relatedness_tier == "Primary-Short")')
        related_occ = related_occ.merge(best_fit,how="inner", left_on= "related_code", right_on="code")
        print (len(related_occ)/5)
        return related_occ

### set up LLM

In [50]:
def basic_rating(model, system):
    model= ChatOllama(model="llama3.1", temperature=1)

    chat_history = []
    task = """rating the statement from 1 to 5 depending on your interest. reply with the follow valid json format: {'reason': "A short sentence explaining the reasoning","answer": <number between 1-5>}"""

    prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "you are an experienced {name}. follow the instruction given to you, but think and feel like a {name}'",
            ),
            MessagesPlaceholder(variable_name="chat_history"),
            (
                "human",
                "{input}"
            )
        ]
    )

    for q in test_q:
        prompt = prompt_template.invoke({"name": "Clinical and Counseling Psychologists", "chat_history" : chat_history[-10:], "input": task + q})
        response = model.invoke(prompt)
        chat_history.append(HumanMessage(content=q))
        chat_history.append(response.content)

In [None]:
def get_rating(title, model, code, system = None ):
    class rating(BaseModel):
            do_you_like_it: str
            provide_thoughts: str
            your_rating: int

    chat_history = []
    to_parse=[]
    query = "think about your persona and rate the statement with a number between 1 to 5 depending on your interest. 1 is strongly dislike and 5 is strongly like."

    prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "you are an experienced {name}. follow the instruction given to you, but think and feel like a {name}'",
            ),
            MessagesPlaceholder(variable_name="chat_history"),
            (
                "human",
                "{input}"
            )
        ]
    )

    structured_llm = model.with_structured_output(schema=rating.model_json_schema())
    for q in tqdm(qlist):
        prompt = prompt_template.invoke({"name": title, "chat_history" : chat_history[-10:], "input": query + q})
        response = structured_llm.invoke(prompt)
        chat_history.append(HumanMessage(content=q))
        chat_history.append(AIMessage(str(response)))
        to_parse.append(response["your_rating"])
    
    
    with open(folder_name + '/' + code + '.pkl', 'wb') as f:
        pickle.dump(chat_history, f)
    answer = ''.join(map(str, to_parse))
    return answer

In [41]:
def get_test_rating(title, model, code, system = None ):
    class rating(BaseModel):
            do_you_like_it: str
            provide_thoughts: str
            your_rating: int

    chat_history = []
    to_parse=[]
    query = "think about your persona and rate the statement with a number between 1 to 5 depending on your interest. 1 is strongly dislike and 5 is strongly like."

    prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "you are an experienced {name}. follow the instruction given to you, but think and feel like a {name}'",
            ),
            MessagesPlaceholder(variable_name="chat_history"),
            (
                "human",
                "{input}"
            )
        ]
    )

    structured_llm = model.with_structured_output(schema=rating.model_json_schema())
    for q in tqdm(test_q):
        prompt = prompt_template.invoke({"name": title, "chat_history" : chat_history[-10:], "input": query + q})
        response = structured_llm.invoke(prompt)
        chat_history.append(HumanMessage(content=q))
        chat_history.append(AIMessage(str(response)))
        to_parse.append(response["your_rating"])
    
    
    with open(folder_name + '/' + code + '.pkl', 'wb') as f:
        pickle.dump(chat_history, f)
    answer = ''.join(map(str, to_parse))
    return answer

In [38]:
llm_test = ChatOllama(model="llama3.1", temperature=1)

In [47]:
a ="421122121112111115115114111112111111111155111411411134124541" #models
"422452221155111111511511111514511151515121551351454111111341" #chat_emergency_management_directors
"412512142234422221411211511115411441411151111441511111111114" #chat_team_assemblers
"515511151111155111511111115513511351515511145151555511211511" #chat_environmental_science_and_protection_technicians_including_health
"411111111414111111511111111111111151511111111151511111111111" #chat_butchers_and_meat_cutters
"411511151152111111421511111114152252524541351451514111111121" #chat_project_management_specialists
"511111111414511111511411111115115541531111111151511111141111" #chat_elevator_and_escalator_installers_and_repairers
"411511111122111111511111111314531151514111211151555111111121" #chat_sustainability_specialists
"511411111151511511511511511214511451544151111151515111115411" #chat_mechanical_engineers
"511411111111111111511411111311111251113111112144421222232221" #chat_surveyors


'511411111111111111511411111311111251113111112144421222232221'

### answer extraction

### Job matching

In [73]:
#call api and get career suggestions
career = get_career(a)
best_fit = career[career["fit"] == "Best"]
best_fit

Unnamed: 0,fit,code,title
0,Best,27-2011.00,Actors
1,Best,27-1011.00,Art Directors
2,Best,27-3011.00,Broadcast Announcers & Radio Disc Jockeys
3,Best,27-2032.00,Choreographers
4,Best,39-3092.00,Costume Attendants
5,Best,27-2091.00,Disc Jockeys
6,Best,27-1022.00,Fashion Designers
7,Best,27-4032.00,Film & Video Editors
8,Best,27-1024.00,Graphic Designers
9,Best,27-1025.00,Interior Designers


In [None]:
for title, code in test_dict.items():
    answer = get_test_rating(title, llm_test, code)
    career = get_career(answer)

    with open(folder_name + '/' + code + '.json', 'w') as f:
        f.write(career.to_json(index=True))


0.0
0.0
0.0
0.0
0.0


In [3]:
x = get_career("332333112213111111431313111111313235133113511133213131331341")

In [None]:
x.

Unnamed: 0,fit,code,title
0,Best,43-3011.00,Bill & Account Collectors
1,Best,41-2011.00,Cashiers
2,Best,39-6012.00,Concierges
3,Best,43-4021.00,Correspondence Clerks
4,Best,41-2021.00,Counter & Rental Clerks
...,...,...,...
103,Good,35-3023.00,Fast Food & Counter Workers
104,Good,45-2041.00,"Graders & Sorters, Agricultural Products"
105,Good,25-9031.00,Instructional Coordinators
106,Good,37-2012.00,Maids & Housekeeping Cleaners
