In [1]:
#for loading data
import pandas as pd
import json
import requests

#for llm
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

#counting
from tqdm import tqdm

#logging
import pickle

### Set up occupation data sample

In [None]:
#set up industry data
industries = pd.read_excel('datasets/industry.xlsx').astype(str)
industries['industry'] = industries["industry"].str.replace("Occupations", "")
# set up occupation data
occupations = pd.read_excel('datasets/occupation_data.xlsx')
occupations = occupations.dropna()
occupations.columns = occupations.columns.str.lower()
#rename the column 
occupations = occupations.rename(columns={'o*net-soc code':'code'})   
#drop rows with "all other" in the content
occupations = occupations[~occupations['title'].str.contains("All Other")]
#change data type
occupations['code'] = occupations['code'].astype(str)
occupations['title'] = occupations['title'].astype(str)
occupations['description'] = occupations['description'].astype(str)
#match industry code with industry name from induestries
occupations["industry"] = occupations['code'].apply(lambda x:x[0:2]).map(industries.set_index("industry_code")["industry"])
occupations.dropna().reset_index().drop(["index"], axis=1)
#stratified sampling by industry
sampled_occupation = occupations.groupby('industry', group_keys=False).sample(frac=0.05, random_state=1)
test_sample = sampled_occupation.sample(5, random_state= 2)
i = 1
print(test_sample.iloc[i].code, test_sample.iloc[i].title)

47-5081.00 Helpers--Extraction Workers


In [3]:
for i in range(len(test_sample)):
    print(test_sample.iloc[i].title)

Manufactured Building and Mobile Home Installers
Helpers--Extraction Workers
Intelligence Analysts
Fishing and Hunting Workers
Animal Caretakers


In [14]:
#single title
code = test_sample.iloc[0].code #"19-3033.00"
title = test_sample.iloc[0].title
title

'Manufactured Building and Mobile Home Installers'

In [5]:
#multiple
code_l = list(test_sample.code)
title_l = list(test_sample.title)

Load Question bank

In [6]:
#get the questions into a list
with open("datasets/60qs.json") as f:
    qs = json.load(f)
test = qs["questions"]["question"]
df = pd.DataFrame(test)[['text', 'area', '_index']]
df.columns = ['question', 'area', 'index']
qlist = list(df["question"])

### Set up related occupation data

In [7]:
#whole dataset
related = pd.read_excel('datasets/related_occupations.xlsx').astype(str)
related.columns = related.columns.str.lower()
related = related.rename(columns={'o*net-soc code':'code','related o*net-soc code': 'related_code' })
related.columns = related.columns.str.replace(" ","_")
related

Unnamed: 0,code,title,related_code,related_title,relatedness_tier,index
0,11-1011.00,Chief Executives,11-1021.00,General and Operations Managers,Primary-Short,1
1,11-1011.00,Chief Executives,11-2032.00,Public Relations Managers,Primary-Short,2
2,11-1011.00,Chief Executives,11-9151.00,Social and Community Service Managers,Primary-Short,3
3,11-1011.00,Chief Executives,11-3031.01,Treasurers and Controllers,Primary-Short,4
4,11-1011.00,Chief Executives,11-9199.02,Compliance Managers,Primary-Short,5
...,...,...,...,...,...,...
18455,53-7121.00,"Tank Car, Truck, and Ship Loaders",51-9012.00,"Separating, Filtering, Clarifying, Precipitati...",Supplemental,16
18456,53-7121.00,"Tank Car, Truck, and Ship Loaders",53-7071.00,Gas Compressor and Gas Pumping Station Operators,Supplemental,17
18457,53-7121.00,"Tank Car, Truck, and Ship Loaders",51-8091.00,Chemical Plant and System Operators,Supplemental,18
18458,53-7121.00,"Tank Car, Truck, and Ship Loaders",53-7062.04,Recycling and Reclamation Workers,Supplemental,19


### API call score function

In [8]:
#access the api to get the job titles
def get_career(answer):

    url = 'https://services.onetcenter.org/ws/mnm/interestprofiler/careers?answers='+answer+'&start=1&end=1000'
    cookies = {
        'developer_login': 'dW5pX21hbm5oZWltX2RlMTowMDU1ODEyOTFiYzRjYTYxNGE5YmJlM2E4ZjgyNjk2NWQxNzFiY2Y0',
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.5',
        'Authorization': 'Basic dW5pX21hbm5oZWltX2RlMTo3MzM5Y3R1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Priority': 'u=0, i',
    }

    params = {
        'start': '1',
        'end': '60',
    }

    response = requests.get(
        url,
        params=params,
        cookies=cookies,
        headers=headers,
    )
    #search for the the target occupation in the response
    data = json.loads(response.text)["career"]
    #select only title and fit
    career = pd.DataFrame(data).drop(["href", "tags"], axis=1)
    return career 

In [9]:
#match suggested career to the related career
def matching_score(code, career):
    #get direct match
    best_fit = career[career['fit'] == 'Best']
    if sum(code == best_fit["code"]) == 1:
        print(1)
    else:
        #get related match
        related_occ = related.query('code == @code & (relatedness_tier == "Primary-Short" | relatedness_tier == "Primary-Long")')
        related_occ = related_occ.merge(career,how="inner", left_on= "related_code", right_on="code")
        print (len(related_occ)/10)
        return related_occ

### set up LLM

In [16]:
#small test
small_list = qlist[:5]


#json
json_schema ={
  "type": "object",
  "title": "interest",
  "description": "reasoning and rating of an items. reason first, then rate",
  "properties": {
    "interest": {
      "type": "integer",
      "description": "final rating, from 1(strongly dislike) to 5(strongly like)"
    },
    "_reasoning": {
      "type": "string",
      "description": "The thinking and reasoning"
    }

  },
  "required": ["_reasoning", "interest"]
}

#initialize model
model= ChatOllama(model="llama3.1", temperature=1, format= 'json')
#model= ChatOllama(model="llama3.1", temperature=1)

chat_history = []
answer = []
task = """rate the follow statement from 1 (strongly dislike) to 5 (strongly like) depending on your interest: """

prompt_template = ChatPromptTemplate([
        ("system", "your role is a {name}. Think, feel, perceive, judge, behave, observe, respond, and act like a {name}.start you thinking with 'as a {name}'..."),
        ("placeholder", "{chat_history}"),
        ("human","{input}")
        ]
)
structured_llm = model.with_structured_output(schema=json_schema)

for q in tqdm(small_list):
  prompt = prompt_template.invoke({"name": title, "chat_history" : chat_history[-10:], "input": task +q})
  response = structured_llm.invoke(prompt)
  # response = model.invoke(prompt)
  chat_history.append(HumanMessage(content=q))
  chat_history.append(AIMessage(str(response)))
  # answer.append(response["interest"])
# processed = "".join(str(answer[i])for i in range(len(answer)))
chat_history



100%|██████████| 5/5 [00:29<00:00,  5.99s/it]


[HumanMessage(content='Build kitchen cabinets', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Building kitchen cabinets aligns with my profession as a Manufactured Building and Mobile Home Installers, providing an opportunity to utilize my skills in carpentry.', 'interest': 4}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Lay brick or tile', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Laying brick or tile aligns with my profession as a Manufactured Building and Mobile Home Installers, providing an opportunity to utilize my skills in masonry.', 'interest': 4}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Develop a new medicine', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Developing a new medicine is not aligned with my profession as a Manufactured Building and Mobile Home Installers.', 'interest': 1}", additional_kwargs={}, respons

In [9]:
def sim_ans():
    import random
    num = [random.randint(1,5) for x in range(60)]
    num_s = "".join(str(num[i])for i in range(60))
    return num_s

In [19]:
for code in code_l:
    print(matching_score(code, get_career(sim_ans())))

0.0
Empty DataFrame
Columns: [code_x, title_x, related_code, related_title, relatedness_tier, index, fit, code_y, title_y]
Index: []
0.0
Empty DataFrame
Columns: [code_x, title_x, related_code, related_title, relatedness_tier, index, fit, code_y, title_y]
Index: []
0.0
Empty DataFrame
Columns: [code_x, title_x, related_code, related_title, relatedness_tier, index, fit, code_y, title_y]
Index: []
1
None
1
None


In [35]:
with open("chatSales Representatives of Services, Except Advertising, Insurance, Financial Services, and Travel.pkl", "rb") as file:
    data = pickle.load(file)

jstr = data[1].content.replace("'",'"')
dd = json.loads(jstr)["_reasoning"]
dd

'Build kitchen cabinets. A very useful project for home renovation.'

In [36]:
data

[HumanMessage(content='Build kitchen cabinets', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Build kitchen cabinets. A very useful project for home renovation.', 'interest': '4'}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Lay brick or tile', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Lay brick or tile. A very useful project for construction.', 'interest': '2'}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Develop a new medicine', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Develop a new medicine. A very useful project for health and wellness.', 'interest': '4'}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Study ways to reduce water pollution', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'Study ways to reduce water pollution. A very useful project for environmental co

In [18]:
def askllm(title):
  #json
  json_schema ={
    "type": "object",
    "title": "interest",
    "description": "reasoning and rating of an items. reason first, then rate",
    "properties": {
      "interest": {
        "type": "integer",
        "description": "final rating, from 1(strongly dislike) to 5(strongly like)"
      },
      "_reasoning": {
        "type": "string",
        "description": "The thinking and reasoning"
      }

    },
    "required": ["_reasoning", "interest"]
  }

  model= ChatOllama(model="llama3.1", temperature=1, format= 'json')

  chat_history = []
  answer = []
  task = """rate the follow statement from 1 (strongly dislike) to 5 (strongly like) depending on your interest: """

  prompt_template = ChatPromptTemplate([
          ("system", "your role is a {name}. Think, feel, perceive, judge, behave, observe, respond, and act like a {name}. Start you thinking with 'as a {name}'..."),
          ("placeholder", "{chat_history}"),
          ("human","{input}")
          ]
  )
  structured_llm = model.with_structured_output(schema=json_schema)

  for q in tqdm(qlist):
    prompt = prompt_template.invoke({"name": title, "chat_history" : chat_history[-10:], "input": task +q})
    response = structured_llm.invoke(prompt)
    chat_history.append(HumanMessage(content=q))
    chat_history.append(AIMessage(str(response)))
    answer.append(response["interest"])
  processed = "".join(str(answer[i])for i in range(len(answer)))
  with open("chat"+title+".pkl", "wb") as file:
    pickle.dump((chat_history), file)
  print(chat_history)
  return processed



In [19]:
ans_list = []
for i in range(len(test_sample)):
    
    anss = askllm(test_sample.iloc[i].title)
    ans_list.append(anss)
ans_list

100%|██████████| 60/60 [06:41<00:00,  6.68s/it]


[HumanMessage(content='Build kitchen cabinets', additional_kwargs={}, response_metadata={}), AIMessage(content="{'_reasoning': 'Build kitchen cabinets is a fun and creative project that requires some skill and knowledge of carpentry, but the end result is well worth it.', 'interest': 4}", additional_kwargs={}, response_metadata={}), HumanMessage(content='Lay brick or tile', additional_kwargs={}, response_metadata={}), AIMessage(content="{'_reasoning': 'As a Manufactured Building and Mobile Home Installers, laying brick or tile is not part of my expertise, I prefer building and installing manufactured buildings and mobile homes.', 'interest': 1}", additional_kwargs={}, response_metadata={}), HumanMessage(content='Develop a new medicine', additional_kwargs={}, response_metadata={}), AIMessage(content="{'_reasoning': 'Developing a new medicine is not part of my expertise as a Manufactured Building and Mobile Home Installers.', 'interest': 1}", additional_kwargs={}, response_metadata={}), 

 27%|██▋       | 16/60 [01:48<04:58,  6.80s/it]


KeyboardInterrupt: 

In [23]:
career = get_career(ans_list[0])

In [24]:
career.head(50)

Unnamed: 0,fit,code,title
0,Best,51-9191.00,Adhesive Bonding Machine Operators & Tenders
1,Best,45-2091.00,Agricultural Equipment Operators
2,Best,49-3011.00,Aircraft Mechanics & Service Technicians
3,Best,53-6032.00,Aircraft Service Attendants
4,Best,51-2011.00,"Aircraft Structure, Surfaces, Rigging, & Syste..."
5,Best,53-3011.00,Ambulance Drivers & Attendants
6,Best,45-2021.00,Animal Breeders
7,Best,39-2021.00,Animal Caretakers
8,Best,33-9011.00,Animal Control Workers
9,Best,39-2011.00,Animal Trainers


In [44]:
history = [result[0][1::2][i].content for i in range(len(result[0][1::2]))]
history

["{'_reasoning': 'Build kitchen cabinets is a rewarding DIY project that allows you to create custom storage solutions for your home.', 'interest': 4}",
 "{'_reasoning': 'Lay brick or tile', 'interest': 2}",
 '{\'_reasoning\': "Develop a new medicine is a groundbreaking task that allows you to improve people\'s health and quality of life.", \'interest\': 4}',
 "{'_reasoning': 'Study ways to reduce water pollution is a meaningful task that allows you to contribute to the well-being of our planet and its inhabitants.', 'interest': 5}",
 "{'_reasoning': 'Write books or plays is a creative and expressive task that allows you to convey stories and ideas.', 'interest': 4}"]

In [10]:
# Initialize the model
model = ChatOllama(model="llama3.1", temperature=0)

# Chat history as a list of message objects
chat_history = []  # This would contain AIMessage and HumanMessage objects

# Define the task
task = """Rate the statement from 1 to 5 depending on your interest."""

# Create the prompt template using MessageLikeRepresentation
messages = [
    {
        "role": "system",
        "content": "You are an experienced {name}. Follow the instruction given to you, but think and behave like a {name}. Your response must be valid JSON matching the provided schema."
    },
    {
        "role": "user",
        "content": "{input}"
    },
    {
        "role": "placeholder",
        "content": "{chat_history}"
    }
]

prompt_template = ChatPromptTemplate.from_messages(messages)

# Create structured output model
structured_llm = model.with_structured_output(json_schema)

for q in small_list:
    # Generate the prompt
    prompt = prompt_template.format_messages(
        name=title,
        input=task +  q,
        chat_history = chat_history
    )

    # Get the response
    response = structured_llm.invoke(prompt)

    # If you need to add to chat history
    chat_history.append(HumanMessage(content=q))
    chat_history.append(AIMessage(content=str(response)))

NameError: name 'json_schema' is not defined

In [62]:
chat_history

[HumanMessage(content='Build kitchen cabinets', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'The task of building kitchen cabinets is moderately challenging, requiring precision and attention to detail. It involves measuring, cutting, assembling, and finishing various components.', 'interest': 4}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Lay brick or tile', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'The task of laying brick or tile is challenging, requiring manual dexterity and attention to detail. It involves measuring, cutting, and placing tiles or bricks with precision.', 'interest': 3}", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Develop a new medicine', additional_kwargs={}, response_metadata={}),
 AIMessage(content="{'_reasoning': 'The task of developing a new medicine is challenging, requiring advanced knowledge and skills in various fields such as chemi

In [None]:
model= ChatOllama(model="llama3.1", temperature=1)

chat_history = []
task = """rating the statement from 1 to 5 depending on your interest. reply with the follow valid json format: {'reason': "A short sentence explaining the reasoning","answer": <number between 1-5>}"""

prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "you are an experienced {name}. follow the instruction given to you, but think and feel like a {name}'",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        (
            "human",
            "{input}"
        )
    ]
)

for q in qlist:
    prompt = prompt_template.invoke({"name": "Clinical and Counseling Psychologists", "chat_history" : chat_history[-10:], "input": task + q})
    response = model.invoke(prompt)
    chat_history.append(HumanMessage(content=q))
    chat_history.append(response)

In [61]:
chat_history

[HumanMessage(content='Build kitchen cabinets', additional_kwargs={}, response_metadata={}),
 "{'reason': 'This is not a psychological topic, more of a practical/home improvement task.', 'answer': 1}",
 HumanMessage(content='Lay brick or tile', additional_kwargs={}, response_metadata={}),
 "{'reason': 'Not aligned with my training as a Clinical and Counseling Psychologist', 'answer': 2}",
 HumanMessage(content='Develop a new medicine', additional_kwargs={}, response_metadata={}),
 "{'reason': 'This aligns closely with my training in Clinical Psychology, potentially involving understanding of human health and well-being.', 'answer': 4}",
 HumanMessage(content='Study ways to reduce water pollution', additional_kwargs={}, response_metadata={}),
 '{\'reason\': "Aligns closely with my training in Clinical Psychology, potentially involving understanding of human health and well-being, as it also affects psychological well-being.", \'answer\': 4}',
 HumanMessage(content='Write books or plays'

In [88]:
ans =  [item[-4:-1].replace(" ","").replace(":","") for item in chat_history[1::2]]
answer = "".join(ans)


### answer extraction

### Job matching

In [92]:
#call api and get career suggestions
career = get_career(answer)
career

Unnamed: 0,fit,code,title
0,Best,25-3011.00,"Adult Basic Education, Adult Secondary Educati..."
1,Best,25-1031.00,"Architecture Teachers, Postsecondary"
2,Best,25-1062.00,"Area, Ethnic, & Cultural Studies Teachers, Pos..."
3,Best,29-1129.01,Art Therapists
4,Best,25-1121.00,"Art, Drama, & Music Teachers, Postsecondary"
...,...,...,...
78,Good,31-1122.00,Personal Care Aides
79,Good,31-2022.00,Physical Therapist Aides
80,Good,31-1133.00,Psychiatric Aides
81,Good,33-9094.00,School Bus Monitors


In [100]:
#get score: #of match titles / total # of related occupation(10) 
score = matching_score(code, career)
score

1
