In [1]:
# System message shared by the skill and salary extraction prompts.
system_prompt = "You are a helpful information extraction system."
# Human-message template for entity extraction. `{job_desc}` is the only
# placeholder. The prompt asks for a list of (entity, entity type) tuples
# written as a Python literal.
skill_template = """Given a job description, your task is to extract all entities and identify their entity types. 
Extract specific entities skills (skills could be knowledge area, tools, technology or personal abilities), job title, education qualification,years of experience,location. 
The output should be in a list of tuples of the following format: [("entity 1", "type of entity 1"), ... ].
Job Description: {job_desc}"""

# Human-message template for salary extraction. `{job_desc}` is the only
# placeholder. The prompt asks for a list of
# (salary range, location, type of compensation) tuples.
salary_template = '''Given a job description, extract salary ranges by location, type of compensation it could be base or total or NA. 
In the format [("salary range 1", "location 1", "type of compensation 1"), ...].
Job Description: {job_desc}'''

In [57]:
import ast
import asyncio
import os
import glob

from dotenv import load_dotenv
import openai
import pandas as pd
from tqdm import tqdm
import warnings

import langchain
from langchain.cache import SQLiteCache
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import BaseOutputParser


# set llm environment variables
load_dotenv()
_openai_key = os.getenv("OPENAI_KEY")
if not _openai_key:
    # Assigning None into os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with an actionable message instead.
    raise RuntimeError(
        "OPENAI_KEY is not set; define it in the environment or in a .env file "
        "before running this notebook."
    )
openai.api_key = _openai_key
# langchain's ChatOpenAI reads the key from OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = _openai_key

# setting up cache: LLM responses are stored in a local SQLite file
langchain.llm_cache = SQLiteCache(database_path="data/langchain_openai.db")

# set jupyter env variables
tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply
warnings.filterwarnings("ignore")  # NOTE: silences every warning for the whole session
pd.set_option("display.max_columns", None)

In [3]:
# Load the train/test splits of the job-description dataset.
# Forward slashes are used (as in the other paths of this notebook) because the
# previous "data\job_..." form relied on the invalid escape sequence "\j" and
# only resolved to the intended file on Windows.
train = pd.read_csv("data/job_description_train.csv")
test = pd.read_csv("data/job_description_test.csv")
print(train.shape)
print(test.shape)

(1020, 11)
(180, 11)


In [4]:
# Entity-extraction chain: system + human prompt piped into a deterministic
# (temperature 0) chat model.
skill_prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", skill_template)]
)
skill_chain = skill_prompt | ChatOpenAI(temperature=0)

# Try the chain on the first training description and display the raw reply text.
skill_chain.invoke({"job_desc": train["Description"][0]}).content

'[("Principal Distributed Systems Engineer", "job title"), \n("NVIDIA", "location"), \n("AI", "skills"), \n("deep learning", "skills"), \n("software 2.0 cloud platform", "skills"), \n("autonomous vehicles", "skills"), \n("robotics", "skills"), \n("virtual reality", "skills"), \n("healthcare", "skills"), \n("genomics", "skills"), \n("data science", "skills"), \n("Architect", "skills"), \n("distributed services", "skills"), \n("AI infrastructure", "skills"), \n("deep learning platforms", "skills"), \n("infrastructure", "skills"), \n("microservices", "skills"), \n("PB sized deep learning datasets", "skills"), \n("dataset management services", "skills"), \n("real and synthetic / simulated datasets", "skills"), \n("smart data selection", "skills"), \n("machine learning", "skills"), \n("technical leader", "skills"), \n("AI product teams", "skills"), \n("data and compute requirements", "skills"), \n("support infrastructure", "skills"), \n("AI applied researchers", "skills"), \n("future-proof 

In [5]:
# Salary-extraction chain: same system message, salary-specific human prompt,
# piped into a deterministic (temperature 0) chat model.
salary_prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", salary_template)]
)
salary_chain = salary_prompt | ChatOpenAI(temperature=0)

# Try the chain on the first training description and display the raw reply text.
salary_chain.invoke({"job_desc": train["Description"][0]}).content

'[("$268,000 - $414,000", "NA", "base"), ("NA", "NA", "total")]'

In [10]:
def extract_jd_info(jd, skill_chain=skill_chain, salary_chain=salary_chain):
    """Run both extraction chains on one job description.

    Args:
        jd: Job description text substituted into each prompt's ``job_desc`` slot.
        skill_chain: Chain used for entity extraction (defaults to the
            notebook-level ``skill_chain`` as it was when this cell ran).
        salary_chain: Chain used for salary extraction (defaults to the
            notebook-level ``salary_chain`` as it was when this cell ran).

    Returns:
        A ``(skill_output, salary_output)`` tuple holding the ``.content`` of
        each chain's reply, skill first.
    """
    # The skill chain is invoked before the salary chain, one after the other.
    return tuple(
        chain.invoke({'job_desc': jd}).content
        for chain in (skill_chain, salary_chain)
    )

In [13]:
# Spot-check both chains on the training description at index label 1;
# the cell displays the (skill_output, salary_output) pair.
extract_jd_info(train.Description[1])

('[("Senior Engineer", "job title"), ("Java", "skills"), ("J2EE", "skills"), ("Oracle", "skills"), ("MongoDB", "skills"), ("Gradle", "skills"), ("REST API", "skills"), ("Spring MVC", "skills"), ("DevOps", "skills"), ("CI/CD", "skills"), ("Docker", "skills"), ("GitHub", "skills"), ("Maven", "skills"), ("IntelliJ IDEA", "skills"), ("Windows Server", "skills"), ("Unix Servers", "skills"), ("Unix scripting", "skills"), ("SQL", "skills"), ("Minneapolis-area Target office", "location"), ("Bachelor’s degree in Computer Science, Computer Engineering, Information Technology, or a closely related field", "education qualification"), ("five (5) years of post-baccalaureate, progressive experience in systems administration/infrastructure support", "years of experience"), ("Java, Oracle, Windows Server, Unix Servers, Unix scripting, and SQL", "skills"), ("Minneapolis-area Target office", "location")]',
 '[("$130,998.00 - $150,800.00", "Brooklyn Park, MN 55445", "NA")]')

In [75]:
# Gather the rows tagged in earlier runs so their Job IDs can be excluded below.
# Paths are sorted so the concatenation order does not depend on filesystem order.
prediction_paths = sorted(glob.glob("data/prediction/*.csv"))
if prediction_paths:
    train_prev = pd.concat([pd.read_csv(path) for path in prediction_paths])
else:
    # pd.concat raises ValueError on an empty list (e.g. on the very first run,
    # before any prediction file exists); fall back to an empty frame.
    train_prev = pd.DataFrame(columns=["Job ID"])
print(train_prev.shape)
ids = train_prev["Job ID"].unique().tolist()
print(len(ids))

(387, 14)
387


In [76]:
# Keep only rows whose Job ID has not been tagged yet, then draw a reproducible batch.
train_int = train[~train["Job ID"].isin(ids)]
# Cap the batch at the number of rows left: sample() without replacement raises
# ValueError when asked for more rows than exist, which will happen once fewer
# than 200 untagged descriptions remain.
train_int = train_int.sample(n=min(200, len(train_int)), random_state=100).reset_index(drop=True)

In [77]:
# Run both extraction chains on every sampled description, with a tqdm progress bar.
# Each value stored in "openai_output" is the (skill_output, salary_output) pair.
train_int["openai_output"] = train_int["Description"].progress_apply(extract_jd_info)

  6%|▌         | 11/200 [02:16<50:59, 16.19s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
100%|██████████| 200/200 [1:10:52<00:00, 21.26s/it] 


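Extracting the desired information from job descriptions with OpenAI's gpt-3.5-turbo is slow: it takes roughly 40 to 70 minutes for 200 descriptions (the run above took 1 h 11 min, about 21 s per description).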

In [78]:
def parse_openai_response(res):
    """Parse a model reply that should be a Python-literal list of tuples.

    Args:
        res: Raw reply text, e.g. ``'[("Java", "skills")]'``.

    Returns:
        The parsed list (or tuple) when ``res`` is a valid Python literal
        sequence; otherwise ``[]``. Replies that fail to parse, and replies
        that parse to a non-sequence literal (a bare string, number, dict, ...),
        both yield ``[]`` so callers always get something they can iterate as
        a collection of entries.
    """
    try:
        parsed = ast.literal_eval(res)
    # Only the errors literal_eval raises on malformed input are swallowed;
    # a bare `except:` would also hide KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # A bare literal such as '"NA"' or '42' parses successfully but is not the
    # list of tuples the prompts ask for; treat it like an unparseable reply.
    if not isinstance(parsed, (list, tuple)):
        return []
    return parsed

def filter_responses(res):
    """Drop all-'NA' placeholder triples from a multi-entry response.

    Args:
        res: Sequence of parsed entries.

    Returns:
        ``res`` itself when it has fewer than two entries; otherwise a new
        list containing every entry that is not ``('NA', 'NA', 'NA')``.
    """
    placeholder = ('NA', 'NA', 'NA')
    # Responses with zero or one entry are passed through untouched.
    if len(res) < 2:
        return res
    return [entry for entry in res if entry != placeholder]

In [79]:
# Split the raw (skill_output, salary_output) pair into parsed columns.
train_int["skills"] = train_int["openai_output"].apply(
    lambda pair: parse_openai_response(pair[0])
)
# Salary replies are parsed, then stripped of ('NA', 'NA', 'NA') placeholders.
train_int["salary"] = train_int["openai_output"].apply(
    lambda pair: filter_responses(parse_openai_response(pair[1]))
)

# Keep only rows where both parsed columns are non-empty lists
# (compared through their string form, "[]").
has_skills = train_int["skills"].astype(str) != "[]"
has_salary = train_int["salary"].astype(str) != "[]"
train_int = train_int[has_skills & has_salary]

print(train_int.shape)

(193, 14)


In [80]:
# Persist the tagged batch without the index column. The file is written into
# data/prediction/, the folder that is globbed earlier in this notebook to
# collect already-tagged Job IDs.
train_int.to_csv("data/prediction/tagged_data_v4.csv", index=False)