In [1]:
from llama_index import PromptTemplate
from llama_index.llms import Bedrock, ChatMessage
from llama_index.program import LLMTextCompletionProgram
from llama_index.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
import pandas as pd

In [2]:
# AWS credentials must already be available to this process, e.g. loaded into
# the environment or configured beforehand in ~/.aws/credentials.
bedrock = Bedrock(
    model="anthropic.claude-v2",
    max_tokens=8000,
)

In [6]:
# Read the raw scraped Indeed dataset and pull the description column out as a
# plain Python list.
job_df = pd.read_csv(
    "../../data/bronze/apify/indeed/dataset_indeed-scraper_2023-10-23_07-07-15.csv"
)
descriptions = job_df["description"].tolist()

In [7]:
# Create the pydantic class to output into

class Skill(BaseModel):
    # One skill extracted from a job description.
    # NOTE(review): documented with `#` comments rather than a docstring on
    # purpose. pydantic copies a model docstring into the generated JSON
    # schema, and this class is reachable from the model passed to
    # PydanticOutputParser, so a docstring would presumably change what is
    # sent to the LLM. Confirm before converting.
    name: str  # name of the skill
    reference_text: str  # text from the description that the skill refers to

class Job(BaseModel):
    # Top-level output model handed to PydanticOutputParser: the skills
    # extracted from a single job description.
    # NOTE(review): kept as `#` comments instead of a docstring because a
    # pydantic docstring becomes part of the JSON schema, which presumably
    # feeds the prompt. Confirm before converting.
    skills: List[Skill]  # one entry per extracted skill

# Build the completion program that is called once per description and parses
# the model's answer into a `Job`.
# The prompt is split over two adjacent literals purely for line length; the
# concatenated text is unchanged.
job_program = LLMTextCompletionProgram.from_defaults(
    llm=bedrock,
    output_parser=PydanticOutputParser(Job),
    prompt_template_str=(
        "Extract the Job skills, with the reference text to that skill, "
        "from the following description: {description}"
    ),
    verbose=True,
)

In [8]:
# Smoke test: run the extraction program on the first description only.
test_output = job_program(
    description=descriptions[0],
)

In [9]:
# Check that every extracted skill is grounded in the posting it came from.
#
# The descriptions are scraped text, which can carry irregular runs of
# whitespace and invisible zero-width characters that a model will not
# reproduce byte-for-byte. Comparing the raw strings would then report a
# faithful quote as missing, so both sides are normalised first: zero-width
# characters are dropped and every whitespace run is collapsed to one space.
# Any reference that matched the raw text still matches after normalisation.
invisible_chars = dict.fromkeys((0x200B, 0x200C, 0x200D, 0x2060, 0xFEFF))
normalized_description = " ".join(descriptions[0].translate(invisible_chars).split())

for skill in test_output.skills:
    normalized_reference = " ".join(skill.reference_text.translate(invisible_chars).split())
    if normalized_reference in normalized_description:
        print(skill)
    else:
        print(skill.name, "is not in the description")

name='ETL' reference_text='Design, develop, and maintain ETL pipelines on Google Cloud Platform (GCP) to ensure efficient data extraction, transformation, and loading processes.'
name='Google Cloud Platform' reference_text='Proven experience in designing and developing ETL pipelines on Google Cloud Platform (GCP).'


In [10]:
# Try the extraction on the newer LinkedIn "Head of Product" postings.
# Note: this rebinds `descriptions`, replacing the list loaded from the Indeed
# dataset.
marco_csvs = pd.read_csv(
    "../../data/jobs_Head_of_Product_-_European_Union_-_Remote_-_LinkedIn.csv"
)
descriptions = marco_csvs["job_description"].tolist()
print(descriptions[0])

About the job
            
 
About RevolutPeople deserve more from their money. More visibility, more control, more freedom. And since 2015, Revolut has been on a mission to deliver just that. With an arsenal of aweso﻿me products that span spending, saving, travel, transfers, investing, exchanging and more, our super app has helped 35+ million customers get more from their money. And we're not done yet.  As we continue our lightning-fast growth,‌ two things are essential to continuing our success: our people and our culture. We've been officially certified as a Great Place to Work™ in recognition of our outstanding employee experience! So far, we have 7,500+﻿ people working around the world, from our great offices or remotely, on our mission. And we're looking for more. We want brilliant people that love building great products, love redefining success, and love turning the complexity of a chaotic world into the simplicity of a beautiful solution.About The RoleOn the Revolut rocket shi

In [8]:
# Show the loaded LinkedIn postings frame as this cell's output.
marco_csvs

Unnamed: 0,job_title,company_name,location,job_link,job_id,job_description
0,Head of Product (Crypto Exchange),Revolut,Ireland (Remote),https://www.linkedin.com/jobs/view/3736226364/...,3736226364,About the job\n \n \nAbout RevolutP...
1,Head of Product (w/m/d),ASI Reisen,"Tyrol, Austria (Remote)",https://www.linkedin.com/jobs/view/3759081878/...,3759081878,About the job\n \n \nASI Reisen ist...
2,Head of Product - Voluum,Team Internet,"Cracow, Małopolskie, Poland (Remote)",https://www.linkedin.com/jobs/view/3728862329/...,3728862329,About the job\n \n \nDepartment: Pr...
3,Senior Product Owner,ARRISE powering Pragmatic Play,European Union (Remote),https://www.linkedin.com/jobs/view/3721306074/...,3721306074,About the job\n \n \nDescriptionAbo...
4,CPO-B2C- Remoto 100%,Michael Page,"Madrid, Community of Madrid, Spain (Remote)",https://www.linkedin.com/jobs/view/3748364702/...,3748364702,About the job\n \n \nProyecto ambic...
5,Head of Product (f/d/m),Kooku Recruiting GmbH - Interim Recruiting & R...,Germany (Remote),https://www.linkedin.com/jobs/view/3774497203/...,3774497203,About the job\n \n \nAbout our Clie...
6,Head of Slots Product,Huuuge Games,Poland (Remote),https://www.linkedin.com/jobs/view/3713712362/...,3713712362,About the job\n \n \nIntroductionSt...
7,Senior Product Owner (all genders),PŸUR | Tele Columbus,Germany (Remote),https://www.linkedin.com/jobs/view/3744872668/...,3744872668,About the job\n \n \nÜber Tele Colu...
8,Domain Owner (Head of Product) - Web Experience,METRO.digital,Romania (Remote),https://www.linkedin.com/jobs/view/3755475139/...,3755475139,About the job\n \n \nAbout us: Pass...
9,Product Owner,Luxoft,Romania (Remote),https://www.linkedin.com/jobs/view/3777991849/...,3777991849,About the job\n \n \nProject Descri...


In [11]:
# Run the extraction program on the first description of the current
# `descriptions` list and show the parsed skills.
marco_test_output = job_program(
    description=descriptions[0],
)
print(marco_test_output.skills)

[Skill(name='Crypto Exchange Product Management', reference_text='Completely owning and building our Crypto Exchange product, increasing the number of tokens tradable in the application and providing the tools retail users expect from exchanges'), Skill(name='Team Leadership', reference_text="Setting your team's goals, success metrics, and roadmap to align with Revolut’s mission and drive maximum impact based on data analysis, market research, and company strategy"), Skill(name='Crypto Product Expansion', reference_text='Working with our Core Crypto team to expand and improve our suite of crypto products, including deposits, withdrawals, and staking, making them more accessible for our retail users'), Skill(name='User Experience Design', reference_text='Working closely with Design and UX Research to define the customer journey and create an amazing user experience'), Skill(name='Engineering Collaboration', reference_text='Liaising with Engineering to ensure effective delivery of the pr

In [12]:
# Check that every extracted skill is grounded in the posting it came from.
#
# This description is scraped text containing invisible zero-width characters
# (e.g. U+FEFF, U+200C) and long runs of whitespace that a model will not
# reproduce byte-for-byte. Comparing the raw strings would report a faithful
# quote as missing, so both sides are normalised first: zero-width characters
# are dropped and every whitespace run is collapsed to one space.
# Any reference that matched the raw text still matches after normalisation.
invisible_chars = dict.fromkeys((0x200B, 0x200C, 0x200D, 0x2060, 0xFEFF))
normalized_description = " ".join(descriptions[0].translate(invisible_chars).split())

for skill in marco_test_output.skills:
    normalized_reference = " ".join(skill.reference_text.translate(invisible_chars).split())
    if normalized_reference in normalized_description:
        print(skill)
    else:
        print(skill.name, "is not in the description")

name='Crypto Exchange Product Management' reference_text='Completely owning and building our Crypto Exchange product, increasing the number of tokens tradable in the application and providing the tools retail users expect from exchanges'
name='Team Leadership' reference_text="Setting your team's goals, success metrics, and roadmap to align with Revolut’s mission and drive maximum impact based on data analysis, market research, and company strategy"
name='Crypto Product Expansion' reference_text='Working with our Core Crypto team to expand and improve our suite of crypto products, including deposits, withdrawals, and staking, making them more accessible for our retail users'
name='User Experience Design' reference_text='Working closely with Design and UX Research to define the customer journey and create an amazing user experience'
name='Engineering Collaboration' reference_text='Liaising with Engineering to ensure effective delivery of the product'
name='Stakeholder Collaboration' refe

In [13]:
# Run the extraction program over every description, collecting successes and
# failures in separate lists so one bad response does not stop the batch.
skills_in_descriptions = []
failed = []
for index, description in enumerate(descriptions):
    try:
        # Only the program call is guarded; the bookkeeping below cannot hide
        # behind the broad handler.
        output = job_program(description=description)
    except Exception as e:
        # Deliberately broad: whatever goes wrong for one description is
        # logged and recorded, then the loop moves on to the next one.
        print(f"Error on {index} - {e}")
        failed.append(
            {
                "index": index,
                "description": description,
                # Keep the reason with the record so a failure can still be
                # diagnosed or retried after the cell output is cleared.
                "error": repr(e),
            }
        )
    else:
        skills_in_descriptions.append(
            {
                "index": index,
                "description": description,
                # Stored as returned by the program (not converted to a plain
                # dict here).
                "skills": output,
            }
        )
        print(f"Completed {index}")

In [18]:
# Prepare the generated results for storage. The extraction loop kept the
# parsed Job model objects, so each one is dumped to a plain dict here
# (wrapped in a one-element list) alongside its index and description.
new_skills_dicts = [
    {
        "index": entry["index"],
        "description": entry["description"],
        "skills": [entry["skills"].model_dump()],
    }
    for entry in skills_in_descriptions
]
    

In [34]:
# Build a DataFrame from the result records; `new_skills_dicts` is expected to
# be a list of dictionaries holding the data.
df = pd.DataFrame(data=new_skills_dicts)

In [31]:
# Write the results to disk as a JSON array with one object per row.
df.to_json(
    path_or_buf="skills_json_from_df.json",
    orient="records",
)

In [32]:
# Read the saved JSON file back into a DataFrame so the stored output can be
# checked. This rebinds `df` to the reloaded data.
df = pd.read_json(
    path_or_buf="skills_json_from_df.json",
)


In [51]:
# Pull the two columns used for the review printout out as plain lists.
skills = df["skills"].tolist()
descriptions_from_df = df["description"].tolist()

In [52]:
# Review printout: for every stored job, print its description and then each
# extracted skill record, with a blank line between jobs.
for index, (skill_groups, description_text) in enumerate(zip(skills, descriptions_from_df)):
    print(f"Job Index: {index}")
    print(description_text)
    # Each row's "skills" cell is a list of dicts, and each of those dicts
    # holds its own "skills" list, hence the two nested loops.
    for job_record in skill_groups:
        for specific_skill in job_record['skills']:
            print(specific_skill)
    print()

Job Index: 0
About the job
            
 
About RevolutPeople deserve more from their money. More visibility, more control, more freedom. And since 2015, Revolut has been on a mission to deliver just that. With an arsenal of aweso﻿me products that span spending, saving, travel, transfers, investing, exchanging and more, our super app has helped 35+ million customers get more from their money. And we're not done yet.  As we continue our lightning-fast growth,‌ two things are essential to continuing our success: our people and our culture. We've been officially certified as a Great Place to Work™ in recognition of our outstanding employee experience! So far, we have 7,500+﻿ people working around the world, from our great offices or remotely, on our mission. And we're looking for more. We want brilliant people that love building great products, love redefining success, and love turning the complexity of a chaotic world into the simplicity of a beautiful solution.About The RoleOn the Revol