In [None]:
import os

import pandas as pd

# Load the scraped Indeed job postings into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
csv_path = "/home/bphil/youtube/data_lake_start/dataset_indeed-scraper_2023-10-23_07-07-15.csv"
df = pd.read_csv(csv_path)

# Column / dtype / non-null summary, followed by the salary column as cell output.
df.info()
df["salary"]

In [None]:
# Show only the postings whose salary field is populated.
df.loc[df["salary"].notna()]


In [None]:
# Display a copy of the frame with the current index moved into a column.
# The result is not assigned, so `df` itself is left unchanged.
df.reset_index()

In [None]:
# Bring in an LLM
from langchain.llms import Bedrock
from llama_index.llms import LangChainLLM
from llama_index.program import LLMTextCompletionProgram, DataFrame, OpenAIPydanticProgram
from llama_index.output_parsers import PydanticOutputParser
from pydantic import BaseModel

# LangChain Bedrock client for the "anthropic.claude-instant-v1" model,
# allowed to sample up to 8000 tokens per completion.
bedrock_model_kwargs = {"max_tokens_to_sample": 8000}
bedrock = Bedrock(
    model_id="anthropic.claude-instant-v1",
    model_kwargs=bedrock_model_kwargs,
)

# Wrap the LangChain client so llama_index programs can use it as their LLM.
llm = LangChainLLM(llm=bedrock)


In [None]:
from typing import List


# One skill extracted from a job description; elements of `Job.skills`.
# NOTE(review): pydantic exposes the class docstring as the schema description,
# which the output parser presumably forwards to the LLM — treat the docstring
# as prompt text and do not reword it casually.
class Skill(BaseModel):
    """Correctly extracted skill from the job description"""
    name: str  # written before the colon in the per-job output file
    description: str  # written after the colon in the per-job output file

# One technology extracted from a job description; elements of `Job.techs`.
# NOTE(review): pydantic exposes the class docstring as the schema description,
# which the output parser presumably forwards to the LLM — treat the docstring
# as prompt text and do not reword it casually.
class Tech(BaseModel):
    """Correctly extracted tech from the job description"""
    name: str  # written before the colon in the per-job output file
    description: str  # written after the colon in the per-job output file

# Structured result for a single job description. Passed to
# PydanticOutputParser as the output class of `program`.
# Deliberately documented with `#` comments rather than a docstring, since a
# docstring would presumably become part of the schema sent to the LLM.
class Job(BaseModel):
    skills: List[Skill]  # skills found in the description
    techs: List[Tech]  # technologies found in the description

# Text-completion program that maps one job description onto a `Job`.
# `{input_str}` is the placeholder filled by the `input_str=` keyword when the
# program is called.
program = LLMTextCompletionProgram.from_defaults(
    prompt_template_str="Create the job for the following job description: {input_str}",
    output_parser=PydanticOutputParser(Job),
    llm=llm,
    verbose=True,
)


# Run the extraction program over every posting and persist each result as a
# small text file (one per posting) so later cells can aggregate them.
output_dir = "pydantic_data_output_3"
# open() does not create missing folders; make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)

description_guesses = []
for index, row in df.iterrows():
    print(index)
    description = row['description']
    job_id = row['id']  # named job_id so the builtin `id` is not shadowed
    # Do some pydantic work
    try:
        response = program(input_str=description)
        out_path = os.path.join(output_dir, f"{job_id}_{index}.txt")
        # `with` guarantees the handle is closed even if a write raises.
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write("Skills:\n\n")
            for skill in response.skills:
                out_file.write(f"{skill.name}: {skill.description}\n")
            out_file.write("Technologies:\n\n")
            for tech in response.techs:
                out_file.write(f"{tech.name}: {tech.description}\n")
        description_guesses.append(response)
    except ValueError as e:
        # Best effort: report the failure for this posting and move on to the
        # next one instead of aborting the whole run.
        print(e)
    # Break for test
    

In [None]:
import os
# Concatenate every per-job output file into one string for the summary prompt.
print(len(description_guesses))
path = "pydantic_data_output_3/"
dirs = os.listdir(path)
print(dirs)

chunks = []
# os.listdir returns entries in arbitrary order; sort so the combined text is
# the same on every run.
for file in sorted(dirs):
    full_path = os.path.join(path, file)
    # Skip sub-directories (e.g. the top_five/ folder that is written under
    # this same path later in the notebook); open() on a directory raises.
    if os.path.isdir(full_path):
        continue
    with open(full_path, 'r', encoding="utf-8") as f:
        chunks.append(f.read())

# Join once at the end instead of growing the string inside the loop.
big_text = "".join(chunks)

In [None]:
# Dump the combined per-job text so it can be inspected in the notebook output.
print(big_text)

In [None]:
from pydantic import conlist

# One ranked technology in a `Top5` result (first version of this model).
class TopTech(BaseModel):
    name: str  # technology name; written to top5.txt
    references: conlist(str, min_items=1)  # list of strings, at least one required

# One ranked skill in a `Top5` result (first version of this model).
class TopSkill(BaseModel):
    name: str  # skill name; written to top5.txt
    references: conlist(str, min_items=1)  # list of strings, at least one required

# Aggregated ranking used as the output class of `program_top_five`.
# Both lists are constrained to a minimum of five items; no maximum is set.
class Top5(BaseModel):
    skills: conlist(TopSkill, min_items=5)
    techs: conlist(TopTech, min_items=5)

In [None]:
# Program that condenses the concatenated per-job text into a `Top5` ranking.
program_top_five = LLMTextCompletionProgram.from_defaults(
    prompt_template_str="Create the Top 5 skills and top 5 technologies with the references that make them top 5 ranked by how many times they appear, from the following job descriptions: {input_str}",
    output_parser=PydanticOutputParser(Top5),
    llm=llm,
    verbose=True,
)

# First ranking run over everything collected in `big_text`.
response_2 = program_top_five(input_str=big_text)

In [None]:

# Persist the names from the first top-5 response and echo each full entry
# to the notebook output.
top_five_dir = os.path.join(path, "top_five")
# open() does not create missing folders; make sure the target exists first.
os.makedirs(top_five_dir, exist_ok=True)

# `with` guarantees the handle is closed even if a write raises.
with open(os.path.join(top_five_dir, "top5.txt"), "w", encoding="utf-8") as file:
    file.write("Skills:\n")
    for skill in response_2.skills:
        file.write(f"{skill.name}\n")
        print(skill)

    file.write("\n\nTechnologies:\n")
    for tech in response_2.techs:
        file.write(f"{tech.name}\n")
        print(tech)

In [None]:
# Second, independent call of the ranking program on the same input.
response_3 = program_top_five(input_str=big_text)

# Print every skill entry, then every technology entry, one per line.
for entry in [*response_3.skills, *response_3.techs]:
    print(entry)


In [None]:
# Redefine structure
# This rebinds the name `TopTech`, shadowing the earlier model that carried a
# `references` list; from here on an entry has a name and a classification.
class TopTech(BaseModel):
    name: str  # technology name
    classification: str  # free-form string; its meaning is not defined in this notebook

# Rebinds `TopSkill`, shadowing the earlier model that carried a `references`
# list; an entry now has a name and a classification.
class TopSkill(BaseModel):
    name: str  # skill name
    classification: str  # free-form string; its meaning is not defined in this notebook

# Rebinds `Top5` so that it is built from whatever `TopSkill` / `TopTech` are
# bound to when this cell runs. Used as the output class of `program_top_five_2`.
# Both lists are constrained to a minimum of five items; no maximum is set.
class Top5(BaseModel):
    skills: conlist(TopSkill, min_items=5)
    techs: conlist(TopTech, min_items=5)

In [None]:
# Ranking program bound to whatever `Top5` refers to when this cell runs.
# NOTE(review): the prompt text still asks for "references", while the
# TopSkill/TopTech models defined just before this cell expose a
# `classification` field instead — confirm the prompt matches the schema.
program_top_five_2 = LLMTextCompletionProgram.from_defaults(
    prompt_template_str="Create the Top 5 skills and top 5 technologies with the references that make them top 5 ranked by how many times they appear, from the following job descriptions: {input_str}",
    output_parser=PydanticOutputParser(Top5),
    llm=llm,
    verbose=True,
)

In [None]:
# Rebuild `big_text` from a different results folder.
# Note: this rebinds both `big_text` and `path`, which later cells read.
big_text = ""
path = "skills-res-v6-dataengineer-day-5/"
dirs = os.listdir(path)
print(dirs)
for entry in dirs:
    candidate = path + "/" + entry
    # Only regular entries are read; sub-directories are skipped.
    if not os.path.isdir(candidate):
        with open(candidate, 'r', encoding="utf-8") as handle:
            big_text += handle.read()

In [None]:
# Run the second ranking program on the current `big_text`.
# NOTE(review): the following cell repeats this exact call and overwrites
# `response_4`, so this cell looks redundant — confirm before removing.
response_4 = program_top_five_2(input_str=big_text)

In [None]:
# Call the second ranking program and show what came back.
response_4 = program_top_five_2(input_str=big_text)

# Print every skill entry, then every technology entry, one per line.
for entry in [*response_4.skills, *response_4.techs]:
    print(entry)


In [None]:
# Call the ranking program five times and store the names from each run in
# its own file (top5_4_0.txt … top5_4_4.txt), echoing every entry as it goes.
top_five_dir = os.path.join(path, "top_five")
# open() does not create missing folders; make sure the target exists first.
os.makedirs(top_five_dir, exist_ok=True)

for i in range(5):
    response_5 = program_top_five_2(input_str=big_text)
    out_path = os.path.join(top_five_dir, f"top5_4_{i}.txt")
    # `with` guarantees the handle is closed even if a write raises.
    with open(out_path, "w", encoding="utf-8") as file:
        file.write("Skills:\n")
        for skill in response_5.skills:
            file.write(f"{skill.name}\n")
            print(skill)

        file.write("\n\nTechnologies:\n")
        for tech in response_5.techs:
            file.write(f"{tech.name}\n")
            print(tech)

In [None]:
# Call the ranking program five times and store "name: classification" lines
# from each run in its own file (top5_5_0.txt … top5_5_4.txt), echoing every
# entry as it goes.
top_five_dir = os.path.join(path, "top_five")
# open() does not create missing folders; make sure the target exists first.
os.makedirs(top_five_dir, exist_ok=True)

for i in range(5):
    response_5 = program_top_five_2(input_str=big_text)
    out_path = os.path.join(top_five_dir, f"top5_5_{i}.txt")
    # `with` guarantees the handle is closed even if a write raises.
    with open(out_path, "w", encoding="utf-8") as file:
        file.write("Skills:\n")
        for skill in response_5.skills:
            file.write(f"{skill.name}: {skill.classification}\n")
            print(skill)

        file.write("\n\nTechnologies:\n")
        for tech in response_5.techs:
            file.write(f"{tech.name}: {tech.classification}\n")
            print(tech)

In [None]:
# Print each column name next to its dtype. The loop previously ignored its
# loop variable and printed the whole object once per column.
# NOTE(review): `a` is not defined in any visible cell — presumably a
# DataFrame left over from an earlier session (possibly `df`); confirm and
# rename so the notebook survives Restart & Run All.
for column, dtype in a.dtypes.items():
    print(column, dtype)

In [None]:


# Flat, all-string summary of one job posting. Passed to PydanticOutputParser
# as the output class of `program_2`.
# Documented with `#` comments rather than a docstring, since a docstring
# would presumably become part of the schema sent to the LLM.
class JobClass(BaseModel):
    key_responsibilities: str
    education: str
    key_requirements: str
    job_type: str
    experience: str



In [None]:
# Text-completion program whose output is parsed into a `JobClass`.
# NOTE(review): the prompt talks about "column names" and "rows", which does
# not obviously match the JobClass fields — confirm the wording is intended.
program_2 = LLMTextCompletionProgram.from_defaults(
    prompt_template_str=(
        "Please extract the following query into a structured data according to: {input_str}."
        "Please extract both the set of column names and a set of rows."
    ),
    output_parser=PydanticOutputParser(JobClass),
    llm=llm,
    verbose=True,
)


# Run the JobClass extraction over every job description and collect the
# parsed results. Note: this rebinds `description_guesses`, replacing the list
# built by the earlier extraction loop.
description_guesses = []
for index, description in enumerate(df['description']):
    print(index)
    try:
        # Use program_2 (the JobClass extractor built just above); the loop
        # previously called `program`, so program_2 was never exercised.
        response = program_2(input_str=description)
    except ValueError as e:
        # Same best-effort policy as the first extraction loop: report the
        # failure for this posting and keep going.
        print(e)
        continue
    description_guesses.append(response)