In [None]:
%pip install boto3 pandas s3fs

In [91]:
import io
import os
import boto3
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

# Bucket name is read from the environment (populated from a .env file by load_dotenv()).
AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
if not AWS_S3_BUCKET:
    # os.getenv returns None for a missing variable; stop here with a clear
    # message instead of passing None as the Bucket argument to S3 calls.
    raise RuntimeError("AWS_S3_BUCKET is not set; define it in the environment or in a .env file")
s3_client = boto3.client("s3")
print(AWS_S3_BUCKET)

raw-apify-datasets-lake


In [92]:
# Collect the key of every object under the raw Indeed jobs prefix.
# A paginator is used because one list call returns at most 1000 keys, and
# page.get("Contents", []) tolerates a prefix that matches nothing (such a
# response carries no "Contents" entry, which previously raised KeyError).
paginator = s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=AWS_S3_BUCKET, Prefix="bronze/raw_indeed_jobs/dataset")
datasets = []
for page in pages:
    for item in page.get("Contents", []):
        datasets.append(item["Key"])
        print(item["Key"])
print(datasets)

bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-18_13-39-59-697.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-19_14-08-42-423.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-20_12-42-20-486.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-21_13-12-16-991.csv
['bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-18_13-39-59-697.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-19_14-08-42-423.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-20_12-42-20-486.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-21_13-12-16-991.csv']


In [7]:
# Get the data from bronze bucket and store it in array to be worked on
# Each key in `datasets` is fetched and parsed as CSV; one DataFrame per object
# is appended to `dataframes`.
dataframes = []
for data in datasets:
    job_object = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=data)
    status = job_object.get("ResponseMetadata", {}).get("HTTPStatusCode")
    if status != 200:
        # Skip this object but keep going with the remaining keys.
        print("Unable to get object, something wrong")
        continue
    print("Successfully got desired object")
    # read_csv consumes the response body stream directly.
    jobs_df = pd.read_csv(job_object.get("Body"))
    dataframes.append(jobs_df)



Successfully got desired object
Successfully got desired object
Successfully got desired object
Successfully got desired object


In [9]:
# Keep only the rows whose 'salary' value is present, producing one filtered
# frame per input frame (same order as `dataframes`).
salary_filtered_frames = [
    daily_frame[daily_frame['salary'].notnull()]
    for daily_frame in dataframes
]
len(salary_filtered_frames)

4

In [None]:
# Combine the dataframes
# Final frame should represent correct number of uniques
# The frames are folded together with an outer merge (pandas joins on the
# columns the two sides share when no key is given), then reduced to one row
# per 'id'.
merged_frame = None
for frame in salary_filtered_frames:
    print(len(frame))
    if merged_frame is None:
        merged_frame = frame
    else:
        merged_frame = merged_frame.merge(frame, how="outer")
if merged_frame is None:
    # Nothing was loaded/filtered upstream; there is no frame to de-duplicate.
    raise ValueError("salary_filtered_frames is empty; nothing to merge")
final_frame = merged_frame.drop_duplicates(subset=['id'])
# Display only: reset_index() returns a new frame and is not assigned, so
# final_frame keeps its original index labels.
final_frame.reset_index()

In [67]:
# Quick sanity check: number of rows, then the column labels.
print(final_frame.shape[0])
print(final_frame.columns)


178
Index(['company', 'description', 'descriptionHTML', 'externalApplyLink', 'id',
       'jobType', 'jobType/0', 'jobType/1', 'jobType/2', 'location',
       'positionName', 'postedAt', 'postingDateParsed', 'rating',
       'reviewsCount', 'salary', 'scrapedAt', 'searchInput/country',
       'searchInput/location', 'searchInput/position', 'url'],
      dtype='object')


In [63]:
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
# Bedrock model identifiers used in this notebook.
claude_v1_model_id = 'anthropic.claude-instant-v1'
claude_v2_model_id = 'anthropic.claude-v2'

# Start with the first model id and an 8000-token sampling limit.
bedrock_model_kwargs = {'max_tokens_to_sample': 8000}
llm = Bedrock(
    model_id=claude_v1_model_id,
    model_kwargs=bedrock_model_kwargs,
)
#output = llm.predict("How are you today?")


In [64]:
# Prompt with a single `job` variable: the job description to extract from.
job_prompt_text = """
Extract specific skills and responsibilities from the following job description: {job}
"""
template = PromptTemplate.from_template(job_prompt_text)


In [30]:
# Read the whole summaries file; the context manager closes the handle even if
# the read fails (it was previously left open).
with open("job_summaries_massive.txt", "r", encoding="utf-8") as huge_file_read:
    huge_file_text = huge_file_read.read()
# Rebinds `template` to a formatting prompt that takes a single `text` variable.
template = PromptTemplate.from_template("""
Act as an expert formatter. You format based on the given format. Skip the preamble.
I will provide you with job skills and responsibilities text for Data Engineering, output the top 5 skills AND technologies in the following format: <topskills>1. Skill 2. Skill ...</topskills> <toptech>1. Tech 2. Tech .... </toptech>.
<dataengineeringtext>{text}</dataengineeringtext>
""")
prompt = template.format(text=huge_file_text)
output = llm.predict(prompt)
print(output)


 Here are the top 5 skills and top technologies extracted from the job description:

<topskills>
1. Databricks/PySpark
2. AWS
3. Cloud experience  
4. Software engineering 
5. Typescript
</topskills>

<toptech>
1. AWS
2. Cloud 
3. Databricks
4. PySpark
5. Typescript
</toptech>


In [66]:
# Refactored example, with uniqued dataset filtered on JobID
# For every row of final_frame, ask the model to extract skills and
# responsibilities from the description and save the answer to its own text
# file, named after the row's index label.
path = 'skills-res-v4-unique-day-3/'
file_name = 'claude_v1_df_final_row_'
# Built here rather than reusing the notebook-wide `template`: other cells
# rebind that name to prompts that take `text`, which makes
# template.format(job=...) fail when the notebook is run top to bottom.
job_template = PromptTemplate.from_template("""
Extract specific skills and responsibilities from the following job description: {job}
""")
# open(..., "w") does not create missing directories.
os.makedirs(path, exist_ok=True)
for index, row in final_frame.iterrows():
    description = row['description']
    prompt = job_template.format(job=description)
    # Call the model before opening the file so a failed request does not
    # leave an empty, truncated file behind.
    output = llm.predict(prompt)
    with open(f"{path}{file_name}{index}.txt", "w", encoding="utf-8") as file:
        file.write(output)

In [68]:
# Append the link to the job posting
# Each row's 'url' is added, after a blank line, to the text file that shares
# the row's index label.
path = 'skills-res-v4-unique-day-3/'
file_name = 'claude_v1_df_final_row_'
for index, row in final_frame.iterrows():
    url = row['url']
    link_suffix = f"\n\n{url}"
    # "a+" keeps append semantics (writes always land at the end of the file)
    # while still allowing a read, so re-running this cell does not append the
    # same link a second time. The context manager closes the handle on error.
    with open(f"{path}{file_name}{index}.txt", "a+", encoding="utf-8") as file:
        file.seek(0)
        if not file.read().endswith(link_suffix):
            file.write(link_suffix)

In [75]:
# Combine all of the documents
# Every file in `path` is read and concatenated, each preceded by a blank line.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would otherwise make the combined text differ between runs.
docs = sorted(os.listdir(path))
doc_texts = []
for doc in docs:
    with open(os.path.join(path, doc), "r", encoding="utf-8") as doc_file:
        doc_texts.append(doc_file.read())
# Single join instead of repeated string concatenation inside the loop.
combined_day_3 = "".join(f"\n\n{doc_text}" for doc_text in doc_texts)
    

In [84]:
# Attempt to gain some insight from AI
# Prompt text for ranking skills/technologies across the combined documents;
# it takes a single `text` variable.
aggregate_prompt_text = """
Act as an expert formatter. You format based on the given format. Skip the preamble.
I will provide you with a combined list of job skills and responsibilities text for Data Engineering taken from multiple job postings, output the top 5 skills AND technologies that appear most often across the entire text, do not include skills such as Data Engineering or ETL processes, be specific in the following format: <topskills>1. Skill 2. Skill ...</topskills> <toptech>1. Tech 2. Tech .... </toptech>.
<dataengineeringtext>{text}</dataengineeringtext>
"""
template = PromptTemplate.from_template(aggregate_prompt_text)

# NOTE(review): this changes the model id on the shared `llm` object in place,
# so any cell run after this one uses claude_v2_model_id as well.
llm.model_id = claude_v2_model_id

prompt = template.format(text=combined_day_3)
output = llm.predict(prompt)

In [85]:
# Show the most recent model response held in `output`.
print(output)

 Here are the top 5 skills and technologies extracted from the data engineering job description text:

<topskills>
1. SQL
2. Python
3. ETL processes
4. Data modeling 
5. Data warehousing
</topskills>

<toptech>  
1. AWS services (S3, Redshift, Glue, etc)
2. Databricks
3. Snowflake
4. Azure services (Azure Data Factory, Azure Databricks, etc)
5. Apache Spark
</toptech>


In [86]:
# Persist the de-duplicated frame. DataFrame.to_csv returns None when it is
# given a path, so the result is not bound to a variable (the old
# `output_final_frame` name was always None). The index is written as the
# first column, as before.
final_frame.to_csv("combined_10_21.csv")