In [None]:
%pip install boto3 pandas s3fs

In [22]:
import io
import os
import boto3
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
APIFY_TOKEN = os.getenv('APIFY_TOKEN')
s3_client = boto3.client("s3")
print(AWS_S3_BUCKET)

raw-apify-datasets-lake


In [92]:
response = s3_client.list_objects(Bucket=AWS_S3_BUCKET, Prefix="bronze/raw_indeed_jobs/dataset")
contents = response["Contents"]
datasets = []
for item in contents:
    datasets.append(item["Key"])
    print(item["Key"])
print(datasets)

bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-18_13-39-59-697.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-19_14-08-42-423.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-20_12-42-20-486.csv
bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-21_13-12-16-991.csv
['bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-18_13-39-59-697.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-19_14-08-42-423.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-20_12-42-20-486.csv', 'bronze/raw_indeed_jobs/dataset_indeed-scraper_2023-10-21_13-12-16-991.csv']


In [7]:
# Get the data from bronze bucket and store it in array to be worked on
dataframes = []
for data in datasets:
    job_object = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=data)
    status = job_object.get("ResponseMetadata", {}).get("HTTPStatusCode")
    jobs_df = pd.core.frame.DataFrame
    if status == 200:
        print("Successfully got desired object")
        jobs_df = pd.read_csv(job_object.get("Body"))
        dataframes.append(jobs_df)
    else:
        print("Unable to get object, something wrong")



Successfully got desired object
Successfully got desired object
Successfully got desired object
Successfully got desired object


In [9]:
salary_filtered_frames = []
for frame in dataframes:
    filtered_salary = frame['salary'].notnull()
    filtered_curr = frame[filtered_salary]
    salary_filtered_frames.append(filtered_curr)
len(salary_filtered_frames)

4

In [None]:
# Combine the dataframes
# Final frame should represent correct number of uniques
merged_frame = pd.DataFrame
for index, frame in enumerate(salary_filtered_frames):
    print(len(frame))
    if index == 0:
        merged_frame = frame
    else:   
        new_frame = merged_frame.merge(frame, how="outer")
        merged_frame = new_frame
merged_frame.reset_index()
final_frame = merged_frame.drop_duplicates(subset=['id'])
final_frame.reset_index()

In [67]:
print(len(final_frame))
print(final_frame.keys())


178
Index(['company', 'description', 'descriptionHTML', 'externalApplyLink', 'id',
       'jobType', 'jobType/0', 'jobType/1', 'jobType/2', 'location',
       'positionName', 'postedAt', 'postingDateParsed', 'rating',
       'reviewsCount', 'salary', 'scrapedAt', 'searchInput/country',
       'searchInput/location', 'searchInput/position', 'url'],
      dtype='object')


In [63]:
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
claude_v1_model_id = 'anthropic.claude-instant-v1'
claude_v2_model_id = 'anthropic.claude-v2'
llm = Bedrock(model_id=claude_v1_model_id, model_kwargs={'max_tokens_to_sample':8000})
#output = llm.predict("How are you today?")


In [64]:
template = PromptTemplate.from_template("""
Extract specific skills and responsibilities from the following job description: {job}
""")


In [30]:
huge_file_read = open("job_summaries_massive.txt", "r", encoding="utf-8")
huge_file_text = huge_file_read.read()
template = PromptTemplate.from_template("""
Act as an expert formatter. You format based on the given format. Skip the preamble.
I will provide you with job skills and responsibilities text for Data Engineering, output the top 5 skills AND technologies in the following format: <topskills>1. Skill 2. Skill ...</topskills> <toptech>1. Tech 2. Tech .... </toptech>.
<dataengineeringtext>{text}</dataengineeringtext>
""")
prompt = template.format(text=huge_file_text)
output = llm.predict(prompt)
print(output)


 Here are the top 5 skills and top technologies extracted from the job description:

<topskills>
1. Databricks/PySpark
2. AWS
3. Cloud experience  
4. Software engineering 
5. Typescript
</topskills>

<toptech>
1. AWS
2. Cloud 
3. Databricks
4. PySpark
5. Typescript
</toptech>


In [66]:
# Refactored example, with uniqued dataset filtered on JobID
path = 'skills-res-v4-unique-day-3/'
file_name = 'claude_v1_df_final_row_'
for index, row in final_frame.iterrows():
    description = row['description']
    file = open(f"{path}{file_name}{index}.txt", "w", encoding="utf-8")
    prompt = template.format(job=description)
    output = llm.predict(prompt)
    file.write(output)
    file.close()

In [68]:
# Append the link to the job posting 
path = 'skills-res-v4-unique-day-3/'
file_name = 'claude_v1_df_final_row_'
for index, row in final_frame.iterrows():
    url = row['url']
    file = open(f"{path}{file_name}{index}.txt", "a", encoding="utf-8")
    #prompt = template.format(job=description)
    #output = llm.predict(prompt)
    file.write(f"\n\n{url}")
    file.close()

In [75]:
# Combine all of the documents
docs = os.listdir(path)
combined_day_3 = ""
for doc in docs:
    doc_file = open(path+doc, "r", encoding="utf-8")
    doc_text = doc_file.read()
    combined_day_3 += f"\n\n{doc_text}"
    doc_file.close()
    

In [84]:
# Attempt to gain some insight from AI
template = PromptTemplate.from_template("""
Act as an expert formatter. You format based on the given format. Skip the preamble.
I will provide you with a combined list of job skills and responsibilities text for Data Engineering taken from multiple job postings, output the top 5 skills AND technologies that appear most often across the entire text, do not include skills such as Data Engineering or ETL processes, be specific in the following format: <topskills>1. Skill 2. Skill ...</topskills> <toptech>1. Tech 2. Tech .... </toptech>.
<dataengineeringtext>{text}</dataengineeringtext>
""")
prompt = template.format(text=combined_day_3)
llm.model_id = claude_v2_model_id
output = llm.predict(prompt)

In [85]:
print(output)

 Here are the top 5 skills and technologies extracted from the data engineering job description text:

<topskills>
1. SQL
2. Python
3. ETL processes
4. Data modeling 
5. Data warehousing
</topskills>

<toptech>  
1. AWS services (S3, Redshift, Glue, etc)
2. Databricks
3. Snowflake
4. Azure services (Azure Data Factory, Azure Databricks, etc)
5. Apache Spark
</toptech>


In [86]:
output_final_frame = final_frame.to_csv("combined_10_21.csv")

In [1]:
# Bring in Apify and start automated process
!pip install Apify

Defaulting to user installation because normal site-packages is not writeable
Collecting Apify
  Downloading apify-1.2.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sortedcollections>=2.0.1
  Downloading sortedcollections-2.1.0-py3-none-any.whl (9.5 kB)
Collecting apify-client~=1.5.0
  Downloading apify_client-1.5.0-py3-none-any.whl (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.2/68.2 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorama>=0.4.6
  Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting pyee>=11.0.1
  Downloading pyee-11.0.1-py3-none-any.whl (15 kB)
Collecting apify-shared~=1.0.4
  Downloading apify_shared-1.0.4-py3-none-any.whl (12 kB)
Collecting aioshutil>=1.0
  Downloading aioshutil-1.3-py3-none-any.whl (4.6 kB)
Installing collected packages: sortedcollections, pyee, colorama, apify

In [44]:
#print(data)
def get_indeed_datasets():
    datasets = []
    for item in data['data']['items']:
        if item['schema']:
            schema = json.loads(item['schema'])
            title = schema['title']
            if title == "Indeed Scraper":
                datasets.append(item)
    return datasets

from apify_client import ApifyClient

client = ApifyClient(APIFY_TOKEN)

run_input = {
    "position": "data engineer",
    "country": "US",
    "location": "remote",
    "maxItems": 50,
    "parseCompanyDetails": False,
    "saveOnlyUniqueItems": True,
    "followApplyRedirects": True,
    "maxConcurrency": 5,
}

actor_call = client.actor('misceres/indeed-scraper').call(run_input=run_input)


def get_datasets(client):
    data_collection_client = client.datasets()
    listed_sets = data_collection_client.list(unnamed=True, desc=True)
    print(listed_sets.items)

#get_datasets(client)   
    

In [45]:
dataset_items = client.dataset(actor_call['defaultDatasetId']).download_items(item_format="csv")

In [65]:
from datetime import datetime, timezone

# Convert the timezone to local
tz = actor_call['finishedAt'].replace(tzinfo=timezone.utc).astimezone(tz=None)
formatted_date = tz.strftime("%Y-%m-%d_%H-%M-%S")
# Create filename based on when the scraper finished
file_name = "dataset_indeed-scraper_"+formatted_date+".csv"
print(file_name)

dataset_indeed-scraper_2023-10-23_07-07-15.csv


In [66]:
print(dataset_items.decode('utf-8'))

"company","description","descriptionHTML","externalApplyLink","id","jobType","jobType/0","jobType/1","location","positionName","postedAt","postingDateParsed","rating","reviewsCount","salary","scrapedAt","searchInput/country","searchInput/location","searchInput/position","url"
"Microagility","We are seeking an experienced ETL (Extract, Transform, and Load) Data Engineer with expertise in Google Cloud Platform (GCP) to join our client data engineering team.
Key Responsibilities:
· Design, develop, and maintain ETL pipelines on Google Cloud Platform (GCP) to ensure efficient data extraction, transformation, and loading processes.
· Extract data from various sources, including databases, APIs, and cloud storage, and ensure data quality and consistency.
· Collaborate with data scientists, data analysts, and other stakeholders to understand their data requirements and ensure data pipelines meet their needs.
· Implement data transformations, including cleaning, aggregation, and enrichment, to

In [59]:
for item in dataset_items:
    print(item)

239
187
191
34
99
111
109
112
97
110
121
34
44
34
100
101
115
99
114
105
112
116
105
111
110
34
44
34
100
101
115
99
114
105
112
116
105
111
110
72
84
77
76
34
44
34
101
120
116
101
114
110
97
108
65
112
112
108
121
76
105
110
107
34
44
34
105
100
34
44
34
106
111
98
84
121
112
101
34
44
34
106
111
98
84
121
112
101
47
48
34
44
34
106
111
98
84
121
112
101
47
49
34
44
34
108
111
99
97
116
105
111
110
34
44
34
112
111
115
105
116
105
111
110
78
97
109
101
34
44
34
112
111
115
116
101
100
65
116
34
44
34
112
111
115
116
105
110
103
68
97
116
101
80
97
114
115
101
100
34
44
34
114
97
116
105
110
103
34
44
34
114
101
118
105
101
119
115
67
111
117
110
116
34
44
34
115
97
108
97
114
121
34
44
34
115
99
114
97
112
101
100
65
116
34
44
34
115
101
97
114
99
104
73
110
112
117
116
47
99
111
117
110
116
114
121
34
44
34
115
101
97
114
99
104
73
110
112
117
116
47
108
111
99
97
116
105
111
110
34
44
34
115
101
97
114
99
104
73
110
112
117
116
47
112
111
115
105
116
105
111
110
34
44
34
117
114
10

In [67]:
import csv
# Write to the csv
new_data = dataset_items.decode('utf-8')
file = open(file_name, "w", newline='')
file.write(new_data)

511160

In [68]:
new_df = pd.read_csv(file_name)
new_df

Unnamed: 0,company,description,descriptionHTML,externalApplyLink,id,jobType,jobType/0,jobType/1,location,positionName,postedAt,postingDateParsed,rating,reviewsCount,salary,scrapedAt,searchInput/country,searchInput/location,searchInput/position,url
0,Microagility,"We are seeking an experienced ETL (Extract, Tr...","<p>We are seeking an experienced ETL (Extract,...",,e981504596ce7e00,,Contract,,Remote,Data Engineer GCP ETL,2 days ago,2023-10-21T13:05:25.133Z,,,,2023-10-23T13:05:25.825Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=e981504596ce7...
1,InfoMagnus,Data Governance Engineer\nThis is a contract r...,<p><b>Data Governance Engineer</b></p>\n<p><i>...,,19d5fafc9fb74494,,Contract,,Remote,Data Governance Engineer,1 day ago,2023-10-22T13:05:28.320Z,,,$60 - $70 an hour,2023-10-23T13:05:28.324Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=19d5fafc9fb74...
2,Mayo Clinic,Why Mayo Clinic \n \n \n Mayo Clinic is top...,<div>\n <div>\n <b>Why Mayo Clinic</b> \n </d...,https://www.indeed.com/rc/clk?jk=90ca40677b46e...,90ca40677b46e6ea,,Full-time,,"3636 Technology Dr NW, Rochester, MN 55901",IT Lead Data Engineer - Remote,1 day ago,2023-10-22T13:05:24.334Z,3.9,3032.0,"$138,237 - $200,408 a year",2023-10-23T13:05:25.038Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=90ca40677b46e...
3,General Dynamics Information Technology,Clearance Level None Category Data Science Loc...,Clearance Level None Category Data Science Loc...,https://www.indeed.com/rc/clk?jk=b313953a37e91...,b313953a37e916a3,,,,"Washington, DC",TSS Data Engineer,1 day ago,2023-10-22T13:05:26.016Z,3.7,5652.0,,2023-10-23T13:05:26.017Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=b313953a37e91...
4,Vision Government Solutions Inc,About Vision\n Vision Government Solutions is...,<div>\n <p><b>About Vision</b></p>\n <p> Visio...,https://www.indeed.com/rc/clk?jk=8a288fca6ecd6...,8a288fca6ecd6019,,Full-time,,Remote,Data Engineer,2 days ago,2023-10-21T13:05:31.115Z,3.4,13.0,,2023-10-23T13:05:31.117Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=8a288fca6ecd6...
5,ASCENDING,Location: 100% Remote (Driving distance to Ash...,<div>\n <p>Location: <b>100% Remote</b> (Drivi...,https://www.indeed.com/rc/clk?jk=282a7c727e0fa...,282a7c727e0fad73,,Full-time,,"Ashburn, VA",Azure Data Engineer,2 days ago,2023-10-21T13:05:32.085Z,,,,2023-10-23T13:05:32.086Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=282a7c727e0fa...
6,KinderCare Education,"Futures start here. Where first steps, new fri...",<div>\n <p>Futures start here. Where first ste...,https://www.indeed.com/rc/clk?jk=707d3fdd870a2...,707d3fdd870a23bd,,Full-time,,"Portland, OR 97204",Data Engineer - Remote Opportunity,2 days ago,2023-10-21T13:05:33.118Z,2.7,768.0,,2023-10-23T13:05:33.120Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=707d3fdd870a2...
7,Nuna,"At Nuna, our mission is to make high-quality h...","<div>\n <div>\n <p><i>At Nuna, our mission is...",https://www.indeed.com/rc/clk?jk=7430eb6847973...,7430eb68479730fd,,,,"San Francisco, CA","Staff Software Engineer, Data Integration",2 days ago,2023-10-21T13:05:34.419Z,,,"$165,000 - $230,000 a year",2023-10-23T13:05:34.421Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=7430eb6847973...
8,Intellipro Group Inc,Duties\nThe Strategy & Operations (SS&O) team ...,<p><b>Duties</b></p>\n<p>The Strategy &amp; Op...,,095f064bfba48308,,Full-time,Contract,Remote,Data Engineer,2 days ago,2023-10-21T13:05:41.032Z,,,$70 an hour,2023-10-23T13:05:41.034Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=095f064bfba48...
9,Concentrix,Job Title: Big Data Engineer Job Ref #: 981496...,<p></p>\n<div>\n <p>Job Title:</p> Big Data En...,https://www.indeed.com/rc/clk?jk=2ba6006821dec...,2ba6006821dec6ec,,,,Remote,Big Data Engineer Job Ref #: 981496,2 days ago,2023-10-21T13:05:40.775Z,3.4,31685.0,,2023-10-23T13:05:40.776Z,US,remote,data engineer,https://www.indeed.com/rc/clk?jk=2ba6006821dec...


In [69]:
s3_filename = f"bronze/raw_indeed_jobs/"+file_name
s3_client.upload_file(file_name, AWS_S3_BUCKET, s3_filename)