## Extract

Extract the data

In [81]:
import boto3
import pandas as pd
from io import StringIO

# create s3 client
s3 = boto3.client('s3')

# specify the bucket & folder you'd like to interact with
# will be using the cyber data as an example
name = "data-analyst-job-east"
file = "raw/gsearch_jobs.csv"

single_object = s3.get_object(Bucket=name, Key=file)
single_df = pd.read_csv(single_object['Body'])

In [82]:
single_df.head()

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,...,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
0,0,0,Data Analyst,Chloeta,"Oklahoma City, OK",via ZipRecruiter,Job Summary: The Data Analyst oversees data pr...,"['21 hours ago', 'Full-time', 'Health insuranc...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,,...,,,,,,,,,,"['python', 'r']"
1,1,1,Junior Data Analyst/Scientist Role - Contract ...,Upwork,Anywhere,via Upwork,"Company\n\nThe TAC Index provides independent,...","['17 hours ago', 'Work from home', 'Contractor...",eyJqb2JfdGl0bGUiOiJKdW5pb3IgRGF0YSBBbmFseXN0L1...,,...,,,,,,,,,,"['matlab', 'matplotlib', 'postgresql', 'mongod..."
2,2,2,Data Analyst,ATC,United States,via LinkedIn,Job Title: Entry Level Business Analyst / Prod...,"['12 hours ago', 'Full-time', 'Health insurance']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,[]
3,3,3,Data Analyst,Guidehouse,"Topeka, KS",via Nexxt,Job Family :\n\nData Science & Analysis (Digit...,"['10 hours ago', 'Full-time', 'Health insuranc...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,"['powerpoint', 'perl', 'python', 'qlik', 'tabl..."
4,4,4,Data Analyst,AnMed Health LLC,Anywhere,via LinkedIn,"AnMed is a dynamic, comprehensive health syste...","['18 hours ago', 'Work from home', 'Part-time'...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,,...,,,,,,,,,,[]


In [83]:
single_df['company_name'].value_counts()


company_name
Upwork                             6934
Talentify.io                       1683
Walmart                            1537
EDWARD JONES                        747
Dice                                691
                                   ... 
Fisher Financial  Advisors, LLC       1
e-Emphasys Technologies               1
The Marlin Alliance, Inc.             1
Climate People                        1
Techdash Telecom                      1
Name: count, Length: 11019, dtype: int64

## Transform

In [84]:
# 1) Drop staffing agencies (keywords to Staffing, Hire, Global, etc)

filtered_df = single_df[~single_df['company_name'].isin(['Upwork', 'Talentify.io'])]

company_counts = filtered_df['company_name'].value_counts()


In [85]:
filtered_df['company_name'].value_counts()

company_name
Walmart               1537
EDWARD JONES           747
Dice                   691
Corporate              612
Cox Communications     538
                      ... 
IDme                     1
Teads                    1
Jkbarnes                 1
New Globe Inc            1
Techdash Telecom         1
Name: count, Length: 11017, dtype: int64

In [86]:
filtered_df.columns

Index(['Unnamed: 0', 'index', 'title', 'company_name', 'location', 'via',
       'description', 'extensions', 'job_id', 'thumbnail', 'posted_at',
       'schedule_type', 'work_from_home', 'salary', 'search_term', 'date_time',
       'search_location', 'commute_time', 'salary_pay', 'salary_rate',
       'salary_avg', 'salary_min', 'salary_max', 'salary_hourly',
       'salary_yearly', 'salary_standardized', 'description_tokens'],
      dtype='object')

In [87]:
# 2) Most important columns: description_token, salary_standardized, title, date_time   (drop everything else)

selected_columns_df = filtered_df[['description_tokens', 'salary_standardized', 'title', 'date_time','company_name']]
selected_columns_df

Unnamed: 0,description_tokens,salary_standardized,title,date_time,company_name
0,"['python', 'r']",,Data Analyst,2023-08-03 03:00:09.849838,Chloeta
2,[],,Data Analyst,2023-08-03 03:00:09.849838,ATC
3,"['powerpoint', 'perl', 'python', 'qlik', 'tabl...",,Data Analyst,2023-08-03 03:00:09.849838,Guidehouse
4,[],,Data Analyst,2023-08-03 03:00:09.849838,AnMed Health LLC
5,[],,Research Data Analyst 1,2023-08-03 03:00:09.849838,Oregon Health & Science University
...,...,...,...,...,...
48392,"['python', 'sql', 'tableau', 'snowflake', 'r',...",103781.0,Marketing Data & BI Analyst II,2022-11-04 03:40:23.706734,EDWARD JONES
48393,[],144481.5,Lead-Data Analyst,2022-11-24 04:00:08.710801,EDWARD JONES
48394,[],144481.5,Lead-Data Analyst,2022-12-07 04:00:12.563831,EDWARD JONES
48395,[],144481.5,Lead-Data Analyst,2022-12-08 04:00:15.975728,EDWARD JONES


In [88]:
# 3) drop 39,000 missing rows of salary_standardized

selected_columns_df.isnull().sum() 


description_tokens         0
salary_standardized    34845
title                      0
date_time                  0
company_name               0
dtype: int64

In [89]:
selected_columns_df = selected_columns_df.dropna(subset=['salary_standardized'])
selected_columns_df

Unnamed: 0,description_tokens,salary_standardized,title,date_time,company_name
19,"['pl/sql', 'sql', 'tableau']",94640.0,"Data Analyst Report Writer (Level 2) _ Austin,...",2023-08-03 03:00:11.064921,OQ Point LLC
29,[],48391.2,Associate Research/Data Analyst-CES - Now Hiring,2023-08-03 03:00:13.367401,Saint Louis County Clerks Office
96,"['python', 'sql', 'r']",86320.0,Bioinformatics Analyst (NGS/OMICS/SQL/Curation...,2023-08-03 03:01:00.649092,Rangam
98,"['python', 'sql', 'r', 'tableau']",122000.0,Data Analyst,2023-08-04 03:00:13.797776,Meta
102,"['powerpoint', 'word', 'excel', 'outlook']",100000.0,Data Analyst | Workforce Management,2023-08-04 03:00:13.797776,Krispy Kreme
...,...,...,...,...,...
48392,"['python', 'sql', 'tableau', 'snowflake', 'r',...",103781.0,Marketing Data & BI Analyst II,2022-11-04 03:40:23.706734,EDWARD JONES
48393,[],144481.5,Lead-Data Analyst,2022-11-24 04:00:08.710801,EDWARD JONES
48394,[],144481.5,Lead-Data Analyst,2022-12-07 04:00:12.563831,EDWARD JONES
48395,[],144481.5,Lead-Data Analyst,2022-12-08 04:00:15.975728,EDWARD JONES


In [90]:
# 4) join in the missing data


In [99]:
import boto3
import pandas as pd
from io import StringIO

# create s3 client
s3 = boto3.client('s3')

# specify the bucket & folder you'd like to interact with
# will be using the cyber data as an example
name = "data-analyst-job-east"
file = "raw/LinkedIn_Clean_List.csv"

Second_object = s3.get_object(Bucket=name, Key=file)
Second_df = pd.read_csv(Second_object['Body'])

In [100]:
Second_df.head()

Unnamed: 0,Title,Company,Posted,Applicants,Type,Salary,Data Science,Machine Learning,Data Analyst
0,data scientist small business group,Meta,1 day ago,77,Not Applicable,169000.0,True,False,False
1,data scientist analytics,DoorDash,1 day ago,0,Entry level,188000.0,True,False,False
2,data scientist,Walgreens,1 day ago,170,Not Applicable,119324.07,True,False,False
3,jr data scientist,EVONA,1 day ago,0,Entry level,110000.0,True,False,False
4,data science intern,ActiGraph,6 days ago,0,Internship,39088.4,True,False,False


In [101]:
Second_df = Second_df.rename(columns={'Company': 'company_name'})

In [103]:
Second_df = Second_df.rename(columns={'Title': 'title'})

In [None]:
#https://medium.com/codex/data-cleaning-using-pandas-c65e20a84c4b


In [104]:
Second_df

Unnamed: 0,title,company_name,Posted,Applicants,Type,Salary,Data Science,Machine Learning,Data Analyst
0,data scientist small business group,Meta,1 day ago,77,Not Applicable,169000.00,True,False,False
1,data scientist analytics,DoorDash,1 day ago,0,Entry level,188000.00,True,False,False
2,data scientist,Walgreens,1 day ago,170,Not Applicable,119324.07,True,False,False
3,jr data scientist,EVONA,1 day ago,0,Entry level,110000.00,True,False,False
4,data science intern,ActiGraph,6 days ago,0,Internship,39088.40,True,False,False
...,...,...,...,...,...,...,...,...,...
219,safety analyst,VDart,1 day ago,0,Mid-Senior level,118502.52,False,False,False
220,staff data scientist,Walmart,5 days ago,0,Entry level,214500.00,True,False,False
221,fraud analyst,Newegg,1 day ago,37,Mid-Senior level,118502.52,False,False,False
222,ai software engineer,Zoom,1 day ago,0,Not Applicable,119324.07,False,True,False


In [None]:
#Second_df = Second_df.rename(columns={'average_salary': 'salary_standardized'})

In [None]:

Second_df = filtered_df[['title', 'company_name']]
Second_df

Unnamed: 0,title,company_name
0,Data Analyst,Chloeta
2,Data Analyst,ATC
3,Data Analyst,Guidehouse
4,Data Analyst,AnMed Health LLC
5,Research Data Analyst 1,Oregon Health & Science University
...,...,...
48392,Marketing Data & BI Analyst II,EDWARD JONES
48393,Lead-Data Analyst,EDWARD JONES
48394,Lead-Data Analyst,EDWARD JONES
48395,Lead-Data Analyst,EDWARD JONES


In [None]:
# Combine the dataframes
combined_df = pd.concat([selected_columns_df, Second_df], ignore_index=True)


In [None]:
combined_df

Unnamed: 0,description_tokens,salary_standardized,title,date_time,company_name
0,"['pl/sql', 'sql', 'tableau']",94640.0,"Data Analyst Report Writer (Level 2) _ Austin,...",2023-08-03 03:00:11.064921,OQ Point LLC
1,[],48391.2,Associate Research/Data Analyst-CES - Now Hiring,2023-08-03 03:00:13.367401,Saint Louis County Clerks Office
2,"['python', 'sql', 'r']",86320.0,Bioinformatics Analyst (NGS/OMICS/SQL/Curation...,2023-08-03 03:01:00.649092,Rangam
3,"['python', 'sql', 'r', 'tableau']",122000.0,Data Analyst,2023-08-04 03:00:13.797776,Meta
4,"['powerpoint', 'word', 'excel', 'outlook']",100000.0,Data Analyst | Workforce Management,2023-08-04 03:00:13.797776,Krispy Kreme
...,...,...,...,...,...
44710,,,Marketing Data & BI Analyst II,,EDWARD JONES
44711,,,Lead-Data Analyst,,EDWARD JONES
44712,,,Lead-Data Analyst,,EDWARD JONES
44713,,,Lead-Data Analyst,,EDWARD JONES


In [None]:
# I still need to EDA Analysis for the Combined data

In [None]:
combined_df.to_csv('output.csv', index=False)

## Load

In [None]:
# 5) Load data back into s3 bucket
import boto3

# open client
client = boto3.client('s3')

# some initial variables
bucket_name = "data-analyst-job-east"
object_key = "transformed/combined_data.csv"

# open the file in binary format, and save into the var 'data'
with open("output.csv", "rb") as f:
    data = f.read()

### ADDING OBJECTS TO A BUCKET ###
response = client.put_object(
    Body=data,
    Bucket=bucket_name,
    Key=object_key
)