In [23]:
import pandas as pd
import numpy as np

# Load Data

In [24]:
# Read the raw job postings and discard the columns this analysis does not use
irrelevant_columns = [
    'Job Title Full',
    'Job Title Additional Info',
    'Minimum Pay',
    'Maximum Pay',
    'Pay Rate',
    'Job Posting ID',
]
df = pd.read_csv('job_postings.csv').drop(columns=irrelevant_columns)

# Preview the first rows, then summarise dtypes and non-null counts
display(df.head())
df.info()

Unnamed: 0,Job Posting Date,Job Title,Job Position Type,Job Position Level,Years of Experience,Job Skills,Job Location,Number of Applicants,Company Name,Company Industry,Company Size
0,2017-01-01,Software Engineer,Full-time,Entry level,1,"database, javascript, agile, linux, server, no...",United States,6.0,"Cardinal Financial Company, Limited Partnership",Financial Services,"1,001-5,000 employees"
1,2017-01-01,Data Engineer,Full-time,Mid-Senior level,2,"data_lake, cloud, python, spark, github, wareh...",United States,1.0,Brinks Home,Consumer Electronics,"1,001-5,000 employees"
2,2017-01-01,Software Engineer,Full-time,Entry level,5,"mongo, oracle, microsoft, css, javascript, htm...",United States,16.0,Paycor,Computer Software,"1,001-5,000 employees"
3,2017-01-01,Business Analyst,Full-time,Entry level,2,"agile, excel","Phoenix, AZ",1.0,Optum,Hospital & Health Care,"10,001+ employees"
4,2017-01-01,Developer,Contract,Mid-Senior level,4,"excel, back-end, ios, swift, programming","Richmond, CA",,Toptal,Internet,"1,001-5,000 employees"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25114 entries, 0 to 25113
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Job Posting Date      25114 non-null  object 
 1   Job Title             25114 non-null  object 
 2   Job Position Type     25114 non-null  object 
 3   Job Position Level    25114 non-null  object 
 4   Years of Experience   25114 non-null  int64  
 5   Job Skills            22904 non-null  object 
 6   Job Location          25114 non-null  object 
 7   Number of Applicants  17529 non-null  float64
 8   Company Name          25053 non-null  object 
 9   Company Industry      24895 non-null  object 
 10  Company Size          24892 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 2.1+ MB


# Data Cleaning

In [25]:
# Convert Job Posting Date from text to a millisecond-resolution datetime column
df = df.astype({'Job Posting Date': 'datetime64[ms]'})

In [26]:
# Derive a coarse Job Category from keywords found in the job title.
# np.select takes the first matching condition, so the order of `keywords`
# is also the precedence when a title contains more than one keyword.
keywords = ['Analyst', 'Data Scientist', 'Engineer']
conditions = []
for keyword in keywords:
    # Case-insensitive match; missing titles count as "no match"
    conditions.append(df['Job Title'].str.contains(keyword, case=False, na=False))

# Titles matching none of the keywords are labelled 'Other'
df['Job Category'] = np.select(condlist=conditions, choicelist=keywords, default='Other')

# Check
df.head(10)

Unnamed: 0,Job Posting Date,Job Title,Job Position Type,Job Position Level,Years of Experience,Job Skills,Job Location,Number of Applicants,Company Name,Company Industry,Company Size,Job Category
0,2017-01-01,Software Engineer,Full-time,Entry level,1,"database, javascript, agile, linux, server, no...",United States,6.0,"Cardinal Financial Company, Limited Partnership",Financial Services,"1,001-5,000 employees",Engineer
1,2017-01-01,Data Engineer,Full-time,Mid-Senior level,2,"data_lake, cloud, python, spark, github, wareh...",United States,1.0,Brinks Home,Consumer Electronics,"1,001-5,000 employees",Engineer
2,2017-01-01,Software Engineer,Full-time,Entry level,5,"mongo, oracle, microsoft, css, javascript, htm...",United States,16.0,Paycor,Computer Software,"1,001-5,000 employees",Engineer
3,2017-01-01,Business Analyst,Full-time,Entry level,2,"agile, excel","Phoenix, AZ",1.0,Optum,Hospital & Health Care,"10,001+ employees",Analyst
4,2017-01-01,Developer,Contract,Mid-Senior level,4,"excel, back-end, ios, swift, programming","Richmond, CA",,Toptal,Internet,"1,001-5,000 employees",Other
5,2017-01-01,Data Engineer,Full-time,Mid-Senior level,6,"data_lake, cloud, data_lakes, python, hadoop, ...","Los Angeles, CA",,ClearScale,Information Technology & Services,51-200 employees,Engineer
6,2017-01-01,Software Developer,Full-time,Mid-Senior level,4,"database, pl/sql, cloud, javascript, agile, no...","Rochester, NY",9.0,Ellucian,Higher Education,"1,001-5,000 employees",Other
7,2017-01-01,Data Analyst,Full-time,Associate,5,"data_lake, database, python, hadoop, ibm, orac...","Charleston, SC",25.0,Perficient,Information Technology & Services,"1,001-5,000 employees",Analyst
8,2017-01-01,Data Engineer,Full-time,Mid-Senior level,9,"python, programming, etl, postgresql, linux, s...","Baltimore, MD",5.0,Medix Technology,Information Technology & Services,201-500 employees,Engineer
9,2017-01-02,Business Intelligence Analyst,Contract,Mid-Senior level,4,"r, scala, powershell, c++, java, python, table...","Raleigh, NC",100.0,OnDemand Agility Solutions,Information Technology & Services,"1,001-5,000 employees",Analyst


In [None]:
# Split Job Location into City/State
# Locations look like "Phoenix, AZ": splitting on the first comma alone leaves
# a leading space on the state (" AZ"), so both parts are stripped.
location_parts = df['Job Location'].str.split(',', n=1, expand=True)
df['City'] = location_parts[0].str.strip()
# Column 1 only exists when at least one location contains a comma; fall back
# to an all-null State rather than failing on a two-column assignment.
df['State'] = location_parts[1].str.strip() if 1 in location_parts.columns else None
# NOTE: comma-less locations such as "United States" still end up in City with
# a null State, as before - handle them separately if City must be a real city.

In [33]:
# Replace missing applicant counts with 0 (only this column is filled)
df = df.fillna({'Number of Applicants': 0})

In [36]:
# Build a one-row-per-skill table that keeps the original posting index.
# Only the Job Skills column is processed: exploding a copy of the whole frame
# would duplicate every other column once per skill just to discard them.
df_skills = (
    df['Job Skills']
    .str.split(',')   # "a, b, c" -> ['a', ' b', ' c']
    .explode()        # one row per skill; the posting index repeats
    .str.strip()      # remove whitespace around each skill
    .rename('Skills')
    .to_frame()       # single-column DataFrame named 'Skills'
)
# Postings with no Job Skills are kept as a single row with a null Skills value.

df_skills

Unnamed: 0,Skills
0,database
0,javascript
0,agile
0,linux
0,server
...,...
25113,programming
25113,etl
25113,no-sql
25113,snowflake


In [39]:
# Write the skills table to disk, including its index as the first CSV column
df_skills.to_csv(path_or_buf='job_skills.csv', index=True)

In [40]:
# Write the cleaned postings to disk, including the index as the first CSV column
df.to_csv(path_or_buf='job_postings_clean.csv', index=True)