In [40]:
import re
import os
from glob import glob

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Notebook-wide display defaults: show every DataFrame column and use the
# "fivethirtyeight" matplotlib style sheet for all figures.
pd.options.display.max_columns = None
plt.style.use("fivethirtyeight")

In [2]:
# general settings
class CFG:
    """Notebook-wide constants."""

    # the two components of the default figure size passed to matplotlib below
    img_dim1 = 20
    img_dim2 = 10


# apply the configured size as the default for every matplotlib figure
plt.rcParams["figure.figsize"] = (CFG.img_dim1, CFG.img_dim2)

In [3]:
# List the scraped job-description (JD) CSV files.
# The pattern is built with os.path.join instead of a backslash literal:
# "\s" and "\*" are invalid escape sequences in a normal string, and a
# backslash-separated path only matches anything on Windows.
# The result is sorted because glob returns files in arbitrary order, while
# the steps that follow (keep-first de-duplication, sequential Job IDs)
# depend on the order in which files are read.
files = sorted(glob(os.path.join("data", "scraped_jds", "*.csv")))
print(len(files))

56


In [4]:
# Columns kept from the raw scraped CSVs; every other column is discarded.
JD_COLUMNS = [
    "Employer",
    "First Seen Date",
    "Title (Job Title)",
    "Occupation Name",
    "City",
    "State",
    "Job_Type",
    "Education",
    "Description",
]

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the glob matched nothing.
if not files:
    raise FileNotFoundError("No scraped JD CSV files found to combine")

# Build the combined frame:
#   1. read and stack every CSV (ignore_index drops the per-file row labels),
#   2. keep only the columns of interest and shorten the title column name,
#   3. keep the first posting for each distinct description text,
#   4. drop rows without a description and renumber the rows from 0.
df = (
    pd.concat([pd.read_csv(file) for file in tqdm(files)], ignore_index=True)
    [JD_COLUMNS]
    .rename(columns={"Title (Job Title)": "Title"})
    .drop_duplicates("Description", keep="first")
    .dropna(subset=["Description"])
    .reset_index(drop=True)
)

# Sequential 1-based identifier, placed as the first column.
df.insert(0, "Job ID", range(1, len(df) + 1))
df["First Seen Date"] = pd.to_datetime(df["First Seen Date"])
print(df.shape)
df.head()

  0%|          | 0/56 [00:00<?, ?it/s]

100%|██████████| 56/56 [00:05<00:00, 10.83it/s]


(93859, 10)


Unnamed: 0,Job ID,Employer,First Seen Date,Title,Occupation Name,City,State,Job_Type,Education,Description
0,1,ACCENTURE,2023-08-03,Application Developer,"Software Developers, Applications",Multiple locations,,"Full-Time,Permanent",Bachelor's Degree,Experience:
1,2,ACCENTURE,2023-07-28,SnapLogic Application Lead,"Software Developers, Applications",Pune,Mahārāshtra,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...
2,3,ACCENTURE,2023-07-28,Machine Learning Application Architect,"Software Developers, Applications",Bangalore,Karnātaka,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...
3,4,ACCENTURE,2023-07-28,Oracle Procedural Language Extensions to SQL (...,"Software Developers, Applications",Chennai,Tamil Nādu,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...
4,5,ACCENTURE,2023-07-29,ESRI ArcGIS Application Developer,"Software Developers, Applications",Pune,Mahārāshtra,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...


In [5]:
# Print the column dtypes, non-null counts and memory usage of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93859 entries, 0 to 93858
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Job ID           93859 non-null  int64         
 1   Employer         93859 non-null  object        
 2   First Seen Date  93859 non-null  datetime64[ns]
 3   Title            93859 non-null  object        
 4   Occupation Name  93859 non-null  object        
 5   City             93859 non-null  object        
 6   State            87251 non-null  object        
 7   Job_Type         93859 non-null  object        
 8   Education        93859 non-null  object        
 9   Description      93859 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 7.2+ MB


In [6]:
# Write the combined frame to disk once; an existing file is never overwritten.
combined_csv = "data/job_description_combined.csv"
if not os.path.isfile(combined_csv):
    df.to_csv(combined_csv, index=False)
    print("File saved")
else:
    print("File already exists")

File already exists


### Exploratory Data Analysis (EDA)

In [7]:
# Row count of the combined frame = number of job descriptions.
n_jds = len(df)
print(f"Total number of Job Descriptions: {n_jds}")

Total number of Job Descriptions: 93859


In [8]:
# Count the distinct values in the Employer column.
n_companies = df["Employer"].nunique()
print(f"Number of companies considered: {n_companies}")

Number of companies considered: 47


In [9]:
# Earliest and latest "First Seen Date" in the data, in that order.
first_seen = df["First Seen Date"]
print(first_seen.min())
print(first_seen.max())

2021-08-06 00:00:00
2023-08-05 00:00:00


In [10]:
# Horizontal bar chart of the ten employers with the most job descriptions.
top_employers = df["Employer"].value_counts().head(10)
fig = px.bar(top_employers, orientation="h")
fig.update_layout(
    title="Distribution of JDs across Top 10 companies",
    xaxis_title="# JDs",
    yaxis_title="Company",
)
fig.show()

In [11]:
# Bar chart of the number of job descriptions per required degree.
# px.bar draws a vertical chart here (no orientation="h"), so the categories
# (degrees) are on the x axis and the counts on the y axis; the axis titles
# are set accordingly.
fig = px.bar(df.Education.value_counts())
fig.update_layout(xaxis_title="Degree",
                yaxis_title="# JDs",
                title="Distribution of JDs across degrees"
                )
fig.show()

In [12]:
# Bar chart of the number of job descriptions per occupation name.
# The chart is vertical (no orientation="h"): occupations are on the x axis
# and counts on the y axis. Labels and title describe occupations, not
# degrees.
fig = px.bar(df["Occupation Name"].value_counts())
fig.update_layout(xaxis_title="Occupation",
                yaxis_title="# JDs",
                title="Distribution of JDs across occupations"
                )
fig.show()

In [15]:
# Show the first two rows of the combined frame.
df.head(2)

Unnamed: 0,Job ID,Employer,First Seen Date,Title,Occupation Name,City,State,Job_Type,Education,Description
0,1,ACCENTURE,2023-08-03,Application Developer,"Software Developers, Applications",Multiple locations,,"Full-Time,Permanent",Bachelor's Degree,Experience:
1,2,ACCENTURE,2023-07-28,SnapLogic Application Lead,"Software Developers, Applications",Pune,Mahārāshtra,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...


In [31]:
# Parameters of the length heuristic and of the filter window.
CHARS_PER_TOKEN = 4   # token count is approximated as character count / 4
MIN_TOKENS = 100      # inclusive lower bound on approx_tokens
MAX_TOKENS = 1500     # inclusive upper bound on approx_tokens

# Approximate token count per description from its character length.
# Vectorized via .str.len() instead of a per-row Python lambda; Series.round
# rounds halves to even, the same rule as Python's built-in round().
df["approx_tokens"] = (
    df.Description.str.len().div(CHARS_PER_TOKEN).round().astype("int64")
)

# Keep only descriptions whose approximate length lies inside the window
# (between() is inclusive on both ends by default).
df_fil = df[df.approx_tokens.between(MIN_TOKENS, MAX_TOKENS)].reset_index(drop=True)
print(df_fil.shape)
df.head()

(77702, 11)


Unnamed: 0,Job ID,Employer,First Seen Date,Title,Occupation Name,City,State,Job_Type,Education,Description,approx_tokens
0,1,ACCENTURE,2023-08-03,Application Developer,"Software Developers, Applications",Multiple locations,,"Full-Time,Permanent",Bachelor's Degree,Experience:,3
1,2,ACCENTURE,2023-07-28,SnapLogic Application Lead,"Software Developers, Applications",Pune,Mahārāshtra,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...,480
2,3,ACCENTURE,2023-07-28,Machine Learning Application Architect,"Software Developers, Applications",Bangalore,Karnātaka,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...,657
3,4,ACCENTURE,2023-07-28,Oracle Procedural Language Extensions to SQL (...,"Software Developers, Applications",Chennai,Tamil Nādu,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...,471
4,5,ACCENTURE,2023-07-29,ESRI ArcGIS Application Developer,"Software Developers, Applications",Pune,Mahārāshtra,"Full-Time,Permanent",Bachelor's Degree,Job Description About Accenture: Accenture is ...,543


### Create Sample Data

In [36]:
# Draw a fixed-size random subset of the filtered descriptions; the seed
# makes the draw repeatable for a given input frame.
sample_size, sample_seed = 1200, 100
df_sample = df_fil.sample(n=sample_size, random_state=sample_seed)
df_sample = df_sample.reset_index(drop=True)
print(df_sample.shape)
df_sample.head()

(1200, 11)


Unnamed: 0,Job ID,Employer,First Seen Date,Title,Occupation Name,City,State,Job_Type,Education,Description,approx_tokens
0,43989,Infosys Technologies,2022-10-12,BigData Hadoop Production Support Lead,"Software Developers, Applications",Richardson,Texas,"Full-Time,Permanent",Bachelor's Degree,1889319 No 5439 2037 1 10/12/2022 BigData Hado...,954
1,61859,META,2023-05-27,"Software Engineer, front end","Software Developers, Applications",Multiple locations,,"Full-Time,Permanent",Bachelor's Degree,We are the teams who create all of Meta's prod...,773
2,57114,Leidos,2023-01-20,Senior UI/UX Software Engineer,"Software Developers, Applications",Herndon,Virginia,"Full-Time,Permanent",Bachelor's Degree,Description Job Description: Job Description L...,872
3,2701,ACCENTURE,2023-02-20,Recruit for thr post of Spring Boot Applicatio...,"Software Developers, Applications",Indore,Madhya Pradesh,"Full-Time,Permanent",Bachelor's Degree,About Accenture: Accenture is a global profess...,417
4,10993,American Express,2022-05-03,Principal UX Designer - Identity & Access Mana...,"Software Developers, Applications",Springfield,Illinois,"Full-Time,Permanent",Bachelor's Degree,Description You Lead the Way. We’ve Got Your B...,1405


In [50]:
# Numbered list of the distinct employers in the sample, in order of first
# appearance.
print("Companies we are covering in analysis")
companies = df_sample["Employer"].unique()
for position, company in enumerate(companies, start=1):
    print(f"\t{position}. {company}")

Companies we are covering in analysis
	1. Infosys Technologies
	2. META
	3. Leidos
	4. ACCENTURE
	5. American Express
	6. Nvidia Corporation
	7. Shell
	8. Apple
	9. Qualcomm
	10. State Street Corporation
	11. Intel
	12. Amazon
	13. Bank Of America
	14. Lockheed Martin
	15. Cognizant Technology Solutions
	16. JPMorgan Chase & Co.
	17. Google
	18. Capital One
	19. Siemens
	20. Wells Fargo
	21. Flipkart
	22. SAP
	23. Ernst & Young
	24. IBM
	25. GOLDMAN SACHS
	26. Oracle
	27. DELL
	28. Paytm
	29. S&P Global Inc.
	30. Tata Consultancy Services
	31. Walmart
	32. The Boeing Company
	33. Morgan Stanley
	34. Unitedhealth Group
	35. CITI
	36. Tata Group
	37. KPMG
	38. NTT DATA
	39. Lyft
	40. Tesla Motors
	41. Target Corporation
	42. Microsoft
	43. Robinhood
	44. Fidelity Investments


In [46]:
# Hold out 15% of the sample as the test split; the fixed seed keeps the
# partition repeatable.
train, test = train_test_split(df_sample, test_size=0.15, random_state=100)
for split in (train, test):
    print(split.shape)

(1020, 11)
(180, 11)


In [48]:
# Write each split to its own CSV (train first, then test), without the
# row index.
split_outputs = {
    "data/job_description_train.csv": train,
    "data/job_description_test.csv": test,
}
for csv_path, split_df in split_outputs.items():
    split_df.to_csv(csv_path, index=False)

In [60]:
# Training rows whose description contains a dollar sign and mentions
# "salary" (case-insensitive). Both are literal substring checks, so regex
# is switched off; this also removes the invalid "\$" escape sequence that
# the non-raw pattern string contained.
has_dollar = train.Description.str.contains("$", regex=False)
mentions_salary = train.Description.str.lower().str.contains("salary", regex=False)
train[has_dollar & mentions_salary]

Unnamed: 0,Job ID,Employer,First Seen Date,Title,Occupation Name,City,State,Job_Type,Education,Description,approx_tokens
356,66525,Nvidia Corporation,2023-07-17,"Principal Distributed Systems Engineer, AI inf...","Software Developers, Applications",Multiple locations,,"Full-Time,Permanent",Master's Degree,We are now looking for a Principal Distributed...,1052
753,79209,Target Corporation,2023-06-30,Senior Engineer,"Software Developers, Applications",Minneapolis,Minnesota,"Full-Time,Permanent",Bachelor's Degree,"The pay range is: $130,998.00 - $150,800.00 In...",774
587,45318,Intel,2023-01-27,EDA Tools Software Engineer Intern,"Software Developers, Applications",Nashville,Tennessee,"Full-Time,Internship",Master's Degree,"Intel Posted Under: Nashville, Tennessee jobs ...",1317
562,59598,Lockheed Martin,2023-02-09,Software Engineer,"Software Developers, Applications",Huntsville,Alabama,"Full-Time,Permanent",Bachelor's Degree,experience: Not Specified Job ID: 629268BR Dat...,1298
389,86715,Tesla Motors,2023-05-23,Senior Software Engineer - Supercharger,"Software Developers, Applications",Palo Alto,California,"Full-Time,Permanent",Bachelor's Degree,What to Expect We are the SuperchargerSoftware...,952
...,...,...,...,...,...,...,...,...,...,...,...
437,66670,Nvidia Corporation,2023-07-07,Senior Software Platform Integration Engineer ...,"Software Developers, Applications",Santa Clara,California,"Full-Time,Permanent",Bachelor's Degree,The NVIDIA GPU Cloud (NGC) organization is loo...,1067
63,32987,Google,2022-04-09,"Software Engineer III, Google Photos","Software Developers, Applications",Unavailable,California,"Full-Time,Permanent",Bachelor's Degree,Minimum qualifications: - Bachelor’s degree or...,708
478,30770,Google,2022-12-09,"Software Engineer III, Mobile (Android), Geo","Software Developers, Applications",Unavailable,Unavailable,"Full-Time,Permanent",Bachelor's Degree,Minimum qualifications: - Bachelor’s degree or...,946
2,57114,Leidos,2023-01-20,Senior UI/UX Software Engineer,"Software Developers, Applications",Herndon,Virginia,"Full-Time,Permanent",Bachelor's Degree,Description Job Description: Job Description L...,872
