# EDA Job Descriptions

In [1]:
from text_data_toolkit import data_cleaning as dc
from text_data_toolkit import data_transformation as dt
from text_data_toolkit import eda
from text_data_toolkit import file_operations as fop
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# check out files in data folder
fop.list_files('../data')

['Restaurant_Reviews_Test.tsv',
 'DataScientist.csv',
 'job_postings.csv',
 'Restaurant_Reviews.csv',
 'job_summary_mini.csv',
 'job_skills.csv',
 'Restaurant_Reviews.tsv',
 'job_summary.csv',
 'Restaurant_Reviews2.txt']

In [3]:
# import data to dfs using method from data_cleaning

files = ['../data/DataScientist.csv',
         '../data/job_summary.csv',
         '../data/job_skills.csv',
         '../data/job_postings.csv']

dfs = dc.load_text_to_df(files, columns = None, line_length = 1)


In [4]:
dfs

{'DataScientist':       Unnamed: 0  index                                          Job Title  \
 0              0      0                              Senior Data Scientist   
 1              1      1                  Data Scientist, Product Analytics   
 2              2      2                               Data Science Manager   
 3              3      3                                       Data Analyst   
 4              4      4                             Director, Data Science   
 ...          ...    ...                                                ...   
 3904        3904   4375                                  AWS Data Engineer   
 3905        3905   4376                              Data Analyst â Junior   
 3906        3906   4377                   Security Analytics Data Engineer   
 3907        3907   4378                   Security Analytics Data Engineer   
 3908        3908   4379  Patient Safety Physician or Safety Scientist -...   
 
                    Salary Estima

In [5]:
#
df_ds_jobs_raw = dfs['DataScientist']
df_ds_jobs_raw.head()

Unnamed: 0.1,Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,0,0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1
1,1,1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1
2,2,2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True
3,3,3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1
4,4,4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1


In [6]:
df_job_summaries_raw = dfs['job_summary']
df_job_summaries_raw.shape

(12217, 2)

In [7]:
df_job_skills_raw = dfs['job_skills']
df_job_skills_raw.shape

(12217, 2)

In [8]:
df_job_postings_raw = dfs['job_postings']
df_job_postings_raw.shape

(12217, 15)

The job_ csvs have the same amount of rows in all files so we may be able to join these into one df.


In [9]:
print("First index")
print(df_job_summaries_raw.iloc[0, 0])
print(df_job_skills_raw.iloc[0, 0])
print(df_job_postings_raw.iloc[0, 0])
print("Random Index")
print(df_job_summaries_raw.iloc[876, 0])
print(df_job_skills_raw.iloc[876, 0])
print(df_job_postings_raw.iloc[876, 0])


First index
https://www.linkedin.com/jobs/view/senior-machine-learning-engineer-at-jobs-for-humanity-3804053819
https://www.linkedin.com/jobs/view/senior-machine-learning-engineer-at-jobs-for-humanity-3804053819
https://www.linkedin.com/jobs/view/senior-machine-learning-engineer-at-jobs-for-humanity-3804053819
Random Index
https://uk.linkedin.com/jobs/view/senior-data-analyst-financial-crime-at-nala-3805461162
https://uk.linkedin.com/jobs/view/senior-data-analyst-financial-crime-at-nala-3805461162
https://uk.linkedin.com/jobs/view/senior-data-analyst-financial-crime-at-nala-3805461162


Upon double checking, the rows do line up with eachother by index, so this would be an easy join. They could have been joined off of job_link either way

In [10]:
# join the job_ tables
merged_jobs_raw_2 = pd.merge(df_job_summaries_raw, df_job_postings_raw, left_index=True, right_index=True)
merged_jobs_raw = pd.merge(merged_jobs_raw_2, df_job_skills_raw, left_index=True, right_index=True)
print(merged_jobs_raw.shape) # check column count
merged_jobs_raw.head() # check that merge worked

(12217, 19)


Unnamed: 0,job_link_x,job_summary,job_link_y,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_link,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,Company Description\nJobs for Humanity is part...,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,https://www.linkedin.com/jobs/view/senior-mach...,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,Who We Are\nAurora (Nasdaq: AUR) is delivering...,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,https://www.linkedin.com/jobs/view/principal-s...,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,"Location: New York City, NY\nPosition Summary\...",https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,https://www.linkedin.com/jobs/view/senior-etl-...,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,Responsibilities:\nCandidate must have signifi...,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,https://www.linkedin.com/jobs/view/senior-data...,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,Dice is the leading career destination for tec...,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,https://www.linkedin.com/jobs/view/lead-data-e...,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


# Clean Job Summaries


In [11]:
# clean df of whitespaces and punctuation
merged_jobs_raw_clean_summary = dc.clean_dataframe_no_dups(merged_jobs_raw, "job_summary")
merged_jobs_raw_clean_summary.head()

Unnamed: 0,job_link_x,job_summary,job_link_y,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_link,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,company description jobs for humanity is partn...,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,https://www.linkedin.com/jobs/view/senior-mach...,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,who we are aurora nasdaq aur is delivering the...,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,https://www.linkedin.com/jobs/view/principal-s...,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,location new york city ny position summary our...,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,https://www.linkedin.com/jobs/view/senior-etl-...,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,responsibilities candidate must have significa...,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,https://www.linkedin.com/jobs/view/senior-data...,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,dice is the leading career destination for tec...,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,https://www.linkedin.com/jobs/view/lead-data-e...,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


In [12]:
# skill count
dt.label_job_skills(merged_jobs_raw_clean_summary, "job_summary", custom_skills=None)

{'react': 315,
 'python': 6746,
 'scikit': 349,
 'sql': 12401,
 'pyspark': 496,
 'tableau': 2263,
 'regex': 10,
 'pytorch': 650,
 'html': 203,
 'javascript': 244,
 'nlp': 630,
 'snowflake': 2295,
 'cloud': 10457}