In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the data science job listings dataset into a DataFrame and display it
listings_csv_path = "resources/data_cleaned_2021.csv"
data_science_jobs_df = pd.read_csv(filepath_or_buffer=listings_csv_path)
data_science_jobs_df

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,tensor,hadoop,tableau,bi,flink,mongo,google_an,job_title_sim,seniority_by_title,Degree
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 - 1000,1973,...,0,0,1,1,0,0,0,data scientist,na,M
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+,1984,...,0,0,0,0,0,0,0,data scientist,na,M
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 - 1000,2010,...,0,0,0,0,0,0,0,data scientist,na,M
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 - 5000,1965,...,0,0,0,0,0,0,0,data scientist,na,na
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 - 200,1998,...,0,0,0,0,0,0,0,data scientist,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,950,"Sr Scientist, Immuno-Oncology - Oncology",$58K-$111K (Glassdoor est.),Site Name: USA - Massachusetts - Cambridge\nPo...,3.9,GSK\n3.9,"Cambridge, MA","Brentford, United Kingdom",10000+,1830,...,0,0,0,0,0,0,0,other scientist,sr,M
738,951,Senior Data Engineer,$72K-$133K (Glassdoor est.),THE CHALLENGE\nEventbrite has a world-class da...,4.4,Eventbrite\n4.4,"Nashville, TN","San Francisco, CA",1001 - 5000,2006,...,0,1,0,0,0,0,0,data engineer,sr,na
739,952,"Project Scientist - Auton Lab, Robotics Institute",$56K-$91K (Glassdoor est.),The Auton Lab at Carnegie Mellon University is...,2.6,Software Engineering Institute\n2.6,"Pittsburgh, PA","Pittsburgh, PA",501 - 1000,1984,...,0,0,0,0,0,0,0,other scientist,na,P
740,953,Data Science Manager,$95K-$160K (Glassdoor est.),Data Science ManagerResponsibilities:\n\nOvers...,3.2,"Numeric, LLC\n3.2","Allentown, PA","Chadds Ford, PA",1 - 50,-1,...,0,0,0,0,0,0,0,data scientist,na,na


In [3]:
# Show all column names to see which are relevant to our analysis
data_science_jobs_df.columns

Index(['index', 'Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'Hourly', 'Employer provided', 'Lower Salary', 'Upper Salary',
       'Avg Salary(K)', 'company_txt', 'Job Location', 'Age', 'Python',
       'spark', 'aws', 'excel', 'sql', 'sas', 'keras', 'pytorch', 'scikit',
       'tensor', 'hadoop', 'tableau', 'bi', 'flink', 'mongo', 'google_an',
       'job_title_sim', 'seniority_by_title', 'Degree'],
      dtype='object')

In [5]:
# For the purposes of the data analysis, only the relevant columns are kept;
# every column listed here is removed.
columns_to_drop = [
    'Salary Estimate', 'Job Description', 'Headquarters', 'Revenue', 'Competitors',
    'company_txt', 'spark', 'aws', 'sql', 'sas', 'keras', 'pytorch', 'scikit', 'tensor', 'hadoop',
    'tableau', 'bi', 'flink', 'mongo', 'google_an', 'job_title_sim', 'Hourly', 'Employer provided',
    'Python', 'excel', 'Age', 'Lower Salary', 'Upper Salary', 'Type of ownership', 'Founded',
]

# Rows to exclude, expressed as boolean masks over the raw frame:
#   - positions marked 'sr' in the 'seniority_by_title' column
#   - positions that require a master's ('M') or PhD ('P') in the 'Degree' column
#   - rows with an unknown company size
is_senior = data_science_jobs_df['seniority_by_title'] == 'sr'
needs_advanced_degree = data_science_jobs_df['Degree'].isin(['M', 'P'])
has_unknown_size = data_science_jobs_df['Size'] == 'unknown'

# Apply the column drop and the row filter in one pass instead of a series of
# in-place drops. Dropping columns keeps the original index, so the masks built
# on the raw frame align with it. The trailing .copy() makes the result an
# independent frame that later cells can modify safely.
data_analysis_df = (
    data_science_jobs_df
    .drop(columns=columns_to_drop)
    .loc[~(is_senior | needs_advanced_degree | has_unknown_size)]
    .copy()
)

In [6]:
# Print a numbered list of every distinct job title so the ones that should
# not be included can be picked out easily.
# enumerate(..., start=1) supplies the 1-based numbering.
for counter, job in enumerate(data_analysis_df['Job Title'].unique(), start=1):
    print("{}.{}".format(counter, job))

1.Data Scientist
2.Data Analyst
3.Data Engineer I
4.Data Scientist - Health Data Analytics
5.Digital Health Data Scientist
6.Associate Data Analyst
7.Data Scientist / Machine Learning Expert
8.Web Data Analyst
9.Data Engineer
10.College Hire - Data Scientist - Open to December 2019 Graduates
11.Data Scientist, Office of Data Science
12.Data Science Analyst
13.Data Scientist - Research
14.Analytics Consultant
15.Jr. Business Data Analyst
16.Data Management Specialist
17.E-Commerce Data Analyst
18.Insurance Data Scientist
19.Data Modeler
20.Data Analyst / Scientist
21.Data Scientist, Rice University
22.Financial Data Analyst
23.Ag Data Scientist
24.Project Scientist
25.Data Analytics Manager
26.Machine Learning Engineer
27.Data Analyst - Asset Management
28.MongoDB Data Engineer II
29.Medical Lab Scientist
30.Risk and Analytics IT, Data Scientist
31.Analytics Manager
32.Digital Marketing & ECommerce Data Analyst
33.MED TECH/LAB SCIENTIST - LABORATORY
34.VP, Data Science
35.Radar Data Ana

In [8]:
# Job titles to exclude, picked by hand from the list of jobs above: titles
# that do not sound entry level, and any that are not relevant.
# Each title must match the 'Job Title' value exactly (whole string).
unwanted_job_titles = [
    'College Hire - Data Scientist - Open to December 2019 Graduates',
    'Data Analytics Manager',
    'MongoDB Data Engineer II',
    'VP, Data Science',
    'Staff Machine Learning Engineer',
    'Staff Scientist',
    'Director - Data, Privacy and AI Governance',
    'Associate Data Analyst- Graduate Development Program',
    'IT - Data Engineer II',
    'Staff Scientist- Upstream PD',
    'Director Data Science',
    'Associate Machine Learning Engineer / Data Scientist May 2020 Undergrad',
    'Data Science Manager',
    'Data Analyst 2 (Missionary Department)',
    'Manager of Data Science',
    'Quality Control Scientist III- Analytical Development',
    'Software Engineer Staff Scientist: Human Language Technologies',
    'Data Scientist / Machine Learning Expert',
    'Data Management Specialist',
    'Project Scientist',
    'Business Data Analyst, SQL',
    'Systems Engineer II - Data Analyst',
    'Manager, Safety Scientist, Medical Safety & Risk Management',
    'Revenue Analytics Manager',
    'Associate Data Analyst',
    'Associate Scientist, LC/MS Biologics',
    'Corporate Risk Data Analyst (SQL Based) - Milwaukee or',
    'Marketing Data Analyst, May 2020 Undergrad',
    'IT Associate Data Analyst',
]


# Remove every row whose position is in unwanted_job_titles with a single
# vectorized membership test rather than one in-place drop per title.
# .copy() makes the filtered result an independent frame.
is_unwanted_title = data_analysis_df['Job Title'].isin(unwanted_job_titles)
data_analysis_df = data_analysis_df.loc[~is_unwanted_title].copy()


data_analysis_df

Unnamed: 0,index,Job Title,Rating,Company Name,Location,Size,Industry,Sector,Avg Salary(K),Job Location,seniority_by_title,Degree
3,3,Data Scientist,3.8,PNNL\n3.8,"Richland, WA",1001 - 5000,Energy,"Oil, Gas, Energy & Utilities",76.5,WA,na,na
4,4,Data Scientist,2.9,Affinity Solutions\n2.9,"New York, NY",51 - 200,Advertising & Marketing,Business Services,114.5,NY,na,na
5,5,Data Scientist,3.4,CyrusOne\n3.4,"Dallas, TX",201 - 500,Real Estate,Real Estate,95.0,TX,na,na
6,6,Data Scientist,4.1,ClearOne Advantage\n4.1,"Baltimore, MD",501 - 1000,Banks & Credit Unions,Finance,73.5,MD,na,na
9,9,Data Scientist,4.6,<intent>\n4.6,"New York, NY",51 - 200,Internet,Information Technology,140.0,NY,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...
711,919,MED TECH/LAB SCIENTIST- SOUTH COASTAL LAB,3.6,Beebe Healthcare\n3.6,"Millville, DE",1001 - 5000,Health Care Services & Hospitals,Health Care,56.5,DE,na,na
714,924,Data Scientist,3.2,"Numeric, LLC\n3.2","Philadelphia, PA",1 - 50,Staffing & Outsourcing,Business Services,128.5,PA,na,na
715,926,Scientist - Analytical Services,3.1,Reynolds American\n3.1,"Winston-Salem, NC",5001 - 10000,Consumer Products Manufacturing,Manufacturing,99.5,NC,na,na
724,936,ENVIRONMENTAL ENGINEER/SCIENTIST,3.3,Mcphail Associates\n3.3,"Cambridge, MA",1 - 50,Construction,"Construction, Repair & Maintenance",55.0,MA,na,na


In [10]:
# Export data_analysis_df to the resources folder so it can be used in the data analysis file
# (the export line below is currently commented out)
#data_analysis_df.to_csv("resources/data_analysis_df.csv")