# MSA 2023 Phase 2 - Part 1: Regression Dataset

# Data Salaries (data_salaries.csv)

Within the world of data, there are specialisations such as data engineer, machine learning engineer, data analyst, and many more. How much these jobs pay depends on a variety of factors. Your task is to predict the salary of a data-related job given certain information about it.

In [1]:
# Takes around 45 secs to load in all libraries
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Find all variables and understand them

In [2]:
# Read the salaries CSV into a DataFrame (comma-separated, header row inferred)
salary_data = pd.read_csv(
    'data_salaries.csv',
    sep=',',
    header='infer',
)

In [3]:
# Print a structural summary of the frame: column names, non-null counts,
# dtypes and memory usage
salary_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [4]:
# Preview the first 10 rows of the raw data
salary_data.head(n=10)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
5,2023,SE,FT,Applied Scientist,222200,USD,222200,US,0,US,L
6,2023,SE,FT,Applied Scientist,136000,USD,136000,US,0,US,L
7,2023,SE,FT,Data Scientist,219000,USD,219000,CA,0,CA,M
8,2023,SE,FT,Data Scientist,141000,USD,141000,CA,0,CA,M
9,2023,SE,FT,Data Scientist,147100,USD,147100,US,0,US,M


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
(
    salary_data
    .select_dtypes(include="number")
    .describe()
)

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [7]:
# Summary statistics (count, unique, top, freq) for the text (object) columns
(
    salary_data
    .select_dtypes(include="object")
    .describe()
)

Unnamed: 0,experience_level,employment_type,job_title,salary_currency,employee_residence,company_location,company_size
count,3755,3755,3755,3755,3755,3755,3755
unique,4,4,93,20,78,72,3
top,SE,FT,Data Engineer,USD,US,US,M
freq,2516,3718,1040,3224,3004,3040,3153


In [8]:
# Count missing values in each column
salary_data.isna().sum()


work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

## 2. Clean data

In [9]:
from sklearn import preprocessing


# Convert the 7 text (object-dtype) columns into integer codes.
#
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, so every column's code -> label mapping remains available
# afterwards (e.g. label_encoders['job_title'].inverse_transform(codes)).
# Refitting one shared encoder on each column in turn would keep only the
# mapping of the last column.
#
# NOTE: LabelEncoder numbers labels in sorted order, so ordered categories
# such as experience_level and company_size are NOT coded in rank order
# (company_size becomes L -> 0, M -> 1, S -> 2).
# TODO(review): scikit-learn documents LabelEncoder as an encoder for target
# values; consider OrdinalEncoder with explicit category order for features.
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'salary_currency',
    'employee_residence',
    'company_location',
    'company_size',
]

label_encoders = {}
for column in categorical_columns:
    # After the loop `label_encoder` refers to the company_size encoder,
    # which is also the last column this cell encodes.
    label_encoder = preprocessing.LabelEncoder()
    salary_data[column] = label_encoder.fit_transform(salary_data[column])
    label_encoders[column] = label_encoder

salary_data.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,3,2,84,80000,7,85847,26,100,25,0
1,2023,2,0,66,30000,19,30000,75,100,70,2
2,2023,2,0,66,25500,19,25500,75,100,70,2
3,2023,3,2,47,175000,19,175000,11,100,12,1
4,2023,3,2,47,120000,19,120000,11,100,12,1


## 3. Visualise data

In [None]:
#

## 4. Identify correlated variables

In [None]:
#

## 5. Summary