# Data Preprocessing Tools

## Importing the libraries

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [17]:
# Read the salaries CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='ds_salaries.csv')
# Bare last expression: Jupyter renders a truncated preview of the frame.
data

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


## Encoding categorical data

### OneHotEncoding


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the column at position 0 (the list [0] holds the positional
# index of the column being encoded). All remaining columns are passed through
# untouched and appear to the right of the new indicator columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# The transformer returns a plain array, so the rebuilt frame carries integer
# column labels instead of the original column names.
data = pd.DataFrame(ct.fit_transform(data))
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,0.0,1.0,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,0.0,0.0,0.0,1.0,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,0.0,0.0,0.0,1.0,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,0.0,0.0,0.0,1.0,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,0.0,0.0,0.0,1.0,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,1.0,0.0,0.0,0.0,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,0.0,1.0,0.0,0.0,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,1.0,0.0,0.0,0.0,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,1.0,0.0,0.0,0.0,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


### LabelEncoding

In [19]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the last column. Classes are numbered in sorted order, which
# is why the displayed result maps L -> 0, M -> 1, S -> 2.
le = LabelEncoder()
# Replace the column by label instead of writing through `data.iloc[:, -1] = ...`.
# Since pandas 2.0 the iloc form sets values in place and keeps the column's
# existing dtype; the frame rebuilt from the transformer output appears to be
# object-dtype, so the labels would stay `object` and the extracted target
# would not be a proper integer array. Replacing the whole column gives it the
# encoder's integer dtype while leaving the displayed values unchanged.
data[data.columns[-1]] = le.fit_transform(data.iloc[:, -1])
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,0.0,1.0,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,0
1,0.0,0.0,0.0,1.0,MI,CT,ML Engineer,30000,USD,30000,US,100,US,2
2,0.0,0.0,0.0,1.0,MI,CT,ML Engineer,25500,USD,25500,US,100,US,2
3,0.0,0.0,0.0,1.0,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,1
4,0.0,0.0,0.0,1.0,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,1.0,0.0,0.0,0.0,SE,FT,Data Scientist,412000,USD,412000,US,100,US,0
3751,0.0,1.0,0.0,0.0,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,0
3752,1.0,0.0,0.0,0.0,EN,FT,Data Scientist,105000,USD,105000,US,100,US,2
3753,1.0,0.0,0.0,0.0,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,0


## Splitting the dataset

In [20]:
# Features are every column except the last; the target is the last column.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

# One print call with a newline separator reproduces the heading / blank line /
# array layout for both the features and the target.
print("Independent Variable\n", X, "\nDependent Variable\n", y, sep="\n")

Independent Variable

[[0.0 0.0 0.0 ... 'ES' 100 'ES']
 [0.0 0.0 0.0 ... 'US' 100 'US']
 [0.0 0.0 0.0 ... 'US' 100 'US']
 ...
 [1.0 0.0 0.0 ... 'US' 100 'US']
 [1.0 0.0 0.0 ... 'US' 100 'US']
 [0.0 1.0 0.0 ... 'IN' 50 'IN']]

Dependent Variable

[0 2 2 ... 2 0 0]


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state pins the shuffle so
# that re-running the notebook from a fresh kernel reproduces the same split
# (42 is an arbitrary but fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
# Show the training-set feature array (NumPy abbreviates long arrays with "...").
print(X_train)

[[0.0 0.0 0.0 ... 'US' 0 'US']
 [0.0 0.0 1.0 ... 'US' 0 'US']
 [0.0 0.0 0.0 ... 'US' 100 'US']
 ...
 [0.0 0.0 1.0 ... 'US' 0 'US']
 [0.0 0.0 0.0 ... 'US' 0 'US']
 [0.0 0.0 0.0 ... 'US' 100 'US']]


In [23]:
# Show the test-set feature array (NumPy abbreviates long arrays with "...").
print(X_test)

[[0.0 0.0 1.0 ... 'US' 50 'US']
 [0.0 0.0 0.0 ... 'US' 0 'US']
 [0.0 0.0 1.0 ... 'US' 100 'US']
 ...
 [0.0 0.0 0.0 ... 'US' 100 'US']
 [0.0 0.0 1.0 ... 'US' 0 'US']
 [0.0 0.0 0.0 ... 'GB' 100 'GB']]
