In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy  as np
import seaborn as sns
sns.set_style("darkgrid")


In [4]:
df = pd.read_csv("ds_salaries.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S


In [5]:
df.isna().sum()

Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.drop(columns="Unnamed: 0", axis=1, inplace=True)

In [8]:
df.head(2)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           607 non-null    int64 
 1   experience_level    607 non-null    object
 2   employment_type     607 non-null    object
 3   job_title           607 non-null    object
 4   salary              607 non-null    int64 
 5   salary_currency     607 non-null    object
 6   salary_in_usd       607 non-null    int64 
 7   employee_residence  607 non-null    object
 8   remote_ratio        607 non-null    int64 
 9   company_location    607 non-null    object
 10  company_size        607 non-null    object
dtypes: int64(4), object(7)
memory usage: 52.3+ KB


In [10]:
df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [11]:
y = ['experience_level', 'employment_type','salary_in_usd', 'employee_residence', 'company_location', 'company_size']
df=df[y]
df.head(2)

Unnamed: 0,experience_level,employment_type,salary_in_usd,employee_residence,company_location,company_size
0,MI,FT,79833,DE,DE,L
1,SE,FT,260000,JP,JP,S


In [12]:
df["employee_residence"].value_counts().head()

US    332
GB     44
IN     30
CA     29
DE     25
Name: employee_residence, dtype: int64

In [13]:
def reduce_residence(col, length):
    categories_length={}
    for i in range(len(col)):
        if col.values[i]>=length:
            categories_length[col.index[i]] = col.index[i]
        else:
            categories_length[col.index[i]] = "others"
    return categories_length

In [14]:
reduced_residence = reduce_residence(df.employee_residence.value_counts(), 10)
df["employee_residence"] = df["employee_residence"].map(reduced_residence)

In [15]:
df.head(2)

Unnamed: 0,experience_level,employment_type,salary_in_usd,employee_residence,company_location,company_size
0,MI,FT,79833,DE,DE,L
1,SE,FT,260000,others,JP,S


In [16]:
df["employee_residence"].value_counts()

US        332
others    101
GB         44
IN         30
CA         29
DE         25
FR         18
ES         15
GR         13
Name: employee_residence, dtype: int64

In [17]:
df["employee_residence"].replace({"US": "United States", 
                                   "GB":"Great Britain", 
                                   'IN': "India", 
                                   "CA":"Canada", 
                                   "DE":"Germany",
                                   "FR":"France",
                                  "ES":"Spain",
                                  "GR": "Greece"}, inplace=True)

In [18]:
df["employee_residence"].value_counts()

United States    332
others           101
Great Britain     44
India             30
Canada            29
Germany           25
France            18
Spain             15
Greece            13
Name: employee_residence, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   experience_level    607 non-null    object
 1   employment_type     607 non-null    object
 2   salary_in_usd       607 non-null    int64 
 3   employee_residence  607 non-null    object
 4   company_location    607 non-null    object
 5   company_size        607 non-null    object
dtypes: int64(1), object(5)
memory usage: 28.6+ KB


In [20]:
df.company_location.value_counts()

US    355
GB     47
CA     30
DE     28
IN     24
FR     15
ES     14
GR     11
JP      6
NL      4
AT      4
PL      4
PT      4
BR      3
LU      3
PK      3
TR      3
DK      3
MX      3
AE      3
AU      3
CZ      2
SI      2
BE      2
CH      2
CN      2
RU      2
IT      2
NG      2
HN      1
CL      1
HU      1
VN      1
IE      1
IL      1
AS      1
CO      1
MY      1
IQ      1
UA      1
IR      1
MT      1
NZ      1
RO      1
MD      1
EE      1
DZ      1
SG      1
KE      1
HR      1
Name: company_location, dtype: int64

In [21]:
reduced_location = reduce_residence(df.company_location.value_counts(), 10)

In [22]:
df["company_location"] = df["company_location"].map(reduced_location)

In [23]:
df["company_location"].value_counts()

US        355
others     83
GB         47
CA         30
DE         28
IN         24
FR         15
ES         14
GR         11
Name: company_location, dtype: int64

In [24]:
df["company_location"].replace({"US": "United States", 
                                   "GB":"Great Britain", 
                                   'IN': "India", 
                                   "CA":"Canada", 
                                   "DE":"Germany",
                                   "FR":"France",
                                  "ES":"Spain",
                                  "GR": "Greece"}, inplace=True)

In [25]:
df["company_location"].value_counts()

United States    355
others            83
Great Britain     47
Canada            30
Germany           28
India             24
France            15
Spain             14
Greece            11
Name: company_location, dtype: int64

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   experience_level    607 non-null    object
 1   employment_type     607 non-null    object
 2   salary_in_usd       607 non-null    int64 
 3   employee_residence  607 non-null    object
 4   company_location    607 non-null    object
 5   company_size        607 non-null    object
dtypes: int64(1), object(5)
memory usage: 28.6+ KB


In [27]:
df.experience_level.value_counts()

SE    280
MI    213
EN     88
EX     26
Name: experience_level, dtype: int64

In [28]:
df["experience_level"].replace({"SE": "Senior Level", 
                                   "MI":"Mid level", 
                                   'EN': "Entry Level", 
                                   "EX":"Executive"}, inplace=True)

In [29]:
df.experience_level.value_counts()

Senior Level    280
Mid level       213
Entry Level      88
Executive        26
Name: experience_level, dtype: int64

In [30]:
df.employment_type.value_counts()

FT    588
PT     10
CT      5
FL      4
Name: employment_type, dtype: int64

In [31]:
df["employment_type"].replace({"FT": "Full Time", 
                                   "PT":"Part Time", 
                                   'CT': "Contract Type", 
                                   "FL":"Freelance"}, inplace=True)

In [32]:
df.employment_type.value_counts()

Full Time        588
Part Time         10
Contract Type      5
Freelance          4
Name: employment_type, dtype: int64

In [33]:
df.company_size.value_counts()

M    326
L    198
S     83
Name: company_size, dtype: int64

In [34]:
df["company_size"].replace({"M": "Medium Size", 
                                   "L":"Large", 
                                   'S': "Small"}, inplace=True)

In [35]:
df.company_size.value_counts()

Medium Size    326
Large          198
Small           83
Name: company_size, dtype: int64

In [36]:
df.head()

Unnamed: 0,experience_level,employment_type,salary_in_usd,employee_residence,company_location,company_size
0,Mid level,Full Time,79833,Germany,Germany,Large
1,Senior Level,Full Time,260000,others,others,Small
2,Senior Level,Full Time,109024,Great Britain,Great Britain,Medium Size
3,Mid level,Full Time,20000,others,others,Small
4,Senior Level,Full Time,150000,United States,United States,Large


In [37]:
X = df.drop(columns="salary_in_usd")
y = df.salary_in_usd

In [38]:
X.head()

Unnamed: 0,experience_level,employment_type,employee_residence,company_location,company_size
0,Mid level,Full Time,Germany,Germany,Large
1,Senior Level,Full Time,others,others,Small
2,Senior Level,Full Time,Great Britain,Great Britain,Medium Size
3,Mid level,Full Time,others,others,Small
4,Senior Level,Full Time,United States,United States,Large


In [39]:
y.head()

0     79833
1    260000
2    109024
3     20000
4    150000
Name: salary_in_usd, dtype: int64

In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [41]:
X.columns

Index(['experience_level', 'employment_type', 'employee_residence',
       'company_location', 'company_size'],
      dtype='object')

In [42]:
transform_columns = make_column_transformer((OneHotEncoder(), ['experience_level', 
                                                               'employment_type', 
                                                               'employee_residence',
                                                               'company_location', 
                                                               'company_size']))

In [43]:
transform_columns.fit_transform(X)

<607x29 sparse matrix of type '<class 'numpy.float64'>'
	with 3035 stored elements in Compressed Sparse Row format>

In [44]:
model_1 = LinearRegression()
model_2 = LassoCV()
model_3 = RidgeCV()
model_4 = RandomForestRegressor()
model_5 = DecisionTreeRegressor()

In [45]:
my_pipeline_1 = make_pipeline(transform_columns, model_1)
my_pipeline_2 = make_pipeline(transform_columns, model_2)
my_pipeline_3 = make_pipeline(transform_columns, model_3)
my_pipeline_4 = make_pipeline(transform_columns, model_4)
my_pipeline_5 = make_pipeline(transform_columns, model_5)

In [46]:
cv_1 = -1 * cross_val_score(my_pipeline_1, X, y, cv=5, scoring="neg_mean_absolute_error").mean()
cv_2 = -1 * cross_val_score(my_pipeline_2, X, y, cv=5, scoring="neg_mean_absolute_error").mean()
cv_3 = -1 * cross_val_score(my_pipeline_3, X, y, cv=5, scoring="neg_mean_absolute_error").mean()
cv_4 = -1 * cross_val_score(my_pipeline_4, X, y, cv=5, scoring="neg_mean_absolute_error").mean()
cv_5 = -1 * cross_val_score(my_pipeline_5, X, y, cv=5, scoring="neg_mean_absolute_error").mean()

In [47]:
print(f"Linear Regression: {cv_1}")
print(f"\nLassoCV:{cv_2}")
print(f"\nRidgeCV: {cv_3}")
print(f"\nRandomForest: {cv_4}")
print(f"\nDecisionTree: {cv_5}")

Linear Regression: 36742.62147403757

LassoCV:36370.24574013852

RidgeCV: 36444.01376731368

RandomForest: 35892.114564245545

DecisionTree: 36844.09680237786


In [48]:
my_pipeline_4.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['experience_level',
                                                   'employment_type',
                                                   'employee_residence',
                                                   'company_location',
                                                   'company_size'])])),
                ('randomforestregressor', RandomForestRegressor())])

In [49]:
my_pipeline_1.fit(X,y)
my_pipeline_2.fit(X,y)
my_pipeline_3.fit(X,y)
my_pipeline_5.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['experience_level',
                                                   'employment_type',
                                                   'employee_residence',
                                                   'company_location',
                                                   'company_size'])])),
                ('decisiontreeregressor', DecisionTreeRegressor())])

In [50]:
X_new = X.sample(5, random_state=99)
X_new

Unnamed: 0,experience_level,employment_type,employee_residence,company_location,company_size
177,Mid level,Full Time,others,others,Large
3,Mid level,Full Time,others,others,Small
10,Entry Level,Full Time,France,France,Small
48,Mid level,Full Time,United States,United States,Large
64,Senior Level,Full Time,others,others,Small


In [51]:
my_pipeline_4.predict(X_new)

array([ 56689.87489969,  39404.32489866,  51127.30967857, 138051.92623471,
       107962.89822203])

In [52]:
my_columns = ['experience_level', 'employment_type', 'employee_residence',
       'company_location', 'company_size']
a = np.array([["Senior Level","Full Time","United States", "United States", "Large"]])
a = pd.DataFrame(a, columns = my_columns)
a

Unnamed: 0,experience_level,employment_type,employee_residence,company_location,company_size
0,Senior Level,Full Time,United States,United States,Large


In [53]:
model_1 = my_pipeline_1
model_2 = my_pipeline_2
model_3 = my_pipeline_3
model_4 = my_pipeline_4
model_5 = my_pipeline_5

In [54]:
my_pipeline_4.predict(a)

array([187371.16541381])

In [55]:
model_1.predict(a)

array([169160.84005872])

In [56]:
import pickle

In [57]:
data = {"model_1":model_1, "model_2":model_2, "model_3":model_3, "model_4":model_4, "model_5":model_5, "columns": my_columns}
with open("data science prediction.pki", "wb") as file:
    pickle.dump(data, file)

In [58]:
with open("data science prediction.pki", "rb") as file:
    data = pickle.load(file)

model_1_prediction= data["model_1"]
model_2_prediction = data["model_2"]
model_3_prediction = data["model_3"]
model_4_predicton = data["model_4"]
model_5_prediction = data["model_5"]

my_column = data["columns"]


In [59]:
model_4_predicton.predict(a)

array([187371.16541381])