In [8]:
# import the packages for data manipulation 
import pandas as pd
import numpy as np

# import the packages for machine learning 
from sklearn import linear_model 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score

# import the packages for data management
import joblib


In [9]:
# read the model 
salary_data = pd.read_csv('ds_salaries.csv')


In [10]:
salary_data.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [11]:
# use ordinal encoder to encode experience level
encoder = OrdinalEncoder(categories=[['EN', 'MI', 'SE', 'EX']])
salary_data['experience_level_encoded'] = encoder.fit_transform(salary_data[['experience_level']])

# use ordinal encode to encode company size
encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])
salary_data['company_size_encoded'] = encoder.fit_transform(salary_data[['company_size']])

# encode employment type and job title using dummy columns
salary_data = pd.get_dummies(salary_data, columns =['employment_type', 'job_title'], drop_first = True, dtype =int)

# droping origin columns
salary_data = salary_data.drop(columns=['experience_level', 'company_size'])

In [12]:
# Define independent (X) and dependent (y) features
X = salary_data.drop(columns=['salary_in_usd'])
y = salary_data['salary_in_usd']

X = pd.get_dummies(X, drop_first=True)

# Ensure no missing values
X = X.fillna(0)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=104, test_size=0.2, shuffle=True)

# Fit linear regression model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# Make predictions
y_pred = regr.predict(X_test)

# Print the coefficients
print('Coefficients: \n', regr.coef_)

# Print the metrics
print('Mean Squared Error (MSE): %.2f' % mean_squared_error(y_test, y_pred))
print('Mean Absolute Error (MAE): %.2f' % np.mean(np.abs(y_test - y_pred)))
print('R2: %.2f' % r2_score(y_test, y_pred))
print('Adjusted R2: %.2f' % (1 - (1 - r2_score(y_test, y_pred)) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)))

Coefficients: 
 [ 2.70791856e+01 -1.03158478e+04  9.29825037e-03  2.53550316e+01
  2.45284923e+04  7.28791488e+03 -3.68436775e+04 -4.11975666e+04
 -7.67877449e+04 -1.09400305e+04 -4.09012289e+04  4.69671465e+04
 -4.36795802e+04 -6.81419265e+04 -3.19273952e+04 -4.84826754e+04
 -5.39712024e+04  3.94967315e+04  5.56693190e+04 -5.66074367e+04
 -6.82440866e+04 -6.13431427e+04 -1.44063961e-08 -6.30930771e+04
 -2.80225215e+03 -4.12546446e+04 -4.06010953e+04 -3.03938594e+04
 -3.59385468e+04 -1.67420459e+04 -3.63644967e+04 -3.42961669e+04
 -1.43591111e+03 -1.67254808e+02 -4.20421445e+04 -9.48546135e+04
  2.74423709e+05  3.68224952e+04 -3.68417285e+04 -6.83727422e+04
 -3.36863782e+04  2.75195484e+04 -2.20490620e+04 -4.19976554e+04
  1.19471722e+04  7.68883902e+03 -2.65168487e+04  9.03460805e+03
 -2.63078059e+04  3.22494996e+03 -7.02783855e+03 -5.82076609e-10
 -6.98947904e+04  1.89316253e+05  1.38896655e+04 -5.27471707e+04
  8.75869845e+03 -1.30843268e+05  1.03208545e+05  6.20251099e+04
  6.49254

In [13]:
# Save the model
joblib.dump(regr, 'lin_regression.sav')

['lin_regression.sav']