# Loading Data

In [1]:
import numpy as np
import pandas as pd

# Read the employee salary CSV (path is relative to the notebook's working
# directory) into the frame every later cell works on.
data = pd.read_csv(filepath_or_buffer="employee_salary_data.csv")

# Preview the first five records as the cell's rich output.
data.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


# EDA (Exploratory Data Analysis)

In [2]:
# Quick structural overview of the freshly loaded frame. Each summary is
# computed first and then printed, so the four reports read as parallel steps.

# Column labels, as a plain list.
column_names = list(data.columns)
print(f"Column names in dataset are: {column_names} \n")

# (rows, columns) of the frame.
dataset_shape = data.shape
print(f"Shape of dataset is: {dataset_shape} \n")

# dtype pandas assigned to each column.
column_dtypes = data.dtypes
print(f"Description of columns: \n{column_dtypes} \n")

# Number of null entries per column.
missing_per_column = data.isnull().sum()
print(f"Missing Values: \n{missing_per_column} \n")

Column names in dataset are: ['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience', 'Salary'] 

Shape of dataset is: (375, 6) 

Description of columns: 
Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object 

Missing Values: 
Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64 



# Data Preprocessing

### Handling Missing Values

In [3]:
# Discard every row that has at least one missing field. The defaults of
# dropna are spelled out here; `data` is rebound because the following cells
# read the cleaned frame under this name.
data = data.dropna(axis=0, how="any")

# Re-count nulls per column to confirm nothing is left.
remaining_nulls = data.isnull().sum()
print(f"Missing Values: \n{remaining_nulls} \n")

Missing Values: 
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64 



### Handling Categorical Variables

In [4]:
# Collect the non-numeric (categorical) column names once and reuse the list
# for both reports below.
cat_columns_list = list(data.select_dtypes(exclude='number').columns)

# printing categorical columns
print(f"Categorical columns in dataset: {cat_columns_list} \n")

# printing the distinct values found in each categorical column
for cat_column in cat_columns_list:
    distinct_values = data[cat_column].unique()
    print(f"Uniques Values in {cat_column}: {distinct_values} \n")

Categorical columns in dataset: ['Gender', 'Education Level', 'Job Title'] 

Uniques Values in Gender: ['Male' 'Female'] 

Uniques Values in Education Level: ["Bachelor's" "Master's" 'PhD'] 

Uniques Values in Job Title: ['Software Engineer' 'Data Analyst' 'Senior Manager' 'Sales Associate'
 'Director' 'Marketing Analyst' 'Product Manager' 'Sales Manager'
 'Marketing Coordinator' 'Senior Scientist' 'Software Developer'
 'HR Manager' 'Financial Analyst' 'Project Manager' 'Customer Service Rep'
 'Operations Manager' 'Marketing Manager' 'Senior Engineer'
 'Data Entry Clerk' 'Sales Director' 'Business Analyst' 'VP of Operations'
 'IT Support' 'Recruiter' 'Financial Manager' 'Social Media Specialist'
 'Software Manager' 'Junior Developer' 'Senior Consultant'
 'Product Designer' 'CEO' 'Accountant' 'Data Scientist'
 'Marketing Specialist' 'Technical Writer' 'HR Generalist'
 'Project Engineer' 'Customer Success Rep' 'Sales Executive' 'UX Designer'
 'Operations Director' 'Network Engineer' 'Adm

In [5]:
# Replacing text values in the categorical columns with integer codes.
#
# The codes are applied with an explicit per-column mapping followed by
# astype("int64") instead of DataFrame.replace(..., inplace=True):
#   * replace() only yields integer columns because pandas silently downcasts
#     the object column afterwards; that downcasting is deprecated (pandas 2.2
#     emits a FutureWarning), so the resulting dtype is pinned explicitly here.
#   * building a new frame instead of mutating in place keeps the cell safe to
#     re-run: values that are already encoded fall through `codes.get(v, v)`
#     unchanged.
# An unexpected category (a string not listed below) makes astype raise here,
# rather than surfacing later when the models are fitted.
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Education Level': {"Bachelor's": 1, "Master's": 2, "PhD": 3},
}

data = data.assign(**{
    column: data[column]
    .map(lambda value, codes=codes: codes.get(value, value))
    .astype("int64")
    for column, codes in category_codes.items()
})

# printing data type of every column in dataset
print(f"Description of columns: \n{data.dtypes} \n")

Description of columns: 
Age                    float64
Gender                   int64
Education Level          int64
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object 



# Splitting (Target and Features, Train and Test)

In [6]:
# Target: the salary column.
y = data['Salary']

# Features: everything except the target and 'Job Title', which is still a
# text column at this point. `columns=` already names the axis, so no separate
# `axis` argument is needed.
x = data.drop(columns=['Job Title', 'Salary'])

print(f"Shape of features: {x.shape}")
print(f"Shape of target: {y.shape}")

Shape of features: (373, 4)
Shape of target: (373,)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

# Report the size of each piece of the split.
print(f"Shape of x_train: {x_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of x_test: {x_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of x_train: (298, 4)
Shape of y_train: (298,)
Shape of x_test: (75, 4)
Shape of y_test: (75,)


# Model Training and Validation

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor

# Fit three regressors on the same training split and score each one on the
# held-out test split so their R^2 values are directly comparable.

# Single decision tree, seeded so the fitted tree is reproducible.
model_DecisionTreeRegressor = DecisionTreeRegressor(random_state=42)
model_DecisionTreeRegressor.fit(x_train, y_train)
prediction_x_test_DecisionTreeRegressor = model_DecisionTreeRegressor.predict(x_test)

# Ordinary least squares; deterministic, so no seed is involved.
model_LinearRegression = LinearRegression()
model_LinearRegression.fit(x_train, y_train)
prediction_x_test_LinearRegression = model_LinearRegression.predict(x_test)

# Random forest training is stochastic. Without a seed the score printed below
# (and the pickled model) changes on every run, so it is seeded with the same
# value as the decision tree.
model_RandomForestRegressor = RandomForestRegressor(random_state=42)
model_RandomForestRegressor.fit(x_train, y_train)
prediction_x_test_RandomForestRegressor = model_RandomForestRegressor.predict(x_test)

# R^2 on the test set for each model.
print(f"r2_score of LinearRegression on test set is: {r2_score(y_test, prediction_x_test_LinearRegression)}")
print(f"r2_score of RandomForestRegressor on test set is: {r2_score(y_test, prediction_x_test_RandomForestRegressor)}")
print(f"r2_score of DecisionTreeRegressor on test set is: {r2_score(y_test, prediction_x_test_DecisionTreeRegressor)}")

r2_score of LinearRegression on test set is: 0.8674131457833287
r2_score of RandomForestRegressor on test set is: 0.865581208243072
r2_score of DecisionTreeRegressor on test set is: 0.8179913613522798


# Building a Prediction System

In [11]:
# Preview the first training rows to see the feature order the models were fitted on.
x_train.head()

Unnamed: 0,Age,Gender,Education Level,Years of Experience
226,46.0,0,3,18.0
100,31.0,1,1,5.0
272,37.0,0,1,8.0
108,41.0,1,2,14.0
346,35.0,1,1,8.0


In [12]:
# Display the training targets (salaries); the index labels line up with x_train's rows.
y_train

226    160000.0
100     45000.0
272     90000.0
108    100000.0
346     85000.0
         ...   
301     55000.0
22      50000.0
72     190000.0
15     125000.0
168     50000.0
Name: Salary, Length: 298, dtype: float64

In [13]:
# One employee to score, given in the training feature order
# (Age, Gender, Education Level, Years of Experience). These values match the
# first row of x_train shown above, whose actual salary is 160000.0.
#
# The sample is built as a one-row DataFrame that reuses x_train's column
# labels: the models were fitted on a DataFrame, and passing a bare ndarray
# makes scikit-learn warn that the input has no valid feature names and
# removes its ability to check that the columns line up.
input_data = pd.DataFrame([[46.0, 0, 3, 18.0]], columns=x_train.columns)

# Random forest prediction for the sample (scalar salary).
model_RandomForestRegressor.predict(input_data)[0]



156650.23809523805

In [14]:
# Linear regression prediction for the same single-row sample held in input_data.
model_LinearRegression.predict(input_data)[0]



172969.28469949437

In [15]:
# Decision tree prediction for the same single-row sample held in input_data.
model_DecisionTreeRegressor.predict(input_data)[0]



156666.66666666666

# Exporting Models

In [16]:
import os
import pickle

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs("models", exist_ok=True)

# Serialize the fitted decision tree. The `with` block closes the file handle
# (flushing the pickle to disk) even if dumping raises; the bare open(...) used
# before left the handle open for the lifetime of the kernel.
with open("models/model_DecisionTreeRegressor_predictor.pkl", "wb") as model_file:
    pickle.dump(model_DecisionTreeRegressor, model_file)

In [18]:
# Serialize the fitted linear regression model. The `with` block guarantees the
# file handle is closed (and the pickle flushed to disk) even if dumping raises.
with open("models/model_LinearRegression_predictor.pkl", "wb") as model_file:
    pickle.dump(model_LinearRegression, model_file)

In [19]:
# Serialize the fitted random forest model. The `with` block guarantees the
# file handle is closed (and the pickle flushed to disk) even if dumping raises.
with open("models/model_RandomForestRegressor_predictor.pkl", "wb") as model_file:
    pickle.dump(model_RandomForestRegressor, model_file)
