## Import libraries and load data

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression
import joblib
# Load the salary dataset from 'Salary_Data.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
dataset = pd.read_csv('Salary_Data.csv')
# Show the loaded frame as the cell output.
dataset

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


## Check null values in each column

In [45]:
# Count the missing values per column before any imputation.
dataset.isnull().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

## Fill the null values

In [46]:
# Handling missing values.
#
# All replacement values are computed first and applied in one non-in-place
# call. The earlier `dataset[col].fillna(..., inplace=True)` form mutates a
# column selected from the frame: pandas 2.x warns about it and, with
# Copy-on-Write (the default from pandas 3.0), it no longer updates `dataset`.
# Re-assigning `dataset` also keeps this cell idempotent on re-run.
#
# NOTE(review): the means/mode are computed on the full dataset, i.e. before
# the train/test split further down, so test rows influence the fill values.
fill_values = {
    # Numeric columns: replace gaps with the column mean.
    'Age': dataset['Age'].mean(),
    'Years of Experience': dataset['Years of Experience'].mean(),
    # Gender: replace gaps with the most frequent value.
    'Gender': dataset['Gender'].mode()[0],
    # Free-text categoricals: use an explicit placeholder category.
    'Education Level': 'Unknown',
    'Job Title': 'Unknown',
    # NOTE(review): 'Salary' is the prediction target; rows with a missing
    # target are mean-imputed here (behaviour kept as-is) rather than dropped.
    'Salary': dataset['Salary'].mean(),
}
dataset = dataset.fillna(value=fill_values)

## Check null values in each column again

In [47]:
# Re-count the missing values per column to check the result of the fill step.
dataset.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

## Dataset shape

In [48]:
# (number of rows, number of columns) of the dataset.
dataset.shape

(6704, 6)

## Apply label encoding to the categorical columns

In [49]:
# Label-encode the categorical columns and record each label -> code mapping.
#
# NOTE(review): this cell overwrites the string columns of `dataset` with
# integer codes, so running it a second time without reloading the data would
# re-encode the codes and replace the label mappings with int -> int ones.
# NOTE(review): spelling variants in 'Education Level' ("Bachelor's" vs
# "Bachelor's Degree", "PhD" vs "phD") are encoded as distinct categories.

# List of categorical columns to be transformed
categorical_cols = ["Gender", "Education Level", "Job Title"]

# One fitted encoder per column. A single shared LabelEncoder is re-fitted on
# every iteration, which throws away the encoders of the earlier columns and
# makes it impossible to encode new data or invert the codes for them later.
label_encoders = {}
label_mappings = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    dataset[col] = encoder.fit_transform(dataset[col])
    label_encoders[col] = encoder
    # `classes_` is sorted and a label's code is its position in `classes_`,
    # so enumerate() yields the mapping without a second transform() call and
    # with plain Python ints as values instead of NumPy scalars.
    label_mappings[col] = {
        label: code for code, label in enumerate(encoder.classes_)
    }

# Dictionaries to store mappings (names used by the rest of the notebook)
gender_mapping = label_mappings["Gender"]
education_mapping = label_mappings["Education Level"]
job_title_mapping = label_mappings["Job Title"]

# Kept for backward compatibility: `lb_make` was the shared encoder, which
# ended up fitted on the last categorical column.
lb_make = label_encoders[categorical_cols[-1]]

dataset

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,1,0,177,5.0,90000.0
1,28.0,0,3,18,3.0,65000.0
2,45.0,1,5,145,15.0,150000.0
3,36.0,0,0,116,7.0,60000.0
4,52.0,1,3,26,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,0,5,34,20.0,200000.0
6700,32.0,1,2,116,3.0,50000.0
6701,30.0,0,1,42,4.0,55000.0
6702,46.0,1,4,97,14.0,140000.0


In [28]:
# Show the label -> integer code mapping recorded for the 'Gender' column.
gender_mapping

{'Female': 0, 'Male': 1, 'Other': 2}

In [29]:
# Show the label -> integer code mapping recorded for the 'Education Level' column.
education_mapping

{"Bachelor's": 0,
 "Bachelor's Degree": 1,
 'High School': 2,
 "Master's": 3,
 "Master's Degree": 4,
 'PhD': 5,
 'Unknown': 6,
 'phD': 7}

In [30]:
# Show the label -> integer code mapping recorded for the 'Job Title' column.
job_title_mapping

{'Account Manager': 0,
 'Accountant': 1,
 'Administrative Assistant': 2,
 'Back end Developer': 3,
 'Business Analyst': 4,
 'Business Development Manager': 5,
 'Business Intelligence Analyst': 6,
 'CEO': 7,
 'Chief Data Officer': 8,
 'Chief Technology Officer': 9,
 'Content Marketing Manager': 10,
 'Copywriter': 11,
 'Creative Director': 12,
 'Customer Service Manager': 13,
 'Customer Service Rep': 14,
 'Customer Service Representative': 15,
 'Customer Success Manager': 16,
 'Customer Success Rep': 17,
 'Data Analyst': 18,
 'Data Entry Clerk': 19,
 'Data Scientist': 20,
 'Delivery Driver': 21,
 'Developer': 22,
 'Digital Content Producer': 23,
 'Digital Marketing Manager': 24,
 'Digital Marketing Specialist': 25,
 'Director': 26,
 'Director of Business Development': 27,
 'Director of Data Science': 28,
 'Director of Engineering': 29,
 'Director of Finance': 30,
 'Director of HR': 31,
 'Director of Human Capital': 32,
 'Director of Human Resources': 33,
 'Director of Marketing': 34,
 'D

In [31]:
# Numerical feature columns that will be standardised.
numeric_features = ['Age', 'Years of Experience']
# Unfitted StandardScaler; nothing is scaled in this cell. The scaler is handed
# to the ColumnTransformer and fitted when that transformer is fitted.
numeric_transformer = StandardScaler()

In [32]:
# Column-wise preprocessing: standardise the numeric features and let every
# other column through untouched.
# Passthrough columns are appended to the right of the transformed ones, so
# the output column order is the numeric features first, then the rest.
scaler_step = ('scaler', numeric_transformer, numeric_features)
ct = ColumnTransformer([scaler_step], remainder='passthrough')

## Define features (X) and target variable (y)

In [33]:
# Feature matrix: the five predictor columns, selected explicitly by label.
X = dataset.loc[:, ['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
# Target vector: the salary the model has to predict.
y = dataset.loc[:, 'Salary']

In [34]:
# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [35]:
# Fit the ColumnTransformer on the training split only and transform it in the
# same step.
# NOTE(review): `X_train` is overwritten with the transformed output, so
# re-running this cell feeds that output back into `ct` instead of the
# original split; re-run the train/test split cell first.
X_train = ct.fit_transform(X_train)

In [36]:
# Apply the already-fitted transformer to the test split (transform only, no
# re-fit), so the test rows do not influence the fitted statistics.
# NOTE(review): `X_test` is overwritten as well; re-run the split cell before
# re-running this one.
X_test = ct.transform(X_test)

In [37]:
# Initialize the model: a random forest regressor with default hyperparameters
# and a fixed seed so that repeated runs build the same forest.
model = RandomForestRegressor(random_state=0)

In [38]:
# Train the model on the transformed training features and the salary target.
model.fit(X_train, y_train)

In [39]:
# Make predictions on the transformed test set; `y_pred` is compared against
# `y_test` in the evaluation cells.
y_pred = model.predict(X_test)

In [40]:
# Evaluate the model
# Mean squared error on the held-out test split; the value is in squared
# units of Salary, which is why it is so large.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 64943748.932206094


In [41]:
# Mean absolute error on the test split, in the same units as Salary.
mae = mean_absolute_error(y_test, y_pred)
# Coefficient of determination (R^2) on the test split.
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Absolute Error: 3120.8227296767195
R-squared: 0.9766046139072069


In [51]:
# Persist the trained model together with the fitted preprocessing it needs.
#
# `model` was trained on the output of `ct` (standardised numeric columns) and
# on label-encoded categoricals, so the forest alone cannot score raw inputs:
# a consumer also needs the fitted ColumnTransformer and the label -> code
# mappings. They are written to a separate file next to the model.
#
# NOTE(review): the target directory is an absolute, machine-specific path;
# consider a path relative to the notebook or a configurable constant.
MODEL_DIR = '/Users/ruhuanliao/Fall 2023/AI/PredictProject/Archive'

preprocessing_artifacts = {
    'column_transformer': ct,
    'gender_mapping': gender_mapping,
    'education_mapping': education_mapping,
    'job_title_mapping': job_title_mapping,
}
joblib.dump(preprocessing_artifacts, f'{MODEL_DIR}/SalaryPreprocessing.pkl')

# Unchanged: same object, same file, and still the value displayed by the cell.
joblib.dump(model, f'{MODEL_DIR}/SalaryPrediction.pkl')

['/Users/ruhuanliao/Fall 2023/AI/PredictProject/Archive/SalaryPrediction.pkl']