In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')



df = pd.read_csv('../data/ds_salaries.csv')

In [39]:
employment_type = 'employment_type'
df[employment_type] = df[employment_type].replace('FT','Full-Time')
df[employment_type] = df[employment_type].replace('CT','Contract')
df[employment_type] = df[employment_type].replace('PT','Part-Time')
df[employment_type] = df[employment_type].replace('FL','Freelance')
df[employment_type].value_counts()

employment_type
Full-Time    3718
Part-Time      17
Contract       10
Freelance      10
Name: count, dtype: int64

In [24]:
experience_level = 'experience_level'
df[experience_level] = df[experience_level].replace('EN','Entry-level/Junior')
df[experience_level] = df[experience_level].replace('MI','Mid-level/Intermediate')
df[experience_level] = df[experience_level].replace('SE','Senior-level/Expert')
df[experience_level] = df[experience_level].replace('EX','Executive-level/Director')
df[experience_level].value_counts()

experience_level
Senior-level/Expert         2516
Mid-level/Intermediate       805
Entry-level/Junior           320
Executive-level/Director     114
Name: count, dtype: int64

In [40]:
print(df.shape)
df.head()

(3755, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,Full-Time,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,Contract,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,Contract,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,Full-Time,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,Full-Time,Data Scientist,120000,USD,120000,CA,100,CA,M


In [41]:
df.drop(df[['salary','salary_currency']], axis = 1, inplace = True)
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,Full-Time,Principal Data Scientist,85847,ES,100,ES,L
1,2023,MI,Contract,ML Engineer,30000,US,100,US,S
2,2023,MI,Contract,ML Engineer,25500,US,100,US,S
3,2023,SE,Full-Time,Data Scientist,175000,CA,100,CA,M
4,2023,SE,Full-Time,Data Scientist,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,SE,Full-Time,Data Scientist,412000,US,100,US,L
3751,2021,MI,Full-Time,Principal Data Scientist,151000,US,100,US,L
3752,2020,EN,Full-Time,Data Scientist,105000,US,100,US,S
3753,2020,EN,Contract,Business Data Analyst,100000,US,100,US,L


#  We have a lot of job titles (84) that appear less frequently in our dataset.
- Maybe we could group less frequent job titles into broader categories. For example, job titles with similar roles or in the same domain could be combined. 

In [42]:
print(df['job_title'].value_counts().head(10))
print("Mean of job titles:",df['job_title'].value_counts().mean())
# show job titles with less than 40 occurences
print("Job titles with less than 40:",len(df['job_title'].value_counts()[df['job_title'].value_counts() < 40]))
#print all job titles with more than 40 occurences
print("Job titles with more than 40:",len(df['job_title'].value_counts()[df['job_title'].value_counts() >= 40]))


job_title
Data Engineer                1040
Data Scientist                840
Data Analyst                  612
Machine Learning Engineer     289
Analytics Engineer            103
Data Architect                101
Research Scientist             82
Data Science Manager           58
Applied Scientist              58
Research Engineer              37
Name: count, dtype: int64
Mean of job titles: 40.376344086021504
Job titles with less than 40: 84
Job titles with more than 40: 9


### 分類

In [7]:
job_title_mapping = {
    # Data Engineer 分類
    'Big Data Engineer': 'Data Engineer',
    'Data Infrastructure Engineer': 'Data Engineer',
    'Lead Data Engineer': 'Data Engineer',
    'Cloud Database Engineer': 'Data Engineer',
    'ETL Developer': 'Data Engineer',
    'BI Developer': 'Data Engineer',
    'Data Operations Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer',
    'Azure Data Engineer': 'Data Engineer',
    'Data DevOps Engineer': 'Data Engineer',
    'BI Data Engineer': 'Data Engineer',

    # Data Scientist 分類
    'Machine Learning Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist',
    'AI Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist',
    'Principal Data Scientist': 'Data Scientist',
    'Data Scientist Lead': 'Data Scientist',
    'Product Data Scientist': 'Data Scientist',
    'Compliance Data Analyst': 'Data Scientist', # 注意：這個職位可能更接近 Data Analyst
    'Data Science Tech Lead': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist',

    # Data Analyst 分類
    'Data Manager': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst',
    'Business Data Analyst': 'Data Analyst',
    'Data Specialist': 'Data Analyst',
    'BI Data Analyst': 'Data Analyst',
    'Data Quality Analyst': 'Data Analyst',
    'Product Data Analyst': 'Data Analyst',
    'Financial Data Analyst': 'Data Analyst',
    'Marketing Data Analyst': 'Data Analyst',
    'Data Modeler': 'Data Analyst',
    'Principal Data Analyst': 'Data Analyst',
    'Insight Analyst': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    'Manager Data Management': 'Data Analyst',
    'Finance Data Analyst': 'Data Analyst',

    # Machine Learning Engineer 分類
    'Machine Learning Infrastructure Engineer': 'Machine Learning Engineer',
    'Machine Learning Software Engineer': 'Machine Learning Engineer',
    'Machine Learning Developer': 'Machine Learning Engineer',
    'Deep Learning Engineer': 'Machine Learning Engineer',
    'Machine Learning Research Engineer': 'Machine Learning Engineer',
    'Lead Machine Learning Engineer': 'Machine Learning Engineer',
    'Applied Machine Learning Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer',

    # Analytics Engineer 分類
    'Data Analytics Engineer': 'Analytics Engineer',
    'Lead Data Analyst': 'Analytics Engineer', # 注意：這個職位可能更接近 Data Analyst
    'Data Analytics Consultant': 'Analytics Engineer',
    'Data Analytics Lead': 'Analytics Engineer',

    # Data Architect 分類
    'Big Data Architect': 'Data Architect',
    'Principal Data Architect': 'Data Architect',
    'Cloud Data Architect': 'Data Architect',

    # Research Scientist 分類
    'Machine Learning Researcher': 'Research Scientist',
    '3D Computer Vision Researcher': 'Research Scientist',
    'Deep Learning Researcher': 'Research Scientist',

    # Data Science Manager 分類
    'Director of Data Science': 'Data Science Manager',
    'Head of Data': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager',
    'Machine Learning Manager': 'Data Science Manager',
    'Head of Machine Learning': 'Data Science Manager',

    # Applied Scientist 分類
    'Applied Machine Learning Scientist': 'Applied Scientist',
    'Computer Vision Engineer': 'Applied Scientist',
    'AI Developer': 'Applied Scientist',
    'NLP Engineer': 'Applied Scientist',
    'Computer Vision Software Engineer': 'Applied Scientist',
    'Autonomous Vehicle Technician': 'Applied Scientist',

    # Research Engineer 分類
    'MLOps Engineer': 'Research Engineer',
    'Business Intelligence Engineer': 'Research Engineer',
    'Data Operations Analyst': 'Research Engineer',
    'Cloud Data Engineer': 'Research Engineer',
    'AI Programmer': 'Research Engineer',
    'Power BI Developer': 'Research Engineer',
    'Data Management Specialist': 'Research Engineer',
    'Marketing Data Engineer': 'Research Engineer',
    'Data Strategist': 'Research Engineer',


    'BI Analyst': 'Data Analyst',
    'Lead Data Scientist': 'Data Science Manager',
    'Data Science Lead': 'Data Science Manager',
    'Data Science Engineer': 'Data Scientist',
    'Data Lead': 'Data Science Manager', # 或 'Data Engineer' 取決於具體職責
    'Principal Data Engineer': 'Data Engineer',
    'Software Data Engineer': 'Data Engineer',
    'ETL Engineer': 'Data Engineer',
    'Staff Data Analyst': 'Data Analyst'
}


In [8]:
df['job_title'] = df['job_title'].replace(job_title_mapping)

In [7]:
print(df['job_title'].value_counts())

job_title
Data Engineer                          1040
Data Scientist                          840
Data Analyst                            612
Machine Learning Engineer               289
Analytics Engineer                      103
                                       ... 
Principal Machine Learning Engineer       1
Azure Data Engineer                       1
Manager Data Management                   1
Marketing Data Engineer                   1
Finance Data Analyst                      1
Name: count, Length: 93, dtype: int64


In [10]:
# save csv
df.to_csv('../data/ds_salaries_cleaned.csv', index = False)

In [48]:
top15_job_titles = df['job_title'].value_counts()[:10]
fig = px.bar(y = top15_job_titles.values, x = top15_job_titles.index, 
            text = top15_job_titles.values, title = 'Top 10 Job Designations')
fig.update_layout(xaxis_title = "Job Designations", yaxis_title = "Count")
fig.show()

# I prepared the features (X) and the target variable (y) for a machine learning model,

In [43]:
# Step 1: Preprocess the data
X = df.drop(['salary_in_usd'], axis=1)
y = df['salary_in_usd']

In [44]:
# print X first 5 rows
X.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,Full-Time,Principal Data Scientist,ES,100,ES,L
1,2023,MI,Contract,ML Engineer,US,100,US,S
2,2023,MI,Contract,ML Engineer,US,100,US,S
3,2023,SE,Full-Time,Data Scientist,CA,100,CA,M
4,2023,SE,Full-Time,Data Scientist,CA,100,CA,M


In [45]:
#print y first 5 rows
y.head()

0     85847
1     30000
2     25500
3    175000
4    120000
Name: salary_in_usd, dtype: int64

# This block of code includes several steps in data preprocessing for a machine learning model. It one-hot encodes and standardizes

In [46]:


# One-hot encode categorical columns 類別型變數轉換成數值型態
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
onehotencoder = OneHotEncoder()
X_categorical = onehotencoder.fit_transform(X[categorical_columns]).toarray()

# Standardize numerical columns 用來標準化數值型數據
numerical_columns = ['work_year', 'remote_ratio'] 
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X[numerical_columns])

# Concatenate the numerical and categorical features
X_processed = np.concatenate((X_numerical, X_categorical), axis=1)

# Convert target to a suitable format
y = y.values.reshape(-1, 1)
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_scaled, test_size=0.2, random_state=42)


#  the process of converting the training and testing sets for both features and target variables into PyTorch tensors, 

In [47]:

# Convert arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Step 2: Define a PyTorch Dataset
class SalaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = SalaryDataset(X_train_tensor, y_train_tensor)
test_dataset = SalaryDataset(X_test_tensor, y_test_tensor)

# we define the SalaryPredictor class, a neural network model for salary prediction, using PyTorch's nn.Module. The model includes two hidden layers with ReLU activations and an output layer. 

In [48]:
# Step 3: Create a neural network model
class SalaryPredictor(nn.Module):
    def __init__(self, input_size):
        super(SalaryPredictor, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
    
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.output_layer(x)
        return x

model = SalaryPredictor(X_train_tensor.shape[1])

# Step 4: Define a loss function and an optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [49]:
# Step 5: Train the model
def train_model(train_data, model, criterion, optimizer, epochs):
    for epoch in range(epochs):
        for features, labels in train_data:
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# DataLoader for batch processing
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Training the model
train_model(train_loader, model, criterion, optimizer, epochs=50)


Epoch [1/50], Loss: 0.5634
Epoch [2/50], Loss: 0.7208
Epoch [3/50], Loss: 0.7309
Epoch [4/50], Loss: 0.6130
Epoch [5/50], Loss: 0.2944
Epoch [6/50], Loss: 0.4063
Epoch [7/50], Loss: 0.4794
Epoch [8/50], Loss: 0.4944
Epoch [9/50], Loss: 0.5899
Epoch [10/50], Loss: 0.3752
Epoch [11/50], Loss: 1.4803
Epoch [12/50], Loss: 0.4032
Epoch [13/50], Loss: 0.8883
Epoch [14/50], Loss: 0.5899
Epoch [15/50], Loss: 0.4567
Epoch [16/50], Loss: 0.6912
Epoch [17/50], Loss: 0.3979
Epoch [18/50], Loss: 0.4326
Epoch [19/50], Loss: 0.3371
Epoch [20/50], Loss: 0.6940
Epoch [21/50], Loss: 0.3203
Epoch [22/50], Loss: 0.3204
Epoch [23/50], Loss: 0.7406
Epoch [24/50], Loss: 0.3173
Epoch [25/50], Loss: 0.3503
Epoch [26/50], Loss: 0.3600
Epoch [27/50], Loss: 0.5237
Epoch [28/50], Loss: 0.5391
Epoch [29/50], Loss: 0.3778
Epoch [30/50], Loss: 0.5364
Epoch [31/50], Loss: 0.5373
Epoch [32/50], Loss: 0.4850
Epoch [33/50], Loss: 0.2247
Epoch [34/50], Loss: 0.4113
Epoch [35/50], Loss: 0.4690
Epoch [36/50], Loss: 0.3204
E

after fininsh training

# An average relative error of 0.11 in our salary prediction model
- This error means that, on average, the predicted salaries are off by 11% from the actual salaries. 
- It seems the prediction was not correct,The gap is too big
- Maybe we can removing some column that can not improve model performance. 

In [50]:
# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    predictions = model(X_test_tensor)

# Inverse transform the predicted and original salary_in_usd
predicted_salaries = y_scaler.inverse_transform(predictions.numpy())
original_salaries = y_scaler.inverse_transform(y_test_tensor.numpy())


# Print the original and predicted salaries
relative_error = []
for i in range(len(predicted_salaries)):
    print(f"Original salary_in_usd: {original_salaries[i][0]:.2f} USD, Predicted salary_in_usd: {predicted_salaries[i][0]:.2f} USD")
    relative_error.append((predicted_salaries[i][0] - original_salaries[i][0]) / original_salaries[i][0])
print(f"Average relative error: {np.mean(relative_error):.2f}")

Original salary_in_usd: 168000.00 USD, Predicted salary_in_usd: 223925.08 USD
Original salary_in_usd: 179975.00 USD, Predicted salary_in_usd: 124826.28 USD
Original salary_in_usd: 144000.00 USD, Predicted salary_in_usd: 158123.00 USD
Original salary_in_usd: 222200.00 USD, Predicted salary_in_usd: 198663.83 USD
Original salary_in_usd: 230000.00 USD, Predicted salary_in_usd: 81259.20 USD
Original salary_in_usd: 40000.00 USD, Predicted salary_in_usd: 49039.64 USD
Original salary_in_usd: 105000.00 USD, Predicted salary_in_usd: 168471.19 USD
Original salary_in_usd: 100000.00 USD, Predicted salary_in_usd: 76063.72 USD
Original salary_in_usd: 29751.00 USD, Predicted salary_in_usd: 6235.91 USD
Original salary_in_usd: 153090.00 USD, Predicted salary_in_usd: 168938.28 USD
Original salary_in_usd: 52533.00 USD, Predicted salary_in_usd: 67064.21 USD
Original salary_in_usd: 115000.00 USD, Predicted salary_in_usd: 157387.39 USD
Original salary_in_usd: 128000.00 USD, Predicted salary_in_usd: 157279.59