In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')



df = pd.read_csv('../data/ds_salaries.csv')

In [2]:
employment_type = 'employment_type'
df[employment_type] = df[employment_type].replace('FT','Full-Time')
df[employment_type] = df[employment_type].replace('CT','Contract')
df[employment_type] = df[employment_type].replace('PT','Part-Time')
df[employment_type] = df[employment_type].replace('FL','Freelance')
df[employment_type].value_counts()

employment_type
Full-Time    3718
Part-Time      17
Contract       10
Freelance      10
Name: count, dtype: int64

In [3]:
experience_level = 'experience_level'
df[experience_level] = df[experience_level].replace('EN','Entry-level/Junior')
df[experience_level] = df[experience_level].replace('MI','Mid-level/Intermediate')
df[experience_level] = df[experience_level].replace('SE','Senior-level/Expert')
df[experience_level] = df[experience_level].replace('EX','Executive-level/Director')
df[experience_level].value_counts()

experience_level
Senior-level/Expert         2516
Mid-level/Intermediate       805
Entry-level/Junior           320
Executive-level/Director     114
Name: count, dtype: int64

In [4]:
print(df.shape)
df.head()

(3755, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior-level/Expert,Full-Time,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,Mid-level/Intermediate,Contract,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,Mid-level/Intermediate,Contract,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,Senior-level/Expert,Full-Time,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,Senior-level/Expert,Full-Time,Data Scientist,120000,USD,120000,CA,100,CA,M


In [5]:
df.drop(df[['salary','salary_currency']], axis = 1, inplace = True)
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior-level/Expert,Full-Time,Principal Data Scientist,85847,ES,100,ES,L
1,2023,Mid-level/Intermediate,Contract,ML Engineer,30000,US,100,US,S
2,2023,Mid-level/Intermediate,Contract,ML Engineer,25500,US,100,US,S
3,2023,Senior-level/Expert,Full-Time,Data Scientist,175000,CA,100,CA,M
4,2023,Senior-level/Expert,Full-Time,Data Scientist,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,Senior-level/Expert,Full-Time,Data Scientist,412000,US,100,US,L
3751,2021,Mid-level/Intermediate,Full-Time,Principal Data Scientist,151000,US,100,US,L
3752,2020,Entry-level/Junior,Full-Time,Data Scientist,105000,US,100,US,S
3753,2020,Entry-level/Junior,Contract,Business Data Analyst,100000,US,100,US,L


In [6]:
print(df['job_title'].value_counts())
# show job titles with less than 10 occurences
df['job_title'].value_counts()[df['job_title'].value_counts() < 40]


job_title
Data Engineer                          1040
Data Scientist                          840
Data Analyst                            612
Machine Learning Engineer               289
Analytics Engineer                      103
                                       ... 
Principal Machine Learning Engineer       1
Azure Data Engineer                       1
Manager Data Management                   1
Marketing Data Engineer                   1
Finance Data Analyst                      1
Name: count, Length: 93, dtype: int64


job_title
Research Engineer                      37
ML Engineer                            34
Data Manager                           29
Machine Learning Scientist             26
Data Science Consultant                24
                                       ..
Principal Machine Learning Engineer     1
Azure Data Engineer                     1
Manager Data Management                 1
Marketing Data Engineer                 1
Finance Data Analyst                    1
Name: count, Length: 84, dtype: int64

### 分類

In [17]:
job_title_mapping = {
    # Data Engineer 分類
    'Big Data Engineer': 'Data Engineer',
    'Data Infrastructure Engineer': 'Data Engineer',
    'Lead Data Engineer': 'Data Engineer',
    'Cloud Database Engineer': 'Data Engineer',
    'ETL Developer': 'Data Engineer',
    'BI Developer': 'Data Engineer',
    'Data Operations Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer',
    'Azure Data Engineer': 'Data Engineer',
    'Data DevOps Engineer': 'Data Engineer',
    'BI Data Engineer': 'Data Engineer',

    # Data Scientist 分類
    'Machine Learning Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist',
    'AI Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist',
    'Principal Data Scientist': 'Data Scientist',
    'Data Scientist Lead': 'Data Scientist',
    'Product Data Scientist': 'Data Scientist',
    'Compliance Data Analyst': 'Data Scientist', # 注意：這個職位可能更接近 Data Analyst
    'Data Science Tech Lead': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist',

    # Data Analyst 分類
    'Data Manager': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst',
    'Business Data Analyst': 'Data Analyst',
    'Data Specialist': 'Data Analyst',
    'BI Data Analyst': 'Data Analyst',
    'Data Quality Analyst': 'Data Analyst',
    'Product Data Analyst': 'Data Analyst',
    'Financial Data Analyst': 'Data Analyst',
    'Marketing Data Analyst': 'Data Analyst',
    'Data Modeler': 'Data Analyst',
    'Principal Data Analyst': 'Data Analyst',
    'Insight Analyst': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    'Manager Data Management': 'Data Analyst',
    'Finance Data Analyst': 'Data Analyst',

    # Machine Learning Engineer 分類
    'Machine Learning Infrastructure Engineer': 'Machine Learning Engineer',
    'Machine Learning Software Engineer': 'Machine Learning Engineer',
    'Machine Learning Developer': 'Machine Learning Engineer',
    'Deep Learning Engineer': 'Machine Learning Engineer',
    'Machine Learning Research Engineer': 'Machine Learning Engineer',
    'Lead Machine Learning Engineer': 'Machine Learning Engineer',
    'Applied Machine Learning Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer',

    # Analytics Engineer 分類
    'Data Analytics Engineer': 'Analytics Engineer',
    'Lead Data Analyst': 'Analytics Engineer', # 注意：這個職位可能更接近 Data Analyst
    'Data Analytics Consultant': 'Analytics Engineer',
    'Data Analytics Lead': 'Analytics Engineer',

    # Data Architect 分類
    'Big Data Architect': 'Data Architect',
    'Principal Data Architect': 'Data Architect',
    'Cloud Data Architect': 'Data Architect',

    # Research Scientist 分類
    'Machine Learning Researcher': 'Research Scientist',
    '3D Computer Vision Researcher': 'Research Scientist',
    'Deep Learning Researcher': 'Research Scientist',

    # Data Science Manager 分類
    'Director of Data Science': 'Data Science Manager',
    'Head of Data': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager',
    'Machine Learning Manager': 'Data Science Manager',
    'Head of Machine Learning': 'Data Science Manager',

    # Applied Scientist 分類
    'Applied Machine Learning Scientist': 'Applied Scientist',
    'Computer Vision Engineer': 'Applied Scientist',
    'AI Developer': 'Applied Scientist',
    'NLP Engineer': 'Applied Scientist',
    'Computer Vision Software Engineer': 'Applied Scientist',
    'Autonomous Vehicle Technician': 'Applied Scientist',

    # Research Engineer 分類
    'MLOps Engineer': 'Research Engineer',
    'Business Intelligence Engineer': 'Research Engineer',
    'Data Operations Analyst': 'Research Engineer',
    'Cloud Data Engineer': 'Research Engineer',
    'AI Programmer': 'Research Engineer',
    'Power BI Developer': 'Research Engineer',
    'Data Management Specialist': 'Research Engineer',
    'Marketing Data Engineer': 'Research Engineer',
    'Data Strategist': 'Research Engineer',


    'BI Analyst': 'Data Analyst',
    'Lead Data Scientist': 'Data Science Manager',
    'Data Science Lead': 'Data Science Manager',
    'Data Science Engineer': 'Data Scientist',
    'Data Lead': 'Data Science Manager', # 或 'Data Engineer' 取決於具體職責
    'Principal Data Engineer': 'Data Engineer',
    'Software Data Engineer': 'Data Engineer',
    'ETL Engineer': 'Data Engineer',
    'Staff Data Analyst': 'Data Analyst'
}


In [18]:
df['job_title'] = df['job_title'].replace(job_title_mapping)

In [19]:
print(df['job_title'].value_counts())

job_title
Data Engineer                1110
Data Scientist                935
Data Analyst                  744
Machine Learning Engineer     333
Analytics Engineer            118
Applied Scientist             113
Data Science Manager          111
Data Architect                105
Research Scientist             93
Research Engineer              59
ML Engineer                    34
Name: count, dtype: int64


In [24]:
# save csv
df.to_csv('../data/ds_salaries_cleaned.csv', index = False)

### 計算平均

In [12]:
#print all job titles with less than 10 occurences
print(df['job_title'].value_counts()[df['job_title'].value_counts() < 37].index.tolist())

['ML Engineer', 'Data Manager', 'Machine Learning Scientist', 'Data Science Consultant', 'Data Analytics Manager', 'Computer Vision Engineer', 'AI Scientist', 'BI Data Analyst', 'Business Data Analyst', 'Data Specialist', 'BI Developer', 'Applied Machine Learning Scientist', 'Machine Learning Infrastructure Engineer', 'Big Data Engineer', 'Director of Data Science', 'AI Developer', 'Applied Data Scientist', 'Head of Data', 'Machine Learning Software Engineer', 'Data Operations Engineer', 'ETL Developer', 'BI Analyst', 'Head of Data Science', 'Lead Data Scientist', 'Data Science Lead', 'Principal Data Scientist', 'Data Quality Analyst', 'NLP Engineer', 'Machine Learning Developer', 'Data Infrastructure Engineer', 'Lead Data Engineer', 'Machine Learning Researcher', 'Deep Learning Engineer', 'Data Analytics Engineer', 'Lead Data Analyst', 'Cloud Database Engineer', 'Computer Vision Software Engineer', 'Product Data Analyst', 'Data Science Engineer', 'MLOps Engineer', '3D Computer Vision 

In [13]:
print(df['job_title'].value_counts()[df['job_title'].value_counts() >= 37].index.tolist())

['Data Engineer', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Analytics Engineer', 'Data Architect', 'Research Scientist', 'Data Science Manager', 'Applied Scientist', 'Research Engineer']


In [11]:
top15_job_titles = df['job_title'].value_counts()[:10]
fig = px.bar(y = top15_job_titles.values, x = top15_job_titles.index, 
            text = top15_job_titles.values, title = 'Top 10 Job Designations')
fig.update_layout(xaxis_title = "Job Designations", yaxis_title = "Count")
fig.show()

In [20]:

# Step 1: Preprocess the data
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

# One-hot encode categorical columns 類別型變數轉換成數值型態
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
onehotencoder = OneHotEncoder()
X_categorical = onehotencoder.fit_transform(X[categorical_columns]).toarray()

# Standardize numerical columns 用來標準化數值型數據
numerical_columns = ['work_year', 'remote_ratio'] 
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X[numerical_columns])

# Concatenate the numerical and categorical features
X_processed = np.concatenate((X_numerical, X_categorical), axis=1)

# Convert target to a suitable format
y = y.values.reshape(-1, 1)
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_scaled, test_size=0.2, random_state=42)

# Convert arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Step 2: Define a PyTorch Dataset
class SalaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = SalaryDataset(X_train_tensor, y_train_tensor)
test_dataset = SalaryDataset(X_test_tensor, y_test_tensor)

In [21]:
# Step 3: Create a neural network model
class SalaryPredictor(nn.Module):
    def __init__(self, input_size):
        super(SalaryPredictor, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
    
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.output_layer(x)
        return x

model = SalaryPredictor(X_train_tensor.shape[1])

# Step 4: Define a loss function and an optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
# Step 5: Train the model
def train_model(train_data, model, criterion, optimizer, epochs):
    for epoch in range(epochs):
        for features, labels in train_data:
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# DataLoader for batch processing
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Training the model
train_model(train_loader, model, criterion, optimizer, epochs=50)


Epoch [1/50], Loss: 0.4989
Epoch [2/50], Loss: 0.5568
Epoch [3/50], Loss: 0.4171
Epoch [4/50], Loss: 0.4709
Epoch [5/50], Loss: 0.6691
Epoch [6/50], Loss: 0.5950
Epoch [7/50], Loss: 0.5135
Epoch [8/50], Loss: 0.2376
Epoch [9/50], Loss: 0.4999
Epoch [10/50], Loss: 0.4970
Epoch [11/50], Loss: 0.5947
Epoch [12/50], Loss: 0.4799
Epoch [13/50], Loss: 0.7005
Epoch [14/50], Loss: 0.5853
Epoch [15/50], Loss: 0.7299
Epoch [16/50], Loss: 0.4694
Epoch [17/50], Loss: 0.7201
Epoch [18/50], Loss: 0.4582
Epoch [19/50], Loss: 0.3875
Epoch [20/50], Loss: 0.2810
Epoch [21/50], Loss: 0.3992
Epoch [22/50], Loss: 0.3156
Epoch [23/50], Loss: 0.5482
Epoch [24/50], Loss: 0.3683
Epoch [25/50], Loss: 0.4996
Epoch [26/50], Loss: 0.5201
Epoch [27/50], Loss: 0.3451
Epoch [28/50], Loss: 0.6824
Epoch [29/50], Loss: 1.0285
Epoch [30/50], Loss: 0.4891
Epoch [31/50], Loss: 0.5525
Epoch [32/50], Loss: 0.3341
Epoch [33/50], Loss: 0.8484
Epoch [34/50], Loss: 0.5704
Epoch [35/50], Loss: 0.3897
Epoch [36/50], Loss: 0.6722
E

In [23]:
# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    predictions = model(X_test_tensor)

# Inverse transform the predicted and original salary_in_usd
predicted_salaries = y_scaler.inverse_transform(predictions.numpy())
original_salaries = y_scaler.inverse_transform(y_test_tensor.numpy())


# Print the original and predicted salaries
relative_error = []
for i in range(len(predicted_salaries)):
    print(f"Original salary_in_usd: {original_salaries[i][0]:.2f} USD, Predicted salary_in_usd: {predicted_salaries[i][0]:.2f} USD")
    relative_error.append((predicted_salaries[i][0] - original_salaries[i][0]) / original_salaries[i][0])
print(f"Average relative error: {np.mean(relative_error):.2f}")

Original salary_in_usd: 168000.00 USD, Predicted salary_in_usd: 172676.70 USD
Original salary_in_usd: 179975.00 USD, Predicted salary_in_usd: 124554.27 USD
Original salary_in_usd: 144000.00 USD, Predicted salary_in_usd: 157604.16 USD
Original salary_in_usd: 222200.00 USD, Predicted salary_in_usd: 213827.14 USD
Original salary_in_usd: 230000.00 USD, Predicted salary_in_usd: 129217.16 USD
Original salary_in_usd: 40000.00 USD, Predicted salary_in_usd: 61261.36 USD
Original salary_in_usd: 105000.00 USD, Predicted salary_in_usd: 174761.61 USD
Original salary_in_usd: 100000.00 USD, Predicted salary_in_usd: 80809.13 USD
Original salary_in_usd: 29751.00 USD, Predicted salary_in_usd: 52685.80 USD
Original salary_in_usd: 153090.00 USD, Predicted salary_in_usd: 176246.92 USD
Original salary_in_usd: 52533.00 USD, Predicted salary_in_usd: 58045.04 USD
Original salary_in_usd: 115000.00 USD, Predicted salary_in_usd: 149964.98 USD
Original salary_in_usd: 128000.00 USD, Predicted salary_in_usd: 163594.