In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np

df = pd.read_csv('../data/ds_salaries.csv')

In [33]:
employment_type = 'employment_type'
df[employment_type] = df[employment_type].replace('FT','Full-Time')
df[employment_type] = df[employment_type].replace('CT','Contract')
df[employment_type] = df[employment_type].replace('PT','Part-Time')
df[employment_type] = df[employment_type].replace('FL','Freelance')
df[employment_type].value_counts()

employment_type
Full-Time    3718
Part-Time      17
Contract       10
Freelance      10
Name: count, dtype: int64

In [34]:
experience_level = 'experience_level'
df[experience_level] = df[experience_level].replace('EN','Entry-level/Junior')
df[experience_level] = df[experience_level].replace('MI','Mid-level/Intermediate')
df[experience_level] = df[experience_level].replace('SE','Senior-level/Expert')
df[experience_level] = df[experience_level].replace('EX','Executive-level/Director')
df[experience_level].value_counts()

experience_level
Senior-level/Expert         2516
Mid-level/Intermediate       805
Entry-level/Junior           320
Executive-level/Director     114
Name: count, dtype: int64

In [35]:
print(df.shape)
df.head()

(3755, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior-level/Expert,Full-Time,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,Mid-level/Intermediate,Contract,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,Mid-level/Intermediate,Contract,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,Senior-level/Expert,Full-Time,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,Senior-level/Expert,Full-Time,Data Scientist,120000,USD,120000,CA,100,CA,M


In [36]:
df.drop(df[['salary','salary_currency']], axis = 1, inplace = True)
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior-level/Expert,Full-Time,Principal Data Scientist,85847,ES,100,ES,L
1,2023,Mid-level/Intermediate,Contract,ML Engineer,30000,US,100,US,S
2,2023,Mid-level/Intermediate,Contract,ML Engineer,25500,US,100,US,S
3,2023,Senior-level/Expert,Full-Time,Data Scientist,175000,CA,100,CA,M
4,2023,Senior-level/Expert,Full-Time,Data Scientist,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,Senior-level/Expert,Full-Time,Data Scientist,412000,US,100,US,L
3751,2021,Mid-level/Intermediate,Full-Time,Principal Data Scientist,151000,US,100,US,L
3752,2020,Entry-level/Junior,Full-Time,Data Scientist,105000,US,100,US,S
3753,2020,Entry-level/Junior,Contract,Business Data Analyst,100000,US,100,US,L


In [37]:

# Step 1: Preprocess the data
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

# One-hot encode categorical columns
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
onehotencoder = OneHotEncoder()
X_categorical = onehotencoder.fit_transform(X[categorical_columns]).toarray()

# Standardize numerical columns
numerical_columns = ['work_year', 'remote_ratio'] 
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X[numerical_columns])

# Concatenate the numerical and categorical features
X_processed = np.concatenate((X_numerical, X_categorical), axis=1)

# Convert target to a suitable format
y = y.values.reshape(-1, 1)
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_scaled, test_size=0.2, random_state=42)

# Convert arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Step 2: Define a PyTorch Dataset
class SalaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = SalaryDataset(X_train_tensor, y_train_tensor)
test_dataset = SalaryDataset(X_test_tensor, y_test_tensor)

In [38]:
# Step 3: Create a neural network model
class SalaryPredictor(nn.Module):
    def __init__(self, input_size):
        super(SalaryPredictor, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
    
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.output_layer(x)
        return x

model = SalaryPredictor(X_train_tensor.shape[1])

# Step 4: Define a loss function and an optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [45]:
# Step 5: Train the model
def train_model(train_data, model, criterion, optimizer, epochs):
    for epoch in range(epochs):
        for features, labels in train_data:
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# DataLoader for batch processing
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Training the model
train_model(train_loader, model, criterion, optimizer, epochs=10)


Epoch [1/10], Loss: 0.6461
Epoch [2/10], Loss: 0.4609
Epoch [3/10], Loss: 0.2265
Epoch [4/10], Loss: 0.4111
Epoch [5/10], Loss: 0.8677
Epoch [6/10], Loss: 0.2325
Epoch [7/10], Loss: 0.6289
Epoch [8/10], Loss: 0.5756
Epoch [9/10], Loss: 0.3795
Epoch [10/10], Loss: 0.2738


In [46]:
# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    predictions = model(X_test_tensor)

# Inverse transform the predicted and original salary_in_usd
predicted_salaries = y_scaler.inverse_transform(predictions.numpy())
original_salaries = y_scaler.inverse_transform(y_test_tensor.numpy())


# Print the original and predicted salaries
relative_error = []
for i in range(len(predicted_salaries)):
    print(f"Original salary_in_usd: {original_salaries[i][0]:.2f} USD, Predicted salary_in_usd: {predicted_salaries[i][0]:.2f} USD")
    relative_error.append((predicted_salaries[i][0] - original_salaries[i][0]) / original_salaries[i][0])
print(f"Average relative error: {np.mean(relative_error):.2f}")

Original salary_in_usd: 168000.00 USD, Predicted salary_in_usd: 214730.17 USD
Original salary_in_usd: 179975.00 USD, Predicted salary_in_usd: 127057.99 USD
Original salary_in_usd: 144000.00 USD, Predicted salary_in_usd: 156137.16 USD
Original salary_in_usd: 222200.00 USD, Predicted salary_in_usd: 195168.80 USD
Original salary_in_usd: 230000.00 USD, Predicted salary_in_usd: 128584.20 USD
Original salary_in_usd: 40000.00 USD, Predicted salary_in_usd: 61115.88 USD
Original salary_in_usd: 105000.00 USD, Predicted salary_in_usd: 163077.17 USD
Original salary_in_usd: 100000.00 USD, Predicted salary_in_usd: 79343.30 USD
Original salary_in_usd: 29751.00 USD, Predicted salary_in_usd: 12091.81 USD
Original salary_in_usd: 153090.00 USD, Predicted salary_in_usd: 172035.61 USD
Original salary_in_usd: 52533.00 USD, Predicted salary_in_usd: 63617.73 USD
Original salary_in_usd: 115000.00 USD, Predicted salary_in_usd: 154384.05 USD
Original salary_in_usd: 128000.00 USD, Predicted salary_in_usd: 152403.