# EmployeePayHistory

## Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from torch.utils.data import TensorDataset, DataLoader
from common_functions import drop_sk_datetime_added_columns as drop_columns, train_model, get_engine, \
    read_data_return_df, plot_predictions, plot_feature_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import torch

## Connecting to the database
Connect to the UnitedOutdoors SQL Server database. The cells below read the EmployeePayHistory, BusinessEntity and Employee tables.

In [None]:
# Build the database engine with the project helper; the same engine is passed to every table read in this notebook.
engine = get_engine()

## Reading in the data

In [None]:
# Read every column of the EmployeePayHistory table through the shared engine
sql_query_employeepayhistory = "SELECT * FROM EmployeePayHistory"
employeepayhistory_df = read_data_return_df(sql_query_employeepayhistory, engine)

# Show the first rows as the cell output
employeepayhistory_df.head()

In [None]:
# Read every column of the BusinessEntity table through the shared engine
sql_query_businessentity = "SELECT * FROM BusinessEntity"
businessentity_df = read_data_return_df(sql_query_businessentity, engine)

# Show the first rows as the cell output
businessentity_df.head()

In [None]:
# Only the employee attributes used later are selected.
# Adjacent string literals are concatenated by Python, so the query text is one single line.
sql_query_employee = (
    "SELECT EMPLOYEE_EMPLOYEE_EmployeeID, "
    "EMPLOYEE_EMPLOYEE_ManagerID, "
    "EMPLOYEE_EMPLOYEE_DeptID, "
    "EMPLOYEE_EMPLOYEE_State, "
    "EMPLOYEE_EMPLOYEE_Start_Date, "
    "EMPLOYEE_EMPLOYEE_OrganizationLevel, "
    "EMPLOYEE_EMPLOYEE_BirthDate, "
    "EMPLOYEE_EMPLOYEE_SickLeaveHours "
    "FROM Employee"
)
employee_df = read_data_return_df(sql_query_employee, engine)

# Show the first rows as the cell output
employee_df.head()

## Data Cleaning

In [None]:
# Combine the three tables into one frame:
#   1. inner join: pay history -> business entity (BusinessEntityID matched against BUSINESSENTITY_sk)
#   2. outer join: that result -> employee (BusinessEntityID matched against EmployeeID)
# The outer join keeps unmatched rows from both sides, so some rows can lack pay data.
# NOTE(review): step 1 joins on the surrogate key while step 2 joins on the business id - confirm this is intended.
combined_df = (
    employeepayhistory_df
    .merge(
        businessentity_df,
        left_on='EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID',
        right_on='BUSINESSENTITY_sk',
        suffixes=('_eph', '_b'),
    )
    .merge(
        employee_df,
        how='outer',
        left_on='BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID',
        right_on='EMPLOYEE_EMPLOYEE_EmployeeID',
        suffixes=('', '_e'),
    )
)

In [None]:
# Restrict the frame to the model inputs plus the target (the pay rate).
# The order of this list becomes the column order of the frame and of the feature plots.
columns_to_keep = [
    'EMPLOYEE_EMPLOYEE_EmployeeID',
    'EMPLOYEE_EMPLOYEE_ManagerID',
    'EMPLOYEE_EMPLOYEE_DeptID',
    'EMPLOYEE_EMPLOYEE_State',
    'EMPLOYEE_EMPLOYEE_BirthDate',
    'EMPLOYEE_EMPLOYEE_Start_Date',
    'EMPLOYEE_EMPLOYEE_OrganizationLevel',
    'EMPLOYEE_EMPLOYEE_SickLeaveHours',
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate',
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_PayFrequency',
    'BUSINESSENTITY_CONTACTTYPE_ContactTypeID',
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_RateChangeDate',
]

combined_df = combined_df[columns_to_keep]

In [None]:
# Encode the state as integer category codes (pandas assigns -1 to missing values)
combined_df['EMPLOYEE_EMPLOYEE_State'] = combined_df['EMPLOYEE_EMPLOYEE_State'].astype('category').cat.codes

# Turn each date column into whole days since the Unix epoch.
# errors='coerce' makes unparseable values NaT, which stay missing after the division.
unix_epoch = pd.Timestamp("1970-01-01")
one_day = pd.Timedelta('1D')
for date_column in (
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_RateChangeDate',
    'EMPLOYEE_EMPLOYEE_BirthDate',
    'EMPLOYEE_EMPLOYEE_Start_Date',
):
    parsed_dates = pd.to_datetime(combined_df[date_column], errors='coerce')
    combined_df[date_column] = (parsed_dates - unix_epoch) // one_day

In [None]:
# fixing null values
# NOTE(review): the return value is discarded, so this helper presumably edits combined_df in place - confirm.
drop_columns(combined_df)

# keep only the rows whose pay rate is not the -1 placeholder
rate_is_known = combined_df['EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate'].ne(-1)
combined_df = combined_df[rate_is_known]

In [None]:
# Print column dtypes and non-null counts to check the cleaned frame before modelling
combined_df.info()

## Data Analysis
Predicting the rate of pay for employees based on the other columns, using a variety of models

### Splitting the data

In [None]:
# The pay rate is the target; every other remaining column is a feature.
target_column = 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate'
y = combined_df[target_column]
X = combined_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Linear Regression

In [None]:
# Standardise the features. The scaler learns its statistics from the training rows only
# and the same transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit an ordinary least-squares regression on the standardised training features
lr_model = LinearRegression()

# Trailing expression: the fitted estimator is also shown as the cell output
lr_model.fit(X_train_scaled, y_train)

In [None]:
# Predict pay rates for the held-out rows (standardised with the training scaler)
lr_y_pred = lr_model.predict(X_test_scaled)

#### Calculating the metrics and visualizing the results

In [None]:
# Score the linear regression on the held-out rows
lr_mse, lr_r2 = (
    mean_squared_error(y_test, lr_y_pred),
    r2_score(y_test, lr_y_pred),
)

print(f'Mean Squared Error: {lr_mse}')
print(f'R2 Score: {lr_r2}')

In [None]:
# Visualise the test targets against the linear-regression predictions.
# plot_predictions is a project helper; the third argument is presumably used as the plot title.
plot_predictions(y_test, lr_y_pred, 'Linear Regression')

In [None]:
# Per-row error table for the linear regression: actual value, prediction,
# signed error, absolute error and squared error (indexed like y_test).
lr_results = pd.DataFrame({'Actual': y_test, 'Predicted': lr_y_pred}).assign(**{
    'Difference': lambda frame: frame['Actual'] - frame['Predicted'],
    'Absolute Difference': lambda frame: frame['Difference'].abs(),
    'Squared Difference': lambda frame: frame['Difference'] ** 2,
})

lr_results

In [None]:
# Plot one value per feature for the linear model.
# The values are the regression coefficients fitted on standardised features: they are signed,
# so the magnitude indicates the strength of a feature and the sign its direction.
plot_feature_importance(X.columns, lr_model.coef_)

### Decision Tree Regressor

In [None]:
# Fit a depth-limited regression tree on the standardised training features.
# random_state fixes the estimator's internal randomness (scikit-learn permutes the
# candidate features at every split), so reruns produce the same tree - in line with
# the seeded train/test split.
dtr_model = DecisionTreeRegressor(max_depth=5, random_state=42)

# Trailing expression: the fitted estimator is also shown as the cell output
dtr_model.fit(X_train_scaled, y_train)

In [None]:
# Predict pay rates for the held-out rows with the fitted decision tree
dtr_y_pred = dtr_model.predict(X_test_scaled)

#### Calculating the metrics and visualizing the results

In [None]:
# Score the decision tree on the held-out rows
dtr_mse, dtr_r2 = (
    mean_squared_error(y_test, dtr_y_pred),
    r2_score(y_test, dtr_y_pred),
)
print(f'Mean Squared Error: {dtr_mse}')
print(f'R2 Score: {dtr_r2}')

In [None]:
# Draw the fitted decision tree on a large canvas so the node labels stay readable.
# feature_names is passed as a plain list: scikit-learn releases that validate this
# parameter as a list reject a pandas Index.
# The tree was fitted on standardised features, so the split thresholds shown in the
# nodes are in standardised units, not in the original column units.
plt.figure(figsize=(50, 50))
plot_tree(dtr_model, filled=True, feature_names=list(X.columns))
plt.show()

In [None]:
# Per-row error table for the decision tree: actual value, prediction,
# signed error, absolute error and squared error (indexed like y_test).
dtr_results = pd.DataFrame({'Actual': y_test, 'Predicted': dtr_y_pred}).assign(**{
    'Difference': lambda frame: frame['Actual'] - frame['Predicted'],
    'Absolute Difference': lambda frame: frame['Difference'].abs(),
    'Squared Difference': lambda frame: frame['Difference'] ** 2,
})

dtr_results

In [None]:
# Plot the decision tree's feature_importances_ (one non-negative value per feature),
# i.e. how much each column contributes to the tree's splits
plot_feature_importance(X.columns, dtr_model.feature_importances_)

### Random Forest

In [None]:
# Fit a 100-tree random forest on the standardised training features.
# random_state fixes the bootstrap sampling and the per-split feature selection, so
# reruns produce the same forest and the same scores - in line with the seeded
# train/test split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Trailing expression: the fitted estimator is also shown as the cell output
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Predict pay rates for the held-out rows with the fitted random forest
rf_y_pred = rf_model.predict(X_test_scaled)

#### Calculating the metrics and visualizing the results

In [None]:
# Score the random forest on the held-out rows
rf_mse, rf_r2 = (
    mean_squared_error(y_test, rf_y_pred),
    r2_score(y_test, rf_y_pred),
)
print(f'Mean Squared Error: {rf_mse}')
print(f'R2 Score: {rf_r2}')

In [None]:
# Visualise the test targets against the random-forest predictions.
# plot_predictions is a project helper; the third argument is presumably used as the plot title.
plot_predictions(y_test, rf_y_pred, 'Random Forest')

In [None]:
# Per-row error table for the random forest: actual value, prediction,
# signed error, absolute error and squared error (indexed like y_test).
rf_results = pd.DataFrame({'Actual': y_test, 'Predicted': rf_y_pred}).assign(**{
    'Difference': lambda frame: frame['Actual'] - frame['Predicted'],
    'Absolute Difference': lambda frame: frame['Difference'].abs(),
    'Squared Difference': lambda frame: frame['Difference'] ** 2,
})

rf_results

In [None]:
# Plot the random forest's feature_importances_ (one non-negative value per feature),
# i.e. how much each column contributes to the forest's predictions
plot_feature_importance(X.columns, rf_model.feature_importances_)

### Gradient Boosting

In [None]:
# Fit a 100-stage gradient boosting regressor on the standardised training features.
# random_state fixes the randomness of the underlying trees, so reruns produce the
# same ensemble and the same scores - in line with the seeded train/test split.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Trailing expression: the fitted estimator is also shown as the cell output
gb_model.fit(X_train_scaled, y_train)

In [None]:
# Predict pay rates for the held-out rows with the fitted gradient boosting model
gb_y_pred = gb_model.predict(X_test_scaled)

#### Calculating the metrics and visualizing the results

In [None]:
# Score the gradient boosting model on the held-out rows
gb_mse, gb_r2 = (
    mean_squared_error(y_test, gb_y_pred),
    r2_score(y_test, gb_y_pred),
)
print(f'Mean Squared Error: {gb_mse}')
print(f'R2 Score: {gb_r2}')

In [None]:
# Visualise the test targets against the gradient-boosting predictions.
# plot_predictions is a project helper; the third argument is presumably used as the plot title.
plot_predictions(y_test, gb_y_pred, 'Gradient Boosting')

In [None]:
# Per-row error table for gradient boosting: actual value, prediction,
# signed error, absolute error and squared error (indexed like y_test).
gb_results = pd.DataFrame({'Actual': y_test, 'Predicted': gb_y_pred}).assign(**{
    'Difference': lambda frame: frame['Actual'] - frame['Predicted'],
    'Absolute Difference': lambda frame: frame['Difference'].abs(),
    'Squared Difference': lambda frame: frame['Difference'] ** 2,
})

gb_results

In [None]:
# Plot the gradient boosting model's feature_importances_ (one non-negative value per feature),
# i.e. how much each column contributes to the model's predictions
plot_feature_importance(X.columns, gb_model.feature_importances_)

### PyTorch Neural Network

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Standardise the features with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build float32 tensors directly on the chosen device.
# The targets are left unscaled and are 1-D (one value per row).
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32, device=device)

In [None]:
# Seed PyTorch before any weights are created so that the weight initialisation and the
# DataLoader shuffling are repeatable between runs, in line with the seeded train/test split.
torch.manual_seed(42)

# instantiate the model: two hidden layers of 32 ReLU units and a single linear output
p_model = torch.nn.Sequential(
    torch.nn.Linear(X_train_tensor.shape[1], 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 1)
).to(device)

# define the loss function and the optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(p_model.parameters(), lr=0.01)

# Create TensorDatasets for training and test data
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders for training and test data; only the training batches are shuffled
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

# Train the model with the project helper.
# NOTE(review): the network outputs shape (batch, 1) while the targets are 1-D (batch,).
# If train_model passes both straight to MSELoss they broadcast to (batch, batch) and the
# loss is wrong - confirm that the helper reshapes one of them.
train_model(p_model, criterion, optimizer, train_loader, num_epochs=1000)

#### Calculating the metrics and visualizing the results

In [None]:
# Evaluate the model on the complete test set.
# Both metrics are computed once over all test predictions, the same way the scikit-learn
# models in this notebook are scored. Averaging per-batch scores would weight a short final
# batch like a full one and, for R^2, would measure each batch against its own mean.
p_model.eval()
p_pred_batches = []
p_true_batches = []

with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = p_model(inputs)
        # flatten so that predictions (batch, 1) and targets (batch,) are both 1-D
        p_pred_batches.append(outputs.cpu().numpy().reshape(-1))
        p_true_batches.append(targets.cpu().numpy().reshape(-1))

p_y_pred = np.concatenate(p_pred_batches)
p_y_true = np.concatenate(p_true_batches)

# The result names keep their 'avg' prefix so any later reference to them still resolves
p_avg_mse = mean_squared_error(p_y_true, p_y_pred)
# R^2 is undefined for fewer than two samples
p_avg_r2 = r2_score(p_y_true, p_y_pred) if len(p_y_true) > 1 else None

print(f'Mean Squared Error: {p_avg_mse}')
print(f'R2 Score: {p_avg_r2}')

In [None]:
# Per-row error table for the neural network: actual value, prediction,
# signed error, absolute error and squared error.
# Both columns come from tensors, so the frame has a fresh 0..n-1 index.
p_results = pd.DataFrame({
    'Actual': y_test_tensor.cpu().detach().numpy(),
    'Predicted': p_model(X_test_tensor).cpu().detach().numpy().flatten(),
}).assign(**{
    'Difference': lambda frame: frame['Actual'] - frame['Predicted'],
    'Absolute Difference': lambda frame: frame['Difference'].abs(),
    'Squared Difference': lambda frame: frame['Difference'] ** 2,
})

p_results

In [None]:
# Visualise the test targets against the network's predictions (not feature importances).
# The network is run once more on the full test tensor and its (n, 1) output is flattened to 1-D.
# plot_predictions is a project helper; the third argument is presumably used as the plot title.
plot_predictions(y_test_tensor.cpu().detach().numpy(), p_model(X_test_tensor).cpu().detach().numpy().flatten(), 'Pytorch Network')