In [1]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.regressor import PredictionError
sns.set(color_codes=True)

%matplotlib inline

In [2]:
# Load the dataset
df = pd.read_csv('DAEN690_Final_Cyber_Exodus_Dataset.csv')

In [3]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,Title,Job_location,Salary_low,Salary_med,Salary_upper,Remote,Hybrid,On_prem,Source,Work_env,...,Mid,Senior,Executive,Experience_level,Salary_s,Commute_S,Per_Crime_S,Prop_Crime_S,Stress,Candidate
0,Information Security Analyst,"Washington, DC",55000.0,67500.0,80000,1,0,0,Indeed,1,...,0,0,0,1,0,0,0,0,0,0
1,Security Systems Analyst | REMOTE,"Washington, DC",84500.0,95750.0,107000,1,0,0,Indeed,1,...,1,0,0,2,1,0,1,1,3,1
2,Intermediate Information Security Analyst,"Washington, DC",75000.0,82500.0,90000,1,0,0,Indeed,1,...,0,0,0,1,0,0,1,0,1,0
3,Cybersecurity Analyst,"Washington, DC",55700.0,63150.0,70600,1,0,0,Indeed,1,...,0,0,0,1,1,0,0,0,1,0
4,Cloud Security Architect,"Washington, DC",120000.0,140000.0,160000,1,0,0,Indeed,1,...,0,1,0,3,1,0,0,0,1,0


In [4]:
# Get an overview of the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Title             1168 non-null   object 
 1   Job_location      1168 non-null   object 
 2   Salary_low        1168 non-null   float64
 3   Salary_med        1168 non-null   float64
 4   Salary_upper      1168 non-null   int64  
 5   Remote            1168 non-null   int64  
 6   Hybrid            1168 non-null   int64  
 7   On_prem           1168 non-null   int64  
 8   Source            1168 non-null   object 
 9   Work_env          1168 non-null   int64  
 10  Buy_rent          1168 non-null   int64  
 11  Home_location     1168 non-null   object 
 12  Commute           1168 non-null   int64  
 13  Weekly_commute    1168 non-null   int64  
 14  Person_crime      1168 non-null   int64  
 15  Prop_crime        1168 non-null   int64  
 16  Entry             1168 non-null   int64  


In [5]:
# Summary statistics
df.describe()

Unnamed: 0,Salary_low,Salary_med,Salary_upper,Remote,Hybrid,On_prem,Work_env,Buy_rent,Commute,Weekly_commute,...,Mid,Senior,Executive,Experience_level,Salary_s,Commute_S,Per_Crime_S,Prop_Crime_S,Stress,Candidate
count,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,...,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,99916.961815,118909.18339,137901.404966,0.196062,0.083904,0.720034,2.523973,0.373288,87.234589,436.172945,...,0.432363,0.351884,0.05137,2.29024,0.471747,0.559932,0.454623,0.074486,1.560788,0.214897
std,34737.023354,37136.854214,43150.62266,0.397186,0.277363,0.449174,0.80131,0.483885,69.384562,346.922809,...,0.495616,0.477762,0.220846,0.798783,0.499415,0.496608,0.49815,0.262673,1.041423,0.410927
min,43000.0,56000.0,60000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,72000.0,92000.0,112000.0,0.0,0.0,0.0,2.0,0.0,14.0,70.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,94000.0,110000.0,132000.0,0.0,0.0,1.0,3.0,0.0,105.0,525.0,...,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0
75%,124800.0,150000.0,160000.0,0.0,0.0,1.0,3.0,1.0,128.0,640.0,...,1.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,2.0,0.0
max,250000.0,250000.0,260000.0,1.0,1.0,1.0,3.0,1.0,225.0,1125.0,...,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0


In [6]:
# DATA PREPROCESSING
# Check for missing values
print(df.isnull().sum())

# Since the dataset is already clean and there are no missing values, no further preprocessing is needed for this step.
# However, we may need to encode categorical variables for certain machine learning algorithms.

Title               0
Job_location        0
Salary_low          0
Salary_med          0
Salary_upper        0
Remote              0
Hybrid              0
On_prem             0
Source              0
Work_env            0
Buy_rent            0
Home_location       0
Commute             0
Weekly_commute      0
Person_crime        0
Prop_crime          0
Entry               0
Mid                 0
Senior              0
Executive           0
Experience_level    0
Salary_s            0
Commute_S           0
Per_Crime_S         0
Prop_Crime_S        0
Stress              0
Candidate           0
dtype: int64


# # Predicting the Stress Candidate

In [7]:
# Assuming the target variable is 'Candidate' (1 for Yes, 0 for No)
X_leave_job = df[['Salary_med','Experience_level', 'Work_env', 'Weekly_commute', 'Person_crime', 'Prop_crime']]
y_leave_job = df['Candidate']

# Handling categorical features (Job_location) using Label Encoding
#X_leave_job['Job_location'] = le.fit_transform(X_leave_job['Job_location'])

# Splitting the data into training and testing sets
X_train_leave_job, X_test_leave_job, y_train_leave_job, y_test_leave_job = train_test_split(X_leave_job, y_leave_job, test_size=0.2, random_state=42)

# Scaling the numerical features
scaler_leave_job = StandardScaler()
X_train_scaled_leave_job = scaler_leave_job.fit_transform(X_train_leave_job)
X_test_scaled_leave_job = scaler_leave_job.transform(X_test_leave_job)

# Initializing classification models
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()

}

# Training and evaluating models
results_leave_job = {}
for model_name, model in tqdm(classification_models.items(), desc='Training \'At Risk of leaving\' Models'):
    model.fit(X_train_scaled_leave_job, y_train_leave_job)
    y_pred_leave_job = model.predict(X_test_scaled_leave_job)
    accuracy_leave_job = accuracy_score(y_test_leave_job, y_pred_leave_job)
    results_leave_job[model_name] = accuracy_leave_job
    print(f'\n{model_name} Accuracy: {accuracy_leave_job:.2f}')



# Identifying the best model based on accuracy
best_model_leave_job = max(results_leave_job, key=results_leave_job.get)
print("Best Model for predicting the Risk of Individual Leaving the Job:", best_model_leave_job)


Training 'At Risk of leaving' Models:   0%|                                                      | 0/5 [00:00<?, ?it/s]


Logistic Regression Accuracy: 0.97


Training 'At Risk of leaving' Models:  40%|██████████████████▍                           | 2/5 [00:00<00:00,  6.56it/s]


Random Forest Accuracy: 1.00

SVM Accuracy: 0.97

KNN Accuracy: 0.99


Training 'At Risk of leaving' Models: 100%|██████████████████████████████████████████████| 5/5 [00:00<00:00,  8.06it/s]


Gradient Boosting Accuracy: 1.00
Best Model for predicting the Risk of Individual Leaving the Job: Random Forest





# # Predicting the Stress Candidate without salary

In [8]:
# Assuming the target variable is 'Candidate' (1 for Yes, 0 for No)
X_leave_job = df[['Experience_level', 'Work_env', 'Weekly_commute', 'Person_crime', 'Prop_crime']]
y_leave_job = df['Candidate']

# Handling categorical features (Job_location) using Label Encoding
#X_leave_job['Job_location'] = le.fit_transform(X_leave_job['Job_location'])

# Splitting the data into training and testing sets
X_train_leave_job, X_test_leave_job, y_train_leave_job, y_test_leave_job = train_test_split(X_leave_job, y_leave_job, test_size=0.2, random_state=42)

# Scaling the numerical features
scaler_leave_job = StandardScaler()
X_train_scaled_leave_job = scaler_leave_job.fit_transform(X_train_leave_job)
X_test_scaled_leave_job = scaler_leave_job.transform(X_test_leave_job)

# Initializing classification models
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()

}

# Training and evaluating models
results_leave_job = {}
for model_name, model in tqdm(classification_models.items(), desc='Training \'At Risk of leaving\' Models'):
    model.fit(X_train_scaled_leave_job, y_train_leave_job)
    y_pred_leave_job = model.predict(X_test_scaled_leave_job)
    accuracy_leave_job = accuracy_score(y_test_leave_job, y_pred_leave_job)
    results_leave_job[model_name] = accuracy_leave_job
    print(f'\n{model_name} Accuracy: {accuracy_leave_job:.2f}')



# Identifying the best model based on accuracy
best_model_leave_job = max(results_leave_job, key=results_leave_job.get)
print("Best Model for predicting the Risk of Individual Leaving the Job:", best_model_leave_job)


Training 'At Risk of leaving' Models:   0%|                                                      | 0/5 [00:00<?, ?it/s]


Logistic Regression Accuracy: 0.97


Training 'At Risk of leaving' Models:  40%|██████████████████▍                           | 2/5 [00:00<00:00,  7.28it/s]


Random Forest Accuracy: 0.97

SVM Accuracy: 0.94

KNN Accuracy: 0.96


Training 'At Risk of leaving' Models: 100%|██████████████████████████████████████████████| 5/5 [00:00<00:00,  9.10it/s]


Gradient Boosting Accuracy: 0.97
Best Model for predicting the Risk of Individual Leaving the Job: Random Forest





## Predicting  Median Salary

In [9]:
pd.set_option('mode.chained_assignment', None)

# Assuming the target variable is 'Salary_med'
X_salary = df[['Work_env', 'Job_location', 'Weekly_commute']]
y_salary = df['Salary_med']

le = LabelEncoder()

# Handling categorical features (Job_location) using Label Encoding
X_salary['Job_location'] = le.fit_transform(X_salary['Job_location'])

# Splitting the data into training and testing sets
X_train_salary, X_test_salary, y_train_salary, y_test_salary = train_test_split(X_salary, y_salary, test_size=0.2, random_state=42)

# Scaling the numerical features
scaler_salary = StandardScaler()
X_train_scaled_salary = scaler_salary.fit_transform(X_train_salary)
X_test_scaled_salary = scaler_salary.transform(X_test_salary)

# Initializing regression models
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor()
}

# Training and evaluating models
results_salary = {}
for model_name, model in tqdm(regression_models.items(), desc='Training Salary Prediction Models'):
    model.fit(X_train_scaled_salary, y_train_salary)
    y_pred_salary = model.predict(X_test_scaled_salary)
    mse = mean_squared_error(y_test_salary, y_pred_salary)
    r2 = r2_score(y_test_salary, y_pred_salary)
    results_salary[model_name] = (mse, r2)
    print(f'\n{model_name} [mse: {mse:.2f}, r2: {r2:.2f}]')

# Identifying the best model based on mean squared error (MSE)
best_model_salary = min(results_salary, key=lambda x: results_salary[x][0])
print("Best Model for predicting Salary_med:", best_model_salary)
print("MSE and R2 score:", results_salary[best_model_salary])

Training Salary Prediction Models:   0%|                                                         | 0/4 [00:00<?, ?it/s]


Linear Regression [mse: 1336158652.73, r2: -0.01]

Decision Tree [mse: 337280438.21, r2: 0.74]


Training Salary Prediction Models: 100%|█████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.44it/s]


Random Forest [mse: 338427774.85, r2: 0.74]

Gradient Boosting Regressor [mse: 361802697.74, r2: 0.73]
Best Model for predicting Salary_med: Decision Tree
MSE and R2 score: (337280438.2103085, 0.7449573250225581)





## Predicting Job Location

In [10]:
# Assuming the target variable is 'Job_location'
X_pref_location = df[['Salary_med', 'Experience_level', 'Weekly_commute']]
y_pref_location = df['Job_location']

# Handling categorical features (Job_location) using Label Encoding
y_pref_location = le.fit_transform(y_pref_location)

# Splitting the data into training and testing sets
X_train_pref_location, X_test_pref_location, y_train_pref_location, y_test_pref_location = train_test_split(X_pref_location, y_pref_location, test_size=0.2, random_state=42)

# Scaling the numerical features
scaler_pref_location = StandardScaler()
X_train_scaled_pref_location = scaler_pref_location.fit_transform(X_train_pref_location)
X_test_scaled_pref_location = scaler_pref_location.transform(X_test_pref_location)

# Initializing classification models
classification_models_pref_location = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Training and evaluating models
results_pref_location = {}
for model_name, model in tqdm(classification_models_pref_location.items(), desc='Training Prefered Location Models'):
    model.fit(X_train_scaled_pref_location, y_train_pref_location)
    y_pred_pref_location = model.predict(X_test_scaled_pref_location)
    accuracy_pref_location = accuracy_score(y_test_pref_location, y_pred_pref_location)
    results_pref_location[model_name] = accuracy_pref_location
    print(f'{model_name} Accuracy: {accuracy_pref_location:.2f}')

# Identifying the best model based on accuracy
best_model_pref_location = max(results_pref_location, key=results_pref_location.get)
print("Best Model for predicting Preferred Job Location:", best_model_pref_location)

Training Prefered Location Models:   0%|                                                         | 0/5 [00:00<?, ?it/s]

Logistic Regression Accuracy: 0.44


Training Prefered Location Models:  60%|█████████████████████████████▍                   | 3/5 [00:00<00:00,  5.93it/s]

Random Forest Accuracy: 0.91
SVM Accuracy: 0.58
KNN Accuracy: 0.87


Training Prefered Location Models: 100%|█████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.13s/it]

Gradient Boosting Accuracy: 0.90
Best Model for predicting Preferred Job Location: Random Forest



