In [3]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:
pip install tqdm





In [5]:
# Read the job-postings CSV from the current working directory.
# (An earlier revision pointed at '/content/drive/MyDrive/Cyber_Datav1.csv'.)
df = pd.read_csv(filepath_or_buffer='Cyber_Datav1.csv')

In [6]:
# Preview the top five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Title,Job_location,Salary_low,Salary_med,Salary_upper,Remote,Hybrid,On_prem,Source,Work_env,...,Mid,Senior,Executive,Experience_level,Salary_s,Commute_S,Per_Crime_S,Prop_Crime_S,Stress,Candidate
0,Information Security Analyst,"Washington, DC",55000.0,67500.0,80000,1,0,0,Indeed,1,...,0,0,0,1,0,0,0,0,0,0
1,Security Systems Analyst | REMOTE,"Washington, DC",84500.0,95750.0,107000,1,0,0,Indeed,1,...,1,0,0,2,1,0,1,1,3,1
2,Intermediate Information Security Analyst,"Washington, DC",75000.0,82500.0,90000,1,0,0,Indeed,1,...,0,0,0,1,0,0,1,0,1,0
3,Cybersecurity Analyst,"Washington, DC",55700.0,63150.0,70600,1,0,0,Indeed,1,...,0,0,0,1,1,0,0,0,1,0
4,Cloud Security Architect,"Washington, DC",120000.0,140000.0,160000,1,0,0,Indeed,1,...,0,1,0,3,1,0,0,0,1,0


In [8]:
# Get an overview of the dataset (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Title             1218 non-null   object 
 1   Job_location      1218 non-null   object 
 2   Salary_low        1218 non-null   float64
 3   Salary_med        1218 non-null   float64
 4   Salary_upper      1218 non-null   int64  
 5   Remote            1218 non-null   int64  
 6   Hybrid            1218 non-null   int64  
 7   On_prem           1218 non-null   int64  
 8   Source            1218 non-null   object 
 9   Work_env          1218 non-null   int64  
 10  Buy_rent          1218 non-null   int64  
 11  Home_location     1218 non-null   object 
 12  Commute           1218 non-null   int64  
 13  Weekly_commute    1218 non-null   int64  
 14  Person_crime      1218 non-null   int64  
 15  Prop_crime        1218 non-null   int64  
 16  Entry             1218 non-null   int64  


In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the frame is the cell's last expression so it is displayed.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Salary_low,Salary_med,Salary_upper,Remote,Hybrid,On_prem,Work_env,Buy_rent,Commute,Weekly_commute,...,Mid,Senior,Executive,Experience_level,Salary_s,Commute_S,Per_Crime_S,Prop_Crime_S,Stress,Candidate
count,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,...,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0
mean,102999.188342,121519.643842,140040.099343,0.188013,0.08046,0.731527,2.543514,0.399015,90.01642,630.114943,...,0.414614,0.337438,0.090312,2.360427,0.493432,0.578818,0.477011,0.112479,1.661741,0.247126
std,37151.283342,38497.514516,43549.085435,0.390883,0.272115,0.443347,0.790345,0.489897,69.292549,485.047846,...,0.492858,0.47303,0.286746,0.852649,0.500162,0.493951,0.499676,0.316085,1.130236,0.431518
min,43000.0,56000.0,60000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,73000.0,92000.0,112000.0,0.0,0.0,0.0,2.0,0.0,18.0,126.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,94199.0,112000.0,133000.0,0.0,0.0,1.0,3.0,0.0,105.0,735.0,...,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0
75%,130000.0,152500.0,170000.0,0.0,0.0,1.0,3.0,1.0,145.0,1015.0,...,1.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,2.0,0.0
max,250000.0,250000.0,260000.0,1.0,1.0,1.0,3.0,1.0,225.0,1575.0,...,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0


In [10]:
# DATA PREPROCESSING
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Every count is zero, so no imputation or row dropping is required here.
# Categorical (object) columns may still need encoding before they can be fed
# to some machine learning algorithms.


Title               0
Job_location        0
Salary_low          0
Salary_med          0
Salary_upper        0
Remote              0
Hybrid              0
On_prem             0
Source              0
Work_env            0
Buy_rent            0
Home_location       0
Commute             0
Weekly_commute      0
Person_crime        0
Prop_crime          0
Entry               0
Mid                 0
Senior              0
Executive           0
Experience_level    0
Salary_s            0
Commute_S           0
Per_Crime_S         0
Prop_Crime_S        0
Stress              0
Candidate           0
dtype: int64


# ML Modeling

### Predicting Salary_med

In [11]:
# Silence pandas' chained-assignment warning for the session, as this cell has
# always done; it is kept so cells further down behave the same as before.
pd.set_option('mode.chained_assignment', None)

# One seed for the split and for every stochastic estimator, so reruns
# reproduce the same metrics and the same "best model" pick.
RANDOM_STATE = 42

# Features and target for the Salary_med regression.
# .copy() makes X_salary an independent frame, so the column assignment below
# is an ordinary write rather than an assignment into a slice of df.
X_salary = df[['Experience_level', 'Stress', 'Job_location', 'Commute_S']].copy()
y_salary = df['Salary_med']

# Encode Job_location as integer codes. `le` stays fitted on this column and is
# reused by the sample-prediction cell that follows.
# NOTE(review): integer codes impose an arbitrary order on locations; consider
# one-hot encoding if the linear model's result matters.
le = LabelEncoder()
X_salary['Job_location'] = le.fit_transform(X_salary['Job_location'])

# Splitting the data into training (80%) and testing (20%) sets
X_train_salary, X_test_salary, y_train_salary, y_test_salary = train_test_split(
    X_salary, y_salary, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling the features: statistics are learned on the training split only and
# then applied unchanged to the test split.
scaler_salary = StandardScaler()
X_train_scaled_salary = scaler_salary.fit_transform(X_train_salary)
X_test_scaled_salary = scaler_salary.transform(X_test_salary)

# Initializing regression models. The tree-based estimators draw random numbers
# while fitting, so each one is seeded explicitly.
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# Training and evaluating models; results_salary maps model name -> (mse, r2).
results_salary = {}
for model_name, model in tqdm(regression_models.items(), desc='Training Salary Prediction Models'):
    model.fit(X_train_scaled_salary, y_train_salary)
    y_pred_salary = model.predict(X_test_scaled_salary)
    mse = mean_squared_error(y_test_salary, y_pred_salary)
    r2 = r2_score(y_test_salary, y_pred_salary)
    results_salary[model_name] = (mse, r2)
    print(f'\n{model_name} [mse: {mse:.2f}, r2: {r2:.2f}]')

# Identifying the best model based on mean squared error (lower is better).
# NOTE(review): the winner is chosen on the same test split that is reported,
# so the reported score is optimistic; consider cross-validation.
best_model_salary = min(results_salary, key=lambda x: results_salary[x][0])
print("Best Model for predicting Salary_med:", best_model_salary)
print("MSE and R2 score:", results_salary[best_model_salary])

Training Salary Prediction Models: 100%|█████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.81it/s]


Linear Regression [mse: 126039139.28, r2: 0.91]

Decision Tree [mse: 14138486.02, r2: 0.99]

Random Forest [mse: 16608519.54, r2: 0.99]

Gradient Boosting Regressor [mse: 26533742.04, r2: 0.98]
Best Model for predicting Salary_med: Decision Tree
MSE and R2 score: (14138486.016645782, 0.9902374912035732)





In [12]:
# Provide a sample input for predicting Salary_med.
# The values must lie inside the ranges the models were fitted on: the
# df.describe() output shows Experience_level spanning 1-4 and Commute_S being
# a 0/1 flag, so values such as 5 or 45 would be far outside the training data
# and give a meaningless prediction.
# NOTE(review): Commute_S=1 is an assumption about the intended scenario —
# confirm which flag value the example should use.
sample_input_salary = pd.DataFrame({
    'Experience_level': [4],
    'Stress': [2],
    'Job_location': ['Washington, DC'],
    'Commute_S': [1]
})

# Encode the location with the already-fitted LabelEncoder (transform only, so
# the code assigned to each location matches the training data).
sample_input_salary['Job_location'] = le.transform(sample_input_salary['Job_location'])

# Apply the scaler fitted on the salary training split.
sample_input_scaled_salary = scaler_salary.transform(sample_input_salary)

# Predict using the model that had the lowest test MSE.
best_model = regression_models[best_model_salary]
prediction_salary = best_model.predict(sample_input_scaled_salary)

print("Predicted Salary_med:", prediction_salary[0])

Predicted Salary_med: 180000.0



### Predicting the Risk of an Individual Leaving the Job


In [13]:
# One seed for the split and for every stochastic estimator, so reruns
# reproduce the same accuracies and the same "best model" pick.
RANDOM_STATE = 42

# Assuming the target variable is 'Candidate' (1 for Yes, 0 for No).
# .copy() makes X_leave_job an independent frame, so the column assignment
# below is an ordinary write rather than an assignment into a slice of df.
X_leave_job = df[['Salary_med', 'Experience_level', 'Stress', 'Job_location', 'Commute_S']].copy()
y_leave_job = df['Candidate']

# Encode Job_location as integer codes. This refits the shared encoder `le`,
# which the sample-prediction cell that follows reuses via le.transform().
X_leave_job['Job_location'] = le.fit_transform(X_leave_job['Job_location'])

# Splitting the data into training (80%) and testing (20%) sets
X_train_leave_job, X_test_leave_job, y_train_leave_job, y_test_leave_job = train_test_split(
    X_leave_job, y_leave_job, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling the features: fit on the training split only, then apply to test.
scaler_leave_job = StandardScaler()
X_train_scaled_leave_job = scaler_leave_job.fit_transform(X_train_leave_job)
X_test_scaled_leave_job = scaler_leave_job.transform(X_test_leave_job)

# Initializing classification models. The ensemble estimators draw random
# numbers while fitting, so each one is seeded explicitly.
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Training and evaluating models; results_leave_job maps model name -> accuracy.
# NOTE(review): the recorded run shows every model at 1.00 accuracy, which
# suggests 'Candidate' may be directly derivable from a feature such as
# 'Stress' — confirm there is no target leakage before trusting these scores.
results_leave_job = {}
for model_name, model in tqdm(classification_models.items(), desc='Training \'At Risk of leaving\' Models'):
    model.fit(X_train_scaled_leave_job, y_train_leave_job)
    y_pred_leave_job = model.predict(X_test_scaled_leave_job)
    accuracy_leave_job = accuracy_score(y_test_leave_job, y_pred_leave_job)
    results_leave_job[model_name] = accuracy_leave_job
    print(f'\n{model_name} Accuracy: {accuracy_leave_job:.2f}')

# Identifying the best model based on accuracy; on ties max() keeps the first
# entry in insertion order.
best_model_leave_job = max(results_leave_job, key=results_leave_job.get)
print("Best Model for predicting the Risk of Individual Leaving the Job:", best_model_leave_job)


Training 'At Risk of leaving' Models: 100%|██████████████████████████████████████████████| 5/5 [00:00<00:00, 35.27it/s]


Logistic Regression Accuracy: 1.00

Random Forest Accuracy: 1.00

SVM Accuracy: 1.00

KNN Accuracy: 1.00

Gradient Boosting Accuracy: 1.00
Best Model for predicting the Risk of Individual Leaving the Job: Logistic Regression





In [14]:
# Provide a sample input for predicting the Risk of Individual Leaving the Job.
# The values must lie inside the ranges the models were fitted on: the
# df.describe() output shows Experience_level spanning 1-4 and Commute_S being
# a 0/1 flag, so values such as 5 or 45 become extreme outliers after scaling
# and can dominate the prediction.
# NOTE(review): Commute_S=1 is an assumption about the intended scenario —
# confirm which flag value the example should use.
sample_input_leave_job = pd.DataFrame({
    'Salary_med': [100000],
    'Experience_level': [4],
    'Stress': [2],
    'Job_location': ['Washington, DC'],
    'Commute_S': [1]
})

# Encode the location with the already-fitted LabelEncoder (transform only, so
# the code assigned to each location matches the training data).
sample_input_leave_job['Job_location'] = le.transform(sample_input_leave_job['Job_location'])

# Apply the scaler fitted on the leave-job training split.
sample_input_scaled_leave_job = scaler_leave_job.transform(sample_input_leave_job)

# Predict using the classifier with the highest test accuracy.
best_model = classification_models[best_model_leave_job]
prediction_leave_job = best_model.predict(sample_input_scaled_leave_job)

print("Predicted Risk of Individual Leaving the Job:", prediction_leave_job[0])

Predicted Risk of Individual Leaving the Job: 1



### Predicting Preferred Job Location


In [15]:
# One seed for the split and for every stochastic estimator, so reruns
# reproduce the same accuracies and the same "best model" pick.
RANDOM_STATE = 42

# Assuming the target variable is 'Job_location'
X_pref_location = df[['Salary_med', 'Experience_level', 'Stress', 'Commute_S']]
y_pref_location = df['Job_location']

# Encode the target labels (not a feature) as integer class ids. This refits
# the shared encoder `le`, which the sample-prediction cell that follows uses
# to turn predicted ids back into location names.
y_pref_location = le.fit_transform(y_pref_location)

# Splitting the data into training (80%) and testing (20%) sets
X_train_pref_location, X_test_pref_location, y_train_pref_location, y_test_pref_location = train_test_split(
    X_pref_location, y_pref_location, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling the features: fit on the training split only, then apply to test.
scaler_pref_location = StandardScaler()
X_train_scaled_pref_location = scaler_pref_location.fit_transform(X_train_pref_location)
X_test_scaled_pref_location = scaler_pref_location.transform(X_test_pref_location)

# Initializing classification models. The ensemble estimators draw random
# numbers while fitting, so each one is seeded explicitly. max_iter is raised
# for logistic regression on this multi-class target.
classification_models_pref_location = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Training and evaluating models; results_pref_location maps name -> accuracy.
results_pref_location = {}
for model_name, model in tqdm(classification_models_pref_location.items(), desc='Training Prefered Location Models'):
    model.fit(X_train_scaled_pref_location, y_train_pref_location)
    y_pred_pref_location = model.predict(X_test_scaled_pref_location)
    accuracy_pref_location = accuracy_score(y_test_pref_location, y_pred_pref_location)
    results_pref_location[model_name] = accuracy_pref_location
    print(f'{model_name} Accuracy: {accuracy_pref_location:.2f}')

# Identifying the best model based on accuracy
best_model_pref_location = max(results_pref_location, key=results_pref_location.get)
print("Best Model for predicting Preferred Job Location:", best_model_pref_location)

Training Prefered Location Models:  40%|███████████████████▌                             | 2/5 [00:00<00:00, 12.38it/s]

Logistic Regression Accuracy: 0.47
Random Forest Accuracy: 0.81
SVM Accuracy: 0.58
KNN Accuracy: 0.79


Training Prefered Location Models: 100%|█████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.02it/s]

Gradient Boosting Accuracy: 0.55
Best Model for predicting Preferred Job Location: Random Forest





In [16]:
# Provide a sample input for predicting Preferred Job Location.
# The values must lie inside the ranges the models were fitted on: the
# df.describe() output shows Experience_level spanning 1-4 and Commute_S being
# a 0/1 flag, so values such as 5 or 45 would be far outside the training data.
# NOTE(review): Commute_S=1 is an assumption about the intended scenario —
# confirm which flag value the example should use.
sample_input_pref_location = pd.DataFrame({
    'Salary_med': [100000],
    'Experience_level': [4],
    'Stress': [2],
    'Commute_S': [1]
})

# Apply the scaler fitted on the preferred-location training split.
sample_input_scaled_pref_location = scaler_pref_location.transform(sample_input_pref_location)

# Predict using the classifier with the highest test accuracy.
best_model = classification_models_pref_location[best_model_pref_location]
prediction_pref_location = best_model.predict(sample_input_scaled_pref_location)

# Map the predicted integer class id back to the original location string.
predicted_location = le.inverse_transform(prediction_pref_location)[0]
print("Predicted Preferred Job Location:", predicted_location)

Predicted Preferred Job Location: Columbia, MD
