# Job Classification using a Linear Regression model

In [123]:
#importing all required libraries for this analysis
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [124]:
# Read the job-classification CSV into the pandas DataFrame `df`.
# The path is relative, so the file is looked up in the current working directory.
df = pd.read_csv(filepath_or_buffer='jobclassinfo2.csv')

In [125]:
# Initial exploratory data analysis (EDA): preview, schema, summary stats, missing values.
# First rows of the dataset.
print(df.head())
# DataFrame.info() writes its report directly and returns None, so it is
# called on its own; wrapping it in print() would emit a stray "None" line.
df.info()
# Summary statistics for the numeric columns.
print(df.describe())
# Number of missing values in each column.
print(df.isnull().sum())

   ID  JobFamily    JobFamilyDescription  JobClass JobClassDescription  \
0   1          1  Accounting And Finance         1        Accountant I   
1   2          1  Accounting And Finance         2       Accountant II   
2   3          1  Accounting And Finance         3      Accountant III   
3   4          1  Accounting And Finance         4       Accountant IV   
4   5          2  Administrative Support         5     Admin Support I   

   PayGrade  EducationLevel  Experience  OrgImpact  ProblemSolving  \
0         5               3           1          3               3   
1         6               4           1          5               4   
2         8               4           2          6               5   
3        10               5           5          6               6   
4         1               1           0          1               1   

   Supervision  ContactLevel  FinancialBudget    PG  
0            4             3                5  PG05  
1            5            

In [126]:
# Feature selection: the columns used as inputs to the linear regression model.
# The order of this list is the column order of the feature matrix built from it.
selected_features = [
    'EducationLevel',
    'Experience',
    'OrgImpact',
    'ProblemSolving',
    'Supervision',
    'ContactLevel',
    'FinancialBudget',
]
# Earlier alternative, kept for reference for now:
# X = pd.DataFrame(data=df[selected_features], columns=selected_features)

In [127]:
# Separate the target (PayGrade) from the input features, then hold out a test set.
y = df['PayGrade']
X = df[selected_features]

# test_size=0.2 reserves 20% of the rows for testing;
# random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [128]:
# Instantiate a LinearRegression model with default settings
# and fit it on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [129]:
# Predict PayGrade for the held-out test rows with the fitted model.
y_pred = model.predict(X=X_test)

In [130]:
# Evaluate the test-set predictions with three regression metrics:
#   - Mean absolute error: average absolute difference between predicted and actual values.
#   - Mean squared error: average squared difference between predicted and actual values.
#   - R-squared: proportion of the variance in the dependent variable that is
#     predictable from the independent variables; a higher value indicates a better fit.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric rounded to two decimal places.
for label, score in (
    ('Mean Absolute Error', mae),
    ('Mean Squared Error', mse),
    ('R-squared', r2),
):
    print(f'{label}: {score:.2f}')

Mean Absolute Error: 0.64
Mean Squared Error: 0.83
R-squared: 0.85


In [131]:
# Pull the fitted weights and the intercept off the trained model.
coef, intercept = model.coef_, model.intercept_

# Pair each weight with its feature name; this relies on the model having been
# fitted on columns in the same order as `selected_features`.
print('Model Coefficients:')
for name, value in zip(selected_features, coef):
    print(f'{name}: {value:.2f}')

print(f'Intercept: {intercept:.2f}')

Model Coefficients:
EducationLevel: 0.23
Experience: -0.03
OrgImpact: 0.04
ProblemSolving: 1.27
Supervision: -0.16
ContactLevel: -0.08
FinancialBudget: 0.25
Intercept: 0.08


In [132]:
# Use the trained model to predict the PayGrade of a new job description.
# The job is described by name rather than by position so each value is
# unambiguous and cannot silently drift out of sync with the feature order.
new_job = {
    'EducationLevel': 3,
    'Experience': 5,
    'OrgImpact': 2,
    'ProblemSolving': 4,
    'Supervision': 3,
    'ContactLevel': 6,
    'FinancialBudget': 7,
}

# Build a one-row DataFrame whose columns follow `selected_features`, i.e. the
# same names and order the model was fitted on. Passing a bare nested list
# instead makes scikit-learn warn that X does not have valid feature names.
# No scaling is applied here because the model was trained on unscaled features.
new_job_df = pd.DataFrame([new_job], columns=selected_features)

predicted_paygrade = model.predict(new_job_df)

# The model is a regression, so the prediction is a continuous value.
print(f'Predicted PayGrade: {predicted_paygrade[0]:.2f}')

Predicted PayGrade: 6.53




This analysis is not yet complete.