### Predict the performance of staff in IT-centric jobs and find the attribute that has the highest weight in training the model that predicts performance.

In [3]:
import altair as alt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('FAU_Bank_Employee_Performance(1).xls - INX_Future_Inc_Employee_Perform.csv')

# Display the first 5 rows
print("First 5 rows:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Show columns and their types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nColumn Information:")
df.info()

# Remove the EmpNumber column so it is not carried into the encoded frame
df.drop(columns=['EmpNumber'], inplace=True)

# Every object-dtype column is treated as categorical
object_dtype_frame = df.select_dtypes(include=['object'])
categorical_columns = list(object_dtype_frame.columns)

# Expand each categorical column into indicator columns, omitting the first
# level of each one (drop_first=True)
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

# Pairwise correlations across every column of the encoded frame
markdown_alignment = dict(numalign="left", stralign="left")
correlation_matrix = df_encoded.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix.to_markdown(**markdown_alignment))

# Rank every column by its correlation with the target, largest value first
performance_column = correlation_matrix.loc[:, 'PerformanceRating']
correlations_with_performance = performance_column.sort_values(ascending=False)
print("\nCorrelations with PerformanceRating:")
print(correlations_with_performance.to_markdown(**markdown_alignment))

# Visualize the distribution of Performance Ratings.
# The rating is encoded as an ordinal field (:O) so the chart draws one bar per
# distinct rating value. The classification report recorded in this notebook
# lists only the ratings 2, 3 and 4, so binning it as a continuous quantity
# would split that range into narrow numeric bins that are mostly empty.
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('PerformanceRating:O', axis=alt.Axis(title='Performance Rating', labelAngle=0)),
    y=alt.Y('count()', axis=alt.Axis(title='Number of Employees')),
    tooltip=['PerformanceRating:O', 'count()']
).properties(
    title='Distribution of Employee Performance Ratings'
).interactive()

# Persist the chart specification as JSON
chart.save('performance_rating_distribution.json')

# Separate the target column from the predictor columns
y = df_encoded['PerformanceRating']
X = df_encoded.drop(columns='PerformanceRating')

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Train a Random Forest classifier.
# PerformanceRating is modelled as a class label rather than a continuous
# value: the output recorded for this cell is RandomForestClassifier, and the
# evaluation cell that follows applies classification metrics, which reject
# the continuous predictions a regressor would produce.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# Predict on the held-out rows and report classification metrics
y_pred = model.predict(X_test)

print("\nModel Evaluation:")
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
# average="weighted" weights each class's score by its support
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"):.2f}')

# Feature Importance: pair each fitted importance with its column name and
# rank from most to least important.
importances = model.feature_importances_
feature_importances = (
    pd.Series(importances, index=X.columns)
    .sort_values(ascending=False)
    # Name the index and the values while converting to a DataFrame, so the
    # result has exactly the 'Feature' and 'Importance' columns the chart reads.
    .rename_axis('Feature')
    .reset_index(name='Importance')
)

top_features = feature_importances.head(10)

print("\nTop 10 Feature Importances:")
print(top_features.to_markdown(numalign="left", stralign="left"))

# Visualize Feature Importance.
# sort='-y' orders the bars by descending importance; without it the nominal
# x-axis falls back to Altair's default ordering of the feature names, which
# hides the ranking.
chart = alt.Chart(top_features).mark_bar().encode(
    x=alt.X('Feature:N', sort='-y', axis=alt.Axis(title='Feature', labelAngle=-45)),
    y=alt.Y('Importance:Q', axis=alt.Axis(title='Importance')),
    tooltip=['Feature', 'Importance']
).properties(
    title='Top 10 Feature Importances'
).interactive()

# Persist the chart specification as JSON
chart.save('feature_importances_regression.json')


First 5 rows:
| EmpNumber   | Age   | Gender   | EducationBackground   | MaritalStatus   | EmpDepartment   | EmpJobRole      | BusinessTravelFrequency   | DistanceFromHome   | EmpEducationLevel   | EmpEnvironmentSatisfaction   | EmpHourlyRate   | EmpJobInvolvement   | EmpJobLevel   | EmpJobSatisfaction   | NumCompaniesWorked   | OverTime   | EmpLastSalaryHikePercent   | EmpRelationshipSatisfaction   | TotalWorkExperienceInYears   | TrainingTimesLastYear   | EmpWorkLifeBalance   | ExperienceYearsAtThisCompany   | ExperienceYearsInCurrentRole   | YearsSinceLastPromotion   | YearsWithCurrManager   | Attrition   | PerformanceRating   |
|:------------|:------|:---------|:----------------------|:----------------|:----------------|:----------------|:--------------------------|:-------------------|:--------------------|:-----------------------------|:----------------|:--------------------|:--------------|:---------------------|:---------------------|:-----------|:---------------------------|:-

RandomForestClassifier(random_state=0)

In [4]:
# Score the held-out rows with the fitted model
y_pred = model.predict(X_test)

print("\nModel Evaluation:")
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
# Precision and recall share the same call shape (average="weighted"), so they
# are reported through one loop, in the same order as before
weighted_scorers = (('Precision', precision_score), ('Recall', recall_score))
for label, scorer in weighted_scorers:
    print(f'{label}: {scorer(y_test, y_pred, average="weighted"):.2f}')
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature Importance: a Series of the model's importances labelled by feature
# column and ordered from largest to smallest
importances = model.feature_importances_
feature_importances = pd.Series(data=importances, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)


Model Evaluation:
Accuracy: 0.91
Precision: 0.91
Recall: 0.91

Classification Report:
               precision    recall  f1-score   support

           2       0.92      0.90      0.91        52
           3       0.91      0.98      0.94       266
           4       0.88      0.50      0.64        42

    accuracy                           0.91       360
   macro avg       0.90      0.79      0.83       360
weighted avg       0.91      0.91      0.90       360



In [5]:
# Reset the index of the Series to have valid column names.
# The ranked Series is rebuilt from the fitted model here instead of
# re-assigning feature_importances from itself, so running this cell more than
# once always yields the same two-column frame (feature name, importance)
# rather than stacking an extra index column on each run.
feature_importances = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .reset_index()
)

In [6]:
# Give the two columns descriptive labels for the table and the chart
feature_importances = feature_importances.set_axis(['Feature', 'Importance'], axis=1)

# Show only the ten highest-ranked rows
top_ten = feature_importances.head(10)
print("\nTop 10 Feature Importances:")
print(top_ten.to_markdown(numalign="left", stralign="left"))


Top 10 Feature Importances:
|    | Feature                      | Importance   |
|:---|:-----------------------------|:-------------|
| 0  | EmpEnvironmentSatisfaction   | 0.167351     |
| 1  | EmpLastSalaryHikePercent     | 0.166928     |
| 2  | YearsSinceLastPromotion      | 0.0792674    |
| 3  | ExperienceYearsInCurrentRole | 0.0416873    |
| 4  | EmpHourlyRate                | 0.0401247    |
| 5  | Age                          | 0.0368198    |
| 6  | DistanceFromHome             | 0.0351398    |
| 7  | TotalWorkExperienceInYears   | 0.0344292    |
| 8  | ExperienceYearsAtThisCompany | 0.0322386    |
| 9  | YearsWithCurrManager         | 0.0309933    |


In [10]:
# Visualize Feature Importance for the ten highest-ranked features.
# sort='-y' orders the bars by descending importance; without it the nominal
# x-axis falls back to Altair's default ordering of the feature names, which
# hides the ranking.
top_features = feature_importances[:10]
chart = alt.Chart(top_features).mark_bar().encode(
    x=alt.X('Feature:N', sort='-y', axis=alt.Axis(title='Feature', labelAngle=-45)),
    y=alt.Y('Importance:Q', axis=alt.Axis(title='Importance')),
    tooltip=['Feature', 'Importance']
).properties(
    title='Top 10 Feature Importances'
).interactive()

# Persist the chart specification as JSON, then render it in the notebook
chart.save('feature_importances.json')
chart.display()