In [3]:
# Install xgboost if not already installed
!pip install xgboost




Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 217.9 kB/s eta 0:09:34
   ---------------------------------------- 0.0/124.9 MB 245.8 kB/s eta 0:08:29
   ---------------------------------------- 0.3/124.9 MB 1.5 MB/s eta 0:01:21
   ---------------------------------------- 0.7/124.9 MB 3.1 MB/s eta 0:00:41
   ---------------------------------------- 1.1/124.9 MB 4.0 MB/s eta 0:00:31
   ---------------------------------------- 1.5/124.9 MB 4.9 MB/s eta 0:00:26
    --------------------------------------- 1.9/124.9 MB 5.5 MB/s eta 0:00:23
    --------------------------------------- 2.3/124.9 MB 5.9 MB/s eta 0:00:21
    -------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler


In [5]:
# Load the fact table (the prepared report data)
fact_table = pd.read_csv('fact_table.csv')

# Display the first few rows to inspect the data
fact_table.head()


Unnamed: 0,userId,total_courses_completed,total_time_spent,average_score,role,department,total_feedback,latest_feedback_time
0,1,17,1982,54.105263,admin,testing,6.0,2024-10-11 01:28:22
1,2,10,982,44.545455,user,devops,5.0,2024-08-30 17:22:19
2,3,12,1339,42.583333,user,devops,4.0,2024-10-12 15:55:43
3,4,13,1752,56.6,admin,full-stack,5.0,2024-10-01 12:30:07
4,5,11,1505,56.0,user,full-stack,5.0,2024-09-23 10:12:35


In [6]:
# Feature Engineering: Create high/low engagement label
def feature_engineering(df):
    # Define high/low engagement based on total_time_spent
    df['engagement_label'] = df['total_time_spent'].apply(lambda x: 1 if x > df['total_time_spent'].median() else 0)
    return df

# Apply feature engineering
fact_table = feature_engineering(fact_table)

# Display the updated DataFrame
fact_table.head()


Unnamed: 0,userId,total_courses_completed,total_time_spent,average_score,role,department,total_feedback,latest_feedback_time,engagement_label
0,1,17,1982,54.105263,admin,testing,6.0,2024-10-11 01:28:22,1
1,2,10,982,44.545455,user,devops,5.0,2024-08-30 17:22:19,0
2,3,12,1339,42.583333,user,devops,4.0,2024-10-12 15:55:43,0
3,4,13,1752,56.6,admin,full-stack,5.0,2024-10-01 12:30:07,1
4,5,11,1505,56.0,user,full-stack,5.0,2024-09-23 10:12:35,0


In [7]:
# Model Building: Predict High/Low Engaged Employees
def build_predictive_model(df):
    # Define features and target variable
    X = df[['total_courses_completed', 'total_time_spent', 'average_score']]
    y = df['engagement_label']

    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # Model: RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Model: XGBoost Classifier
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)

    # Random Forest Prediction
    rf_pred = rf_model.predict(X_test)

    # XGBoost Prediction
    xgb_pred = xgb_model.predict(X_test)

    # Evaluate both models
    print("\nRandom Forest Classifier Report:\n", classification_report(y_test, rf_pred))
    print("\nConfusion Matrix - Random Forest:\n", confusion_matrix(y_test, rf_pred))

    print("\nXGBoost Classifier Report:\n", classification_report(y_test, xgb_pred))
    print("\nConfusion Matrix - XGBoost:\n", confusion_matrix(y_test, xgb_pred))

# Call the function to build and evaluate the model
build_predictive_model(fact_table)



Random Forest Classifier Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.96      0.98        27

    accuracy                           0.98        60
   macro avg       0.99      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60


Confusion Matrix - Random Forest:
 [[33  0]
 [ 1 26]]

XGBoost Classifier Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94        33
           1       1.00      0.85      0.92        27

    accuracy                           0.93        60
   macro avg       0.95      0.93      0.93        60
weighted avg       0.94      0.93      0.93        60


Confusion Matrix - XGBoost:
 [[33  0]
 [ 4 23]]
