In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Suppress warnings for cleaner output during execution
import warnings
warnings.filterwarnings('ignore')

In [29]:
# --- Install missing libraries ---
# These commands will attempt to install the necessary libraries if they are not already present.
# If running in a non-interactive environment (e.g., a standard Python script),
# you might need to run 'pip install <library_name>' manually in your terminal/command prompt.
try:
    import xgboost
except ImportError:
    print("xgboost not found. Attempting to install...")
    try:
        !pip install xgboost
        import xgboost # Try importing again after installation
        print("xgboost installed successfully.")
    except Exception as e:
        print(f"Failed to install xgboost: {e}")
        print("Please try running 'pip install xgboost' in your terminal/command prompt.")
        # Exit or handle gracefully if installation fails and model cannot proceed
        # For this script, we'll continue, but subsequent model steps will fail if imports aren't resolved.


try:
    import catboost
except ImportError:
    print("catboost not found. Attempting to install...")
    try:
        !pip install catboost
        import catboost # Try importing again after installation
        print("catboost installed successfully.")
    except Exception as e:
        print(f"Failed to install catboost: {e}")
        print("Please try running 'pip install catboost' in your terminal/command prompt.")


try:
    import lightgbm
except ImportError:
    print("lightgbm not found. Attempting to install...")
    try:
        !pip install lightgbm
        import lightgbm # Try importing again after installation
        print("lightgbm installed successfully.")
    except Exception as e:
        print(f"Failed to install lightgbm: {e}")
        print("Please try running 'pip install lightgbm' in your terminal/command prompt.")



xgboost not found. Attempting to install...
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 1.9 MB/s eta 0:01:20
   ---------------------------------------- 1.0/150.0 MB 2.0 MB/s eta 0:01:14
   ---------------------------------------- 1.6/150.0 MB 2.1 MB/s eta 0:01:11
    --------------------------------------- 2.1/150.0 MB 2.2 MB/s eta 0:01:08
    --------------------------------------- 2.6/150.0 MB 2.2 MB/s eta 0:01:07
    --------------------------------------- 3.1/150.0 MB 2.3 MB/s eta 0:01:03
    --------------------------------------- 3.7/150.0 MB 2.4 MB/s eta 0:01:01
   - -------------------------------------- 4.5/1

### --- 1. Data Loading ---

In [33]:
print("--- 1. Data Loading ---")
try:
    df = pd.read_csv('Test_data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'Test_data.csv' not found. Please make sure the file is in the same directory as the script.")
    exit()

--- 1. Data Loading ---
Dataset loaded successfully.


### --- 2. Data Preprocessing ---

In [36]:
print("\n--- 2. Data Preprocessing ---")

# Drop 'employee_id' as it's an identifier
df = df.drop('employee_id', axis=1)

# Handle missing values for 'previous_year_rating'
if 'previous_year_rating' in df.columns and df['previous_year_rating'].isnull().any():
    mode_rating = df['previous_year_rating'].mode()[0]
    df['previous_year_rating'] = df['previous_year_rating'].fillna(mode_rating)
    print(f"Missing values in 'previous_year_rating' imputed with mode: {mode_rating}")
else:
    print("'previous_year_rating' column is either not present or has no missing values.")

# Define features (X) and target (y)
# The target variable is 'KPIs_met >80%'
X = df.drop('KPIs_met >80%', axis=1)
y = df['KPIs_met >80%']

print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")
print(f"Target variable value counts:\n{y.value_counts()}")
print(f"Target variable percentage:\n{y.value_counts(normalize=True) * 100}")

# Identify categorical and numerical features for encoding
categorical_features = X.select_dtypes(include='object').columns
numerical_features = X.select_dtypes(include=np.number).columns

# Create a column transformer for preprocessing
# One-hot encode categorical features, pass numerical features without transformation
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Split data into training and testing sets (stratified for target variable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTraining set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")



--- 2. Data Preprocessing ---
Missing values in 'previous_year_rating' imputed with mode: 3.0
Shape of features (X): (23490, 11)
Shape of target (y): (23490,)
Target variable value counts:
KPIs_met >80%
0    15061
1     8429
Name: count, dtype: int64
Target variable percentage:
KPIs_met >80%
0    64.116645
1    35.883355
Name: proportion, dtype: float64

Training set shape: (18792, 11), (18792,)
Test set shape: (4698, 11), (4698,)


### --- 3. Model Training and Evaluation ---

In [38]:
print("\n--- 3. Model Training and Evaluation ---")

# Define a function to evaluate model performance
def evaluate_model(model_name, y_true, y_pred, y_prob=None):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else 'N/A'
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n--- {model_name} Performance ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}" if y_prob is not None else "ROC AUC Score: N/A")
    print("Confusion Matrix:\n", cm)

    # Plot Confusion Matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.savefig(f'{model_name}_confusion_matrix.png')
    plt.close()
    print(f"Confusion Matrix saved as '{model_name}_confusion_matrix.png'")




--- 3. Model Training and Evaluation ---


### --- XGBoost Model ---

In [40]:
print("\nBuilding XGBoost Model...")
# Ensure imports are within a try-except block in case installation fails
try:
    from xgboost import XGBClassifier
    xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42))])
    xgb_pipeline.fit(X_train, y_train)
    y_pred_xgb = xgb_pipeline.predict(X_test)
    y_prob_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]
    evaluate_model("XGBoost", y_test, y_pred_xgb, y_prob_xgb)
except ImportError:
    print("XGBoost model skipped due to missing library. Please install it to run this part.")
except Exception as e:
    print(f"An error occurred during XGBoost model training/evaluation: {e}")


Building XGBoost Model...

--- XGBoost Performance ---
Accuracy: 0.6950
Precision: 0.6041
Recall: 0.4353
F1-Score: 0.5060
ROC AUC Score: 0.7133
Confusion Matrix:
 [[2531  481]
 [ 952  734]]
Confusion Matrix saved as 'XGBoost_confusion_matrix.png'


### --- CatBoost Model ---

In [43]:
print("\nBuilding CatBoost Model...")
try:
    from catboost import CatBoostClassifier
    cat_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', CatBoostClassifier(iterations=100,
                                                                     random_seed=42,
                                                                     verbose=False,
                                                                     eval_metric='Accuracy',
                                                                     custom_metric=['Precision', 'Recall', 'F1', 'AUC']
                                                                    ))])
    cat_pipeline.fit(X_train, y_train)
    y_pred_cat = cat_pipeline.predict(X_test)
    y_prob_cat = cat_pipeline.predict_proba(X_test)[:, 1]
    evaluate_model("CatBoost", y_test, y_pred_cat, y_prob_cat)
except ImportError:
    print("CatBoost model skipped due to missing library. Please install it to run this part.")
except Exception as e:
    print(f"An error occurred during CatBoost model training/evaluation: {e}")


Building CatBoost Model...

--- CatBoost Performance ---
Accuracy: 0.7031
Precision: 0.6228
Recall: 0.4377
F1-Score: 0.5141
ROC AUC Score: 0.7220
Confusion Matrix:
 [[2565  447]
 [ 948  738]]
Confusion Matrix saved as 'CatBoost_confusion_matrix.png'


### --- LightGBM Model ---

In [46]:
print("\nBuilding LightGBM Model...")
try:
    from lightgbm import LGBMClassifier
    lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', LGBMClassifier(random_state=42))])
    lgbm_pipeline.fit(X_train, y_train)
    y_pred_lgbm = lgbm_pipeline.predict(X_test)
    y_prob_lgbm = lgbm_pipeline.predict_proba(X_test)[:, 1]
    evaluate_model("LightGBM", y_test, y_pred_lgbm, y_prob_lgbm)
except ImportError:
    print("LightGBM model skipped due to missing library. Please install it to run this part.")
except Exception as e:
    print(f"An error occurred during LightGBM model training/evaluation: {e}")


print("\nAll models trained and evaluated. Performance metrics and confusion matrices are displayed and saved.")


Building LightGBM Model...
[LightGBM] [Info] Number of positive: 6743, number of negative: 12049
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.358823 -> initscore=-0.580477
[LightGBM] [Info] Start training from score -0.580477

--- LightGBM Performance ---
Accuracy: 0.7020
Precision: 0.6246
Recall: 0.4253
F1-Score: 0.5060
ROC AUC Score: 0.7218
Confusion Matrix:
 [[2581  431]
 [ 969  717]]
Confusion Matrix saved as 'LightGBM_confusion_matrix.png'

All models trained and evaluated. Performance metrics and confusion matrices are displayed and saved.
