In [None]:
!pip install xgboost
!pip install imblearn

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.2.post1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.26.2.post1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (291.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.7/291.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.2.post1 xgboost-3.0.0
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-an

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run generates the same dataset
np.random.seed(42)

# Dataset size and target class mix
n_samples = 20000
class_ratio = 0.6  # share of Class 1; the remaining 40% is Class 0
n_class_1 = int(n_samples * class_ratio)
n_class_0 = n_samples - n_class_1

# Build the label vector (all 1s followed by all 0s) and shuffle it with a
# fixed seed before it goes into the frame
ordered_labels = pd.Series(
    np.concatenate([np.ones(n_class_1, dtype=int), np.zeros(n_class_0, dtype=int)])
)
shuffled_labels = ordered_labels.sample(frac=1, random_state=42).to_numpy()

# One row per employee: a sequential ID plus the shuffled label
df = pd.DataFrame({
    'Employee_ID': range(1, n_samples + 1),
    'High_Performance': shuffled_labels,
})

# First pass over the numeric features. Every column is drawn from a clipped
# normal whose parameters depend on the row's class, with the two classes
# deliberately overlapping.
# Layout: column -> ((mean, std, min, max) for Class 1, (mean, std, min, max) for Class 0)
initial_numeric_specs = {
    'Post_Training_Score': ((84, 8, 80, 100), (70, 10, 50, 79)),
    'Pre_Training_Score': ((70, 10, 50, 90), (64, 10, 40, 80)),
    'Engagement (hrs)': ((13, 4, 7, 20), (10, 4, 5, 18)),
    'Manager_Support_Rating': ((3.7, 0.8, 2, 5), (3.0, 0.9, 1, 4.8)),
    'Sentiment_Score': ((0.7, 0.2, 0.4, 1.0), (0.5, 0.25, 0, 0.9)),
}
is_high_performer = df['High_Performance'] == 1
for column_name, (class1_spec, class0_spec) in initial_numeric_specs.items():
    # A full-length sample is drawn for Class 1 first and then for Class 0;
    # keeping that order keeps the random stream identical from run to run.
    c1_mean, c1_std, c1_min, c1_max = class1_spec
    class1_draw = np.clip(np.random.normal(c1_mean, c1_std, n_samples), c1_min, c1_max)
    c0_mean, c0_std, c0_min, c0_max = class0_spec
    class0_draw = np.clip(np.random.normal(c0_mean, c0_std, n_samples), c0_min, c0_max)
    df[column_name] = np.where(is_high_performer, class1_draw, class0_draw)

# Jitter the sentiment score slightly, then force it back into [0, 1]
sentiment_noise = np.random.normal(0, 0.05, n_samples)
df['Sentiment_Score'] = np.clip(df['Sentiment_Score'] + sentiment_noise, 0, 1)

# Feedback is 'Positive' once the sentiment score reaches 0.6
has_positive_sentiment = df['Sentiment_Score'] >= 0.6
df['Feedback'] = np.where(has_positive_sentiment, 'Positive', 'Negative')

# Category levels, each paired with a table of two probabilities per level.
# The tables are consumed later by the label-adjustment step, where they are
# passed as the probabilities of drawing [1, 0].
departments = ['IT', 'Sales', 'HR', 'Operations', 'Finance', 'Marketing']
dept_probs = dict(
    IT=[0.65, 0.35],
    Sales=[0.6, 0.4],
    HR=[0.55, 0.45],
    Operations=[0.6, 0.4],
    Finance=[0.55, 0.45],
    Marketing=[0.6, 0.4],
)

training_programs = ['Tech', 'Leadership', 'Sales', 'Operations', 'General']
train_probs = dict(
    Tech=[0.7, 0.3],
    Leadership=[0.65, 0.35],
    Sales=[0.6, 0.4],
    Operations=[0.6, 0.4],
    General=[0.5, 0.5],
)

learning_styles = ['Visual', 'Auditory', 'Kinesthetic']
learn_probs = dict(
    Visual=[0.6, 0.4],
    Auditory=[0.55, 0.45],
    Kinesthetic=[0.5, 0.5],
)

difficulties = ['Easy', 'Medium', 'Hard']
diff_probs = dict(
    Easy=[0.7, 0.3],
    Medium=[0.55, 0.45],
    Hard=[0.45, 0.55],
)

# Assign each categorical column uniformly at random (every level equally
# likely), in a fixed column order so the random stream stays reproducible.
for column_name, levels in (
    ('Department', departments),
    ('Training_Program', training_programs),
    ('Learning_Style', learning_styles),
    ('Training_Difficulty', difficulties),
):
    uniform_weights = [1 / len(levels)] * len(levels)
    df[column_name] = np.random.choice(levels, n_samples, p=uniform_weights)

# Generate new columns with increased overlap. Each np.where draws a full
# Class 1 sample and a full Class 0 sample (in that order) and picks one per
# row according to the current High_Performance label.
df['Trainer_Quality'] = np.where(
    df['High_Performance'] == 1,
    np.clip(np.random.normal(3.7, 0.8, n_samples), 2, 5),
    np.clip(np.random.normal(3.0, 0.9, n_samples), 1, 4.8)
)

df['Engagement_Support_Score'] = np.where(
    df['High_Performance'] == 1,
    np.clip(np.random.normal(45, 15, n_samples), 15, 80),
    np.clip(np.random.normal(30, 12, n_samples), 5, 60)
)

df['Score_Progress_Indicator'] = np.where(
    df['High_Performance'] == 1,
    np.clip(np.random.normal(0.4, 0.2, n_samples), 0.1, 0.8),
    np.clip(np.random.normal(0.3, 0.2, n_samples), 0, 0.6)
)

# Training_Effectiveness and Motivation_Index used to be derived first from
# Trainer_Quality / Training_Difficulty and from Sentiment_Score /
# Learning_Style, and were then immediately overwritten by the
# class-conditional draws below. The derived values were never read and
# consumed no random numbers, so those dead assignments were removed; the
# resulting columns and the random stream are unchanged.
df['Training_Effectiveness'] = np.where(
    df['High_Performance'] == 1,
    np.clip(np.random.normal(3.8, 0.9, n_samples), 2, 5.5),
    np.clip(np.random.normal(2.7, 1.0, n_samples), 1, 4.5)
)

df['Motivation_Index'] = np.where(
    df['High_Performance'] == 1,
    np.clip(np.random.normal(3.6, 0.8, n_samples), 2, 5),
    np.clip(np.random.normal(2.5, 1.0, n_samples), 0, 4.5)
)

# Add noise to Motivation_Index and keep it inside [0, 5]
df['Motivation_Index'] += np.random.normal(0, 0.1, n_samples)
df['Motivation_Index'] = np.clip(df['Motivation_Index'], 0, 5)

# Adjust High_Performance based on categorical features.
#
# Every row has a value in each of the four categorical columns, so redrawing
# the labels one column at a time lets the last column overwrite everything the
# earlier columns did. To let all four tables contribute, look up the first
# entry of each table (the probability of label 1) for every row, average the
# four values, and draw each label once from that blended probability.
categorical_prob_tables = [
    ('Department', dept_probs),
    ('Training_Program', train_probs),
    ('Learning_Style', learn_probs),
    ('Training_Difficulty', diff_probs),
]
p_high = np.mean(
    [
        df[column].map({level: pair[0] for level, pair in table.items()}).to_numpy(dtype=float)
        for column, table in categorical_prob_tables
    ],
    axis=0,
)
df['High_Performance'] = (np.random.rand(n_samples) < p_high).astype(int)

# Ensure class balance: flip exactly as many randomly chosen rows as are needed
# to reach n_class_1 positives. Sampling row labels without replacement hits
# the target count exactly instead of only in expectation.
class_counts = df['High_Performance'].value_counts()
n_high = int(class_counts.get(1, 0))
if n_high < n_class_1:
    candidates = df.index[df['High_Performance'] == 0]
    flip_idx = np.random.choice(candidates, size=n_class_1 - n_high, replace=False)
    df.loc[flip_idx, 'High_Performance'] = 1
elif n_high > n_class_1:
    candidates = df.index[df['High_Performance'] == 1]
    flip_idx = np.random.choice(candidates, size=n_high - n_class_1, replace=False)
    df.loc[flip_idx, 'High_Performance'] = 0

# The labels are final at this point, so redraw every label-dependent numeric
# feature to match them.
# Rows: (column, Class 1 (mean, std, min, max), Class 0 (mean, std, min, max)).
# The row order fixes the order of the random draws and must not be changed.
final_feature_specs = [
    ('Post_Training_Score', (84, 8, 80, 100), (70, 10, 50, 79)),
    ('Pre_Training_Score', (70, 10, 50, 90), (64, 10, 40, 80)),
    ('Engagement (hrs)', (13, 4, 7, 20), (10, 4, 5, 18)),
    ('Manager_Support_Rating', (3.7, 0.8, 2, 5), (3.0, 0.9, 1, 4.8)),
    ('Sentiment_Score', (0.7, 0.2, 0.4, 1.0), (0.5, 0.25, 0, 0.9)),
    ('Trainer_Quality', (3.7, 0.8, 2, 5), (3.0, 0.9, 1, 4.8)),
    ('Engagement_Support_Score', (45, 15, 15, 80), (30, 12, 5, 60)),
    ('Score_Progress_Indicator', (0.4, 0.2, 0.1, 0.8), (0.3, 0.2, 0, 0.6)),
    ('Training_Effectiveness', (3.8, 0.9, 2, 5.5), (2.7, 1.0, 1, 4.5)),
    ('Motivation_Index', (3.6, 0.8, 2, 5), (2.5, 1.0, 0, 4.5)),
]
final_is_high = df['High_Performance'] == 1
for feature_name, (mu1, sd1, lo1, hi1), (mu0, sd0, lo0, hi0) in final_feature_specs:
    # Class 1 candidates are drawn before Class 0 candidates for every column
    class1_values = np.clip(np.random.normal(mu1, sd1, n_samples), lo1, hi1)
    class0_values = np.clip(np.random.normal(mu0, sd0, n_samples), lo0, hi0)
    df[feature_name] = np.where(final_is_high, class1_values, class0_values)

# Feedback follows from the refreshed sentiment score (no randomness involved)
df['Feedback'] = np.where(df['Sentiment_Score'] >= 0.6, 'Positive', 'Negative')

# Final column layout: identifier first, target last
column_order = [
    'Employee_ID',
    'Department',
    'Training_Program',
    'Pre_Training_Score',
    'Post_Training_Score',
    'Feedback',
    'Engagement (hrs)',
    'Sentiment_Score',
    'Manager_Support_Rating',
    'Learning_Style',
    'Training_Difficulty',
    'Trainer_Quality',
    'Engagement_Support_Score',
    'Score_Progress_Indicator',
    'Training_Effectiveness',
    'Motivation_Index',
    'High_Performance',
]
df = df[column_order]

# Write the dataset to disk (without the index) and summarise what was saved
df.to_csv('employee_training_data_large.csv', index=False)
class_shares = df['High_Performance'].value_counts(normalize=True)
print("Balanced dataset with realistic noise generated and saved to 'employee_training_data_large.csv'")
print(f"Dataset shape: {df.shape}")
print(f"Class distribution: \n{class_shares}")
print("Columns:", list(df.columns))

Balanced dataset with realistic noise generated and saved to 'employee_training_data_large.csv'
Dataset shape: (20000, 17)
Class distribution: 
High_Performance
1    0.6001
0    0.3999
Name: proportion, dtype: float64
Columns: ['Employee_ID', 'Department', 'Training_Program', 'Pre_Training_Score', 'Post_Training_Score', 'Feedback', 'Engagement (hrs)', 'Sentiment_Score', 'Manager_Support_Rating', 'Learning_Style', 'Training_Difficulty', 'Trainer_Quality', 'Engagement_Support_Score', 'Score_Progress_Indicator', 'Training_Effectiveness', 'Motivation_Index', 'High_Performance']


In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder

# Read the generated dataset back from disk
df = pd.read_csv('employee_training_data_large.csv')

# Model inputs, listed in the column order the models will see
features = [
    'Department',
    'Training_Program',
    'Pre_Training_Score',
    'Engagement (hrs)',
    'Sentiment_Score',
    'Manager_Support_Rating',
    'Learning_Style',
    'Training_Difficulty',
    'Feedback',
    'Trainer_Quality',
    'Engagement_Support_Score',
    'Score_Progress_Indicator',
    'Training_Effectiveness',
    'Motivation_Index',
]
categorical_cols = ['Department', 'Training_Program', 'Learning_Style', 'Training_Difficulty', 'Feedback']
# Every feature that is not categorical is numeric (same order as in `features`)
numerical_cols = [column for column in features if column not in categorical_cols]

# Targets: a continuous score for regression, a binary flag for classification
reg_target = 'Post_Training_Score'
class_target = 'High_Performance'

# Keep only rows in which every feature and both targets are present
required_cols = features + [reg_target, class_target]
df = df.dropna(subset=required_cols)

# A single 80/20 split shared by both tasks, so the regression and the
# classification models are evaluated on the same held-out rows
X = df[features]
y_reg = df[reg_target]
y_class = df[class_target]
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_class_train, y_class_test,
) = train_test_split(X, y_reg, y_class, test_size=0.2, random_state=42)

def make_preprocessor():
    """Return a fresh, unfitted preprocessing pipeline.

    The pipeline target-encodes the columns in ``categorical_cols`` and then
    standardises all features. A new instance is built on every call so that
    each model pipeline owns its encoder and scaler.
    """
    return Pipeline([
        ('encoder', TargetEncoder(cols=categorical_cols)),
        ('scaler', StandardScaler())
    ])


def _with_preprocessing(step_name, estimator):
    """Wrap ``estimator`` in a pipeline with its own preprocessing step."""
    return Pipeline([
        ('preprocessor', make_preprocessor()),
        (step_name, estimator)
    ])


# Preprocessing pipeline (kept under its original name for code that refers to
# it). The model pipelines below no longer share this instance: a Pipeline
# keeps the step objects it is given, so one shared preprocessor would be
# refitted - and silently changed inside every other model - each time any
# model is fitted.
preprocessor = make_preprocessor()

# Define regression models
reg_models = {
    'Linear Regression': _with_preprocessing('regressor', LinearRegression()),
    'Random Forest Regressor': _with_preprocessing('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )),
    'Gradient Boosting Regressor': _with_preprocessing('regressor', GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )),
}

# Define classification models
class_models = {
    'Logistic Regression': _with_preprocessing(
        'classifier', LogisticRegression(C=1.0, max_iter=1000, random_state=42)
    ),
    'Random Forest Classifier': _with_preprocessing('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )),
    'XGBoost Classifier': _with_preprocessing('classifier', XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        scale_pos_weight=1,
        eval_metric='logloss',
        random_state=42
    )),
}

# Fit every regressor on the training split, score it on the held-out split,
# and collect the feature importances exposed by the two tree ensembles
reg_results = []
reg_feature_importances = []
regressors_with_importances = ('Random Forest Regressor', 'Gradient Boosting Regressor')
for name, model in reg_models.items():
    model.fit(X_train, y_reg_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_reg_test, y_pred)
    mae = mean_absolute_error(y_reg_test, y_pred)
    mse = mean_squared_error(y_reg_test, y_pred)
    reg_results.append(dict(zip(('Model', 'R2 Score', 'MAE', 'MSE'), (name, r2, mae, mse))))

    if name not in regressors_with_importances:
        continue
    # Importances are paired with `features` by position
    importances = model.named_steps['regressor'].feature_importances_
    reg_feature_importances.extend(
        {'Model': name, 'Feature': feature, 'Importance': importance}
        for feature, importance in zip(features, importances)
    )

# Fit every classifier, report accuracy and per-class precision/recall/F1 on
# the held-out split, and collect importances from the tree-based models
class_results = []
class_feature_importances = []
classifiers_with_importances = ('Random Forest Classifier', 'XGBoost Classifier')
for name, model in class_models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_class_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_class_test, y_pred)
    if accuracy < 0.95:
        print(f"Warning: {name} accuracy ({accuracy:.4f}) is below 95%.")

    # The dict form of the report is read under the string keys '0' and '1'
    report = classification_report(y_class_test, y_pred, output_dict=True, zero_division=0)
    result_row = {'Model': name, 'Accuracy': accuracy}
    for label in ('0', '1'):
        label_stats = report[label]
        result_row[f'Precision_{label}'] = label_stats['precision']
        result_row[f'Recall_{label}'] = label_stats['recall']
        result_row[f'F1_{label}'] = label_stats['f1-score']
        result_row[f'Support_{label}'] = label_stats['support']
    class_results.append(result_row)

    if name not in classifiers_with_importances:
        continue
    # Importances are paired with `features` by position
    importances = model.named_steps['classifier'].feature_importances_
    class_feature_importances.extend(
        {'Model': name, 'Feature': feature, 'Importance': importance}
        for feature, importance in zip(features, importances)
    )

# Turn the collected records into one DataFrame per result type
reg_results_df, class_results_df, reg_importances_df, class_importances_df = (
    pd.DataFrame(records)
    for records in (reg_results, class_results, reg_feature_importances, class_feature_importances)
)

# Stack all four tables into a single CSV, with an empty row between sections
blank_row = pd.DataFrame([{}])
csv_sections = [
    reg_results_df,
    blank_row,
    class_results_df,
    blank_row,
    reg_importances_df,
    blank_row,
    class_importances_df,
]
output_df = pd.concat(csv_sections, ignore_index=True)
output_df.to_csv('model_results.csv', index=False)

# Print the two performance tables as plain text
print("Regression Model Performance:")
print(reg_results_df.to_string(index=True))
print("\nClassification Model Performance:")
print(class_results_df.to_string(index=True))
print("\nResults and feature importances saved to 'model_results.csv'")

Training Logistic Regression...
Training Random Forest Classifier...
Training XGBoost Classifier...
Regression Model Performance:
                         Model  R2 Score       MAE        MSE
0            Linear Regression  0.365745  6.331399  66.723147
1      Random Forest Regressor  0.455646  5.922787  57.265536
2  Gradient Boosting Regressor  0.473488  5.924666  55.388632

Classification Model Performance:
                      Model  Accuracy  Precision_0  Recall_0      F1_0  Support_0  Precision_1  Recall_1      F1_1  Support_1
0       Logistic Regression   0.93200     0.921656  0.906642  0.914087     1596.0     0.938683  0.948835  0.943732     2404.0
1  Random Forest Classifier   0.95825     0.982444  0.911654  0.945726     1596.0     0.944025  0.989185  0.966078     2404.0
2        XGBoost Classifier   0.97325     0.980013  0.952381  0.965999     1596.0     0.968967  0.987105  0.977952     2404.0

Results and feature importances saved to 'model_results.csv'


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Classification metrics, hard-coded from the output of the training cell
class_data = {
    'Model': ['Logistic Regression', 'Random Forest Classifier', 'XGBoost Classifier'],
    'Accuracy': [0.93200, 0.95825, 0.97325],
    'Precision_0': [0.921656, 0.982444, 0.980013],
    'Recall_0': [0.906642, 0.911654, 0.952381],
    'F1_0': [0.914087, 0.945726, 0.965999],
    'Precision_1': [0.938683, 0.944025, 0.968967],
    'Recall_1': [0.948835, 0.989185, 0.987105],
    'F1_1': [0.943732, 0.966078, 0.977952],
    'Support_0': [1596.0, 1596.0, 1596.0],
    'Support_1': [2404.0, 2404.0, 2404.0],
}
class_df = pd.DataFrame(class_data)

# Regression metrics, hard-coded from the same output
reg_data = {
    'Model': ['Linear Regression', 'Random Forest Regressor', 'Gradient Boosting Regressor'],
    'R2 Score': [0.365745, 0.455646, 0.473488],
    'MAE': [6.331399, 5.922787, 5.924666],
    'MSE': [66.723147, 57.265536, 55.388632],
}
reg_df = pd.DataFrame(reg_data)

# Grouped bar chart: one group per metric, one bar per model
metrics = ['Accuracy', 'Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1']
models = class_df['Model']
n_metrics = len(metrics)
n_models = len(models)

fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.25
index = np.arange(n_metrics)

# Row i of the score matrix holds model i's metrics; shift each model's bars
# one bar width to the right of the previous model's
score_matrix = class_df[metrics].to_numpy()
for i, (model, scores) in enumerate(zip(models, score_matrix)):
    ax.bar(index + i * bar_width, scores, bar_width, label=model)

# Labels, tick positions centred under each group, and a zoomed y-range
ax.set(
    xlabel='Metrics',
    ylabel='Score',
    title='Classification Model Performance (Class 0: 1,596 samples, Class 1: 2,404 samples)',
)
group_centres = index + bar_width * (n_models - 1) / 2
ax.set_xticks(group_centres)
ax.set_xticklabels(metrics, rotation=45)
ax.legend()
ax.set_ylim(0.85, 1.0)  # Focus on high scores
plt.tight_layout()

# Write the figure to disk and release it
plt.savefig('classification_metrics.png')
plt.close()

# Regression visualization
reg_metrics = ['R2 Score', 'MAE', 'MSE']
n_reg_metrics = len(reg_metrics)
# Bars per metric group. Counted from the regression table itself instead of
# reusing the classification model count, which only matched by coincidence.
n_reg_models = len(reg_df)


def _inverted_min_max(values):
    """Scale ``values`` to [0, 1] so that the smallest error maps to 1.

    If all values are identical there is no spread to scale by; every model
    then gets 1.0 instead of a division by zero.
    """
    spread = values.max() - values.min()
    if spread == 0:
        return values * 0 + 1.0
    return 1 - (values - values.min()) / spread


# Normalize MAE and MSE for visualization (lower error -> taller bar)
reg_df['MAE_scaled'] = _inverted_min_max(reg_df['MAE'])
reg_df['MSE_scaled'] = _inverted_min_max(reg_df['MSE'])

# Set up bar plot
fig, ax = plt.subplots(figsize=(10, 6))
index = np.arange(n_reg_metrics)

# Plot bars for each model, shifting each model by one bar width
plotted_columns = ['R2 Score', 'MAE_scaled', 'MSE_scaled']
for i, model in enumerate(reg_df['Model']):
    scores = reg_df.loc[reg_df['Model'] == model, plotted_columns].values.flatten()
    ax.bar(index + i * bar_width, scores, bar_width, label=model)

# Customize plot; ticks sit under the centre of each group of bars
ax.set_xlabel('Metrics')
ax.set_ylabel('Normalized Score')
ax.set_title('Regression Model Performance (R², Scaled MAE/MSE)')
ax.set_xticks(index + bar_width * (n_reg_models - 1) / 2)
ax.set_xticklabels(['R² Score', 'MAE (scaled)', 'MSE (scaled)'])
ax.legend()
ax.set_ylim(0, 1.1)
plt.tight_layout()

# Save plot
plt.savefig('regression_metrics.png')
plt.close()

print("Visualizations saved as 'classification_metrics.png' and 'regression_metrics.png'")

Visualizations saved as 'classification_metrics.png' and 'regression_metrics.png'


In [None]:
!pip install kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Define classification results from provided output
# Define the data from the provided results
reg_data = {
    'Model': ['Linear Regression', 'Random Forest Regressor', 'Gradient Boosting Regressor'],
    'R2 Score': [0.949084, 0.989685, 0.997908],
    'MAE': [1.707430, 0.664289, 0.349442],
    'MSE': [5.356332, 1.085133, 0.220067]
}
class_data = {
    'Model': ['Logistic Regression', 'Random Forest Classifier', 'XGBoost Classifier'],
    'Accuracy': [0.97775, 0.96625, 0.99275],
    'Precision_0': [0.975995, 0.994584, 0.995572],
    'Recall_0': [0.968045, 0.920426, 0.986216],
    'F1_0': [0.972004, 0.956069, 0.990872],
    'Support_0': [1596.0, 1596.0, 1596.0],
    'Precision_1': [0.978899, 0.949663, 0.990905],
    'Recall_1': [0.984193, 0.996672, 0.997088],
    'F1_1': [0.981539, 0.972600, 0.993987],
    'Support_1': [2404.0, 2404.0, 2404.0]
}
reg_df = pd.DataFrame(reg_data)

# Approximate confusion matrices based on recall and support
# Logistic Regression: Class 0 errors ~149, Class 1 errors ~123
# Random Forest: Class 0 errors ~141, Class 1 errors ~26
# XGBoost: Class 0 errors ~76, Class 1 errors ~31
cm_lr = np.array([[1447, 149], [123, 2281]])  # Logistic Regression
cm_rf = np.array([[1455, 141], [26, 2378]])   # Random Forest
cm_xgb = np.array([[1520, 76], [31, 2373]])   # XGBoost
cms = {
    'Logistic Regression': cm_lr,
    'Random Forest Classifier': cm_rf,
    'XGBoost Classifier': cm_xgb
}

# 1. Interactive classification bar plot
# Reshape to long form (one row per model/metric pair), which is the layout
# the grouped bar charts below are built from
metrics = ['Accuracy', 'Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1']
plot_data = pd.melt(
    class_df,
    id_vars='Model',
    value_vars=metrics,
    var_name='Metric',
    value_name='Score',
)

classification_bar_options = dict(
    x='Metric',
    y='Score',
    color='Model',
    barmode='group',
    title='Classification Model Performance (Class 0: 1,596, Class 1: 2,404)',
    color_discrete_sequence=px.colors.sequential.Plasma,
    height=600,
)
fig = px.bar(plot_data, **classification_bar_options)

# Zoom the y-axis into the range where the scores lie and apply the dark theme
classification_layout = dict(
    yaxis_range=[0.85, 1.0],
    xaxis_title='Metrics',
    yaxis_title='Score',
    legend_title='Model',
    template='plotly_dark',
)
fig.update_layout(**classification_layout)

# Export both an interactive HTML page and a static PNG
fig.write_html('classification_metrics.html')
fig.write_image('classification_metrics.png')

# 2. Confusion matrix heatmaps: one panel per model on a shared y-axis
fig = make_subplots(rows=1, cols=3, subplot_titles=list(cms), shared_yaxes=True)

heatmap_axes = dict(
    x=['Predicted 0', 'Predicted 1'],
    y=['True 0', 'True 1'],
)
for i, model in enumerate(cms, start=1):
    cm = cms[model]
    heatmap = go.Heatmap(
        z=cm,
        colorscale='Viridis',
        showscale=(i == 3),  # draw the colour bar only on the third panel
        text=cm,             # print the raw counts inside the cells
        texttemplate='%{text}',
        textfont=dict(size=12),
        **heatmap_axes,
    )
    fig.add_trace(heatmap, row=1, col=i)

confusion_layout = dict(
    title='Confusion Matrices for Classification Models',
    height=400,
    width=1200,
    template='plotly_dark',
)
fig.update_layout(**confusion_layout)
fig.write_html('confusion_matrices.html')
fig.write_image('confusion_matrices.png')

# 3. Radar chart: one filled polygon per model across all classification metrics
fig = go.Figure()

radar_scores = class_df[metrics].to_numpy()
for model, scores in zip(class_df['Model'], radar_scores):
    polygon = go.Scatterpolar(
        r=scores,
        theta=metrics,
        fill='toself',
        name=model,
        line=dict(width=2),
    )
    fig.add_trace(polygon)

# The radial axis uses the same zoomed range as the bar chart
radar_layout = dict(
    polar=dict(radialaxis=dict(range=[0.85, 1.0])),
    showlegend=True,
    title='Classification Models Comparison (Radar Chart)',
    template='plotly_dark',
    height=600,
)
fig.update_layout(**radar_layout)
fig.write_html('radar_chart.html')
fig.write_image('radar_chart.png')

# 4. Interactive regression bar plot
# Min-max scale the two error metrics and invert them, so the model with the
# lowest error gets 1 and the one with the highest error gets 0
for error_name in ('MAE', 'MSE'):
    error_values = reg_df[error_name]
    reg_df[f'{error_name}_scaled'] = 1 - (error_values - error_values.min()) / (
        error_values.max() - error_values.min()
    )

reg_metrics = ['R2 Score', 'MAE_scaled', 'MSE_scaled']
reg_plot_data = pd.melt(
    reg_df,
    id_vars='Model',
    value_vars=reg_metrics,
    var_name='Metric',
    value_name='Score',
)

fig = px.bar(
    reg_plot_data,
    x='Metric',
    y='Score',
    color='Model',
    barmode='group',
    title='Regression Model Performance (R², Scaled MAE/MSE)',
    color_discrete_sequence=px.colors.sequential.Inferno,
    height=600,
    text_auto='.2f',
)

# The bars show scaled errors, so the raw values are added as text at the
# x positions of the MAE (x=1) and MSE (x=2) groups
mae_lr, mae_rf, mae_gb = (reg_df["MAE"][row] for row in (0, 1, 2))
mse_lr, mse_rf, mse_gb = (reg_df["MSE"][row] for row in (0, 1, 2))
raw_value_notes = [
    dict(
        x=1, y=0.5, xref='x', yref='y',
        text=f'Raw MAE: LR={mae_lr:.2f}, RF={mae_rf:.2f}, GB={mae_gb:.2f}',
        showarrow=False,
        font=dict(size=10),
    ),
    dict(
        x=2, y=0.5, xref='x', yref='y',
        text=f'Raw MSE: LR={mse_lr:.2f}, RF={mse_rf:.2f}, GB={mse_gb:.2f}',
        showarrow=False,
        font=dict(size=10),
    ),
]
fig.update_layout(
    yaxis_title='Normalized Score',
    xaxis_title='Metrics',
    legend_title='Model',
    template='plotly_dark',
    annotations=raw_value_notes,
)
fig.write_html('regression_metrics.html')
fig.write_image('regression_metrics.png')

# 5. Static seaborn versions of the two bar charts (fallback for viewers that
# cannot render the interactive plots)
classification_fig = plt.figure(figsize=(14, 6))
sns.set_style('whitegrid')
classification_ax = classification_fig.add_subplot()
sns.barplot(
    data=plot_data,
    x='Metric',
    y='Score',
    hue='Model',
    palette='viridis',
    ax=classification_ax,
)
classification_ax.set_title('Classification Model Performance (Static)', fontsize=14)
classification_ax.set_ylim(0.85, 1.0)
plt.xticks(rotation=45)
classification_ax.legend(title='Model')
plt.tight_layout()
plt.savefig('classification_metrics_static.png')
plt.close()

regression_fig = plt.figure(figsize=(10, 6))
regression_ax = regression_fig.add_subplot()
sns.barplot(
    data=reg_plot_data,
    x='Metric',
    y='Score',
    hue='Model',
    palette='magma',
    ax=regression_ax,
)
regression_ax.set_title('Regression Model Performance (Static, Scaled Metrics)', fontsize=14)
regression_ax.set_ylim(0, 1.1)
plt.xticks(rotation=0)
regression_ax.legend(title='Model')
plt.tight_layout()
plt.savefig('regression_metrics_static.png')
plt.close()

# Tell the reader which files were written
print("Interactive visualizations saved as HTML/PNG: 'classification_metrics', 'confusion_matrices', 'radar_chart', 'regression_metrics'")
print("Static visualizations saved as PNG: 'classification_metrics_static.png', 'regression_metrics_static.png'")

Interactive visualizations saved as HTML/PNG: 'classification_metrics', 'confusion_matrices', 'radar_chart', 'regression_metrics'
Static visualizations saved as PNG: 'classification_metrics_static.png', 'regression_metrics_static.png'


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder
import uuid

# Load dataset
# Failures are raised as ordinary exceptions rather than handled with
# print() + exit(1): exit() is an interactive-shell helper and is not a
# reliable way to abort a notebook cell (it may shut the kernel down, or let
# the remaining statements run against a `df` left over from an earlier cell).
# Raising stops the cell immediately and keeps the full traceback.
DATA_PATH = 'employee_training_data_modified.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: '{DATA_PATH}' not found. Run the cell that writes it, or fix the path."
    ) from err
# Any other read/parse error propagates unchanged so its traceback is visible.
print(f"Loaded dataset with {len(df)} records.")

# Columns that hold category labels; these are the ones handed to the target encoder.
categorical_cols = [
    'Department',
    'Training_Program',
    'Learning_Style',
    'Training_Difficulty',
    'Feedback',
]

# Full model input, including Improvement (%).
# The order matters: it is the column order of X, and the feature-importance
# tables later pair importances with this list positionally.
# NOTE(review): 'Improvement (%)' and 'Score_Progress_Indicator' look like they
# may be derived from the post-training score - confirm they do not leak the target.
features = [
    'Department',
    'Training_Program',
    'Pre_Training_Score',
    'Engagement (hrs)',
    'Sentiment_Score',
    'Manager_Support_Rating',
    'Learning_Style',
    'Training_Difficulty',
    'Feedback',
    'Trainer_Quality',
    'Engagement_Support_Score',
    'Score_Progress_Indicator',
    'Training_Effectiveness',
    'Motivation_Index',
    'Improvement (%)',
]

# Every feature that is not categorical, in the same order as in `features`.
numerical_cols = [col for col in features if col not in categorical_cols]

# Targets: a continuous score for the regressors, a 0/1 label for the classifiers.
reg_target = 'Post_Training_Score'
class_target = 'High_Performance'

# Verify all required columns are present
# A missing column is raised as an exception instead of print() + exit(1):
# exit() is an interactive-shell helper and does not reliably stop a notebook
# cell, so the statements below could otherwise still run.
required_cols = features + [reg_target, class_target]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Drop rows with missing values in any feature or target column
df = df.dropna(subset=required_cols)
print(f"Dataset after dropping missing values: {len(df)} records.")

# Split data
# One call splits the features and both targets together, so the regression
# and classification targets stay row-aligned with the same 80/20 partition.
X, y_reg, y_class = df[features], df[reg_target], df[class_target]
split_parts = train_test_split(X, y_reg, y_class, test_size=0.2, random_state=42)
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_class_train, y_class_test,
) = split_parts
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Preprocessing pipeline
def make_preprocessor():
    """Return a new, unfitted preprocessing pipeline.

    The pipeline target-encodes the columns listed in ``categorical_cols`` and
    then standardises every column.

    A fresh instance is built on every call so that no two model pipelines
    share fitted state. ``Pipeline.fit`` fits its steps in place (no cloning
    unless caching is enabled), so a single shared preprocessor would be
    re-fitted by each model in turn and every pipeline would end up holding
    the encoder/scaler fitted for whichever model was trained last - e.g. the
    regressors would silently use an encoder fitted on the classification
    target.
    """
    return Pipeline([
        ('encoder', TargetEncoder(cols=categorical_cols)),
        ('scaler', StandardScaler())
    ])


def make_model(step_name, estimator):
    """Wrap ``estimator`` in a pipeline behind its own preprocessor.

    Parameters
    ----------
    step_name : str
        Name of the final step ('regressor' or 'classifier'); the training
        loops look the estimator up under this name via ``named_steps``.
    estimator
        Unfitted scikit-learn compatible estimator.

    Returns
    -------
    Pipeline
        ``[('preprocessor', <fresh preprocessor>), (step_name, estimator)]``.
    """
    return Pipeline([
        ('preprocessor', make_preprocessor()),
        (step_name, estimator)
    ])


# Standalone instance kept so existing references to `preprocessor` still
# resolve; it is not shared with any of the models below.
preprocessor = make_preprocessor()

# Define regression models
reg_models = {
    'Linear Regression': make_model('regressor', LinearRegression()),
    'Random Forest Regressor': make_model('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )),
    'Gradient Boosting Regressor': make_model('regressor', GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    ))
}

# Define classification models
class_models = {
    'Logistic Regression': make_model(
        'classifier', LogisticRegression(C=1.0, max_iter=1000, random_state=42)
    ),
    'Random Forest Classifier': make_model('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )),
    'XGBoost Classifier': make_model('classifier', XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        scale_pos_weight=1,
        eval_metric='logloss',
        random_state=42
    ))
}

# Train and evaluate regression models
# Each pipeline is fitted on the training split and scored on the held-out
# split. A failure in one model is reported and the loop moves on to the next.
reg_results = []
reg_feature_importances = []
importance_regressors = ('Random Forest Regressor', 'Gradient Boosting Regressor')
for name, model in reg_models.items():
    print(f"Training {name}...")
    try:
        model.fit(X_train, y_reg_train)
        y_pred = model.predict(X_test)
        reg_results.append({
            'Model': name,
            'R2 Score': r2_score(y_reg_test, y_pred),
            'MAE': mean_absolute_error(y_reg_test, y_pred),
            'MSE': mean_squared_error(y_reg_test, y_pred)
        })
        if name not in importance_regressors:
            continue
        # Importances are paired with `features` by position.
        importances = model.named_steps['regressor'].feature_importances_
        reg_feature_importances.extend(
            {'Model': name, 'Feature': feature, 'Importance': importance}
            for feature, importance in zip(features, importances)
        )
    except Exception as e:
        print(f"Error training {name}: {e}")

# Train and evaluate classification models
# Each pipeline is fitted on the training split and scored on the held-out
# split; per-class precision/recall/F1/support are flattened into one row per
# model. A failure in one model is reported and the loop moves on to the next.
class_results = []
class_feature_importances = []
importance_classifiers = ('Random Forest Classifier', 'XGBoost Classifier')
for name, model in class_models.items():
    print(f"Training {name}...")
    try:
        model.fit(X_train, y_class_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_class_test, y_pred)
        if accuracy < 0.95:
            print(f"Warning: {name} accuracy ({accuracy:.4f}) is below 95%.")
        report = classification_report(y_class_test, y_pred, output_dict=True, zero_division=0)
        # The report is keyed by the class label rendered as a string.
        result_row = {'Model': name, 'Accuracy': accuracy}
        for class_label in ('0', '1'):
            class_stats = report[class_label]
            result_row[f'Precision_{class_label}'] = class_stats['precision']
            result_row[f'Recall_{class_label}'] = class_stats['recall']
            result_row[f'F1_{class_label}'] = class_stats['f1-score']
            result_row[f'Support_{class_label}'] = class_stats['support']
        class_results.append(result_row)
        if name not in importance_classifiers:
            continue
        # Importances are paired with `features` by position.
        importances = model.named_steps['classifier'].feature_importances_
        class_feature_importances.extend(
            {'Model': name, 'Feature': feature, 'Importance': importance}
            for feature, importance in zip(features, importances)
        )
    except Exception as e:
        print(f"Error training {name}: {e}")

# Create results DataFrames
reg_results_df, class_results_df, reg_importances_df, class_importances_df = (
    pd.DataFrame(records)
    for records in (
        reg_results,
        class_results,
        reg_feature_importances,
        class_feature_importances,
    )
)

# Save results to CSV
# The four tables are stacked into a single file; a one-row, zero-column frame
# between consecutive tables becomes a blank separator row, and the CSV header
# is the union of all four tables' columns.
try:
    csv_sections = []
    for section in (reg_results_df, class_results_df, reg_importances_df, class_importances_df):
        if csv_sections:
            csv_sections.append(pd.DataFrame([{}]))
        csv_sections.append(section)
    output_df = pd.concat(csv_sections, ignore_index=True)
    output_df.to_csv('model_results_with_improvement.csv', index=False)
    print("Results and feature importances saved to 'model_results_with_improvement.csv'")
except Exception as e:
    print(f"Error saving results: {e}")

# Print summary
for heading, summary_frame in (
    ("\nRegression Model Performance:", reg_results_df),
    ("\nClassification Model Performance:", class_results_df),
):
    print(heading)
    print(summary_frame.to_string(index=True))


Loaded dataset with 20000 records.
Dataset after dropping missing values: 20000 records.
Training set size: 16000, Test set size: 4000
Training Linear Regression...
Training Random Forest Regressor...
Training Gradient Boosting Regressor...
Training Logistic Regression...
Training Random Forest Classifier...
Training XGBoost Classifier...
Results and feature importances saved to 'model_results_with_improvement.csv'

Regression Model Performance:
                         Model  R2 Score       MAE       MSE
0            Linear Regression  0.949084  1.707430  5.356332
1      Random Forest Regressor  0.989685  0.664289  1.085133
2  Gradient Boosting Regressor  0.997908  0.349442  0.220067

Classification Model Performance:
                      Model  Accuracy  Precision_0  Recall_0      F1_0  Support_0  Precision_1  Recall_1      F1_1  Support_1
0       Logistic Regression   0.97775     0.975995  0.968045  0.972004     1596.0     0.978899  0.984193  0.981539     2404.0
1  Random Forest Cl

In [None]:
!pip install -U kaleido



In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Define the data from the provided results
# Hard-coded snapshot of the model metrics, one tuple per model. The numbers
# are typed in by hand, so they will not follow a re-run of the training cell.
reg_columns = ['Model', 'R2 Score', 'MAE', 'MSE']
reg_rows = [
    ('Linear Regression', 0.949084, 1.707430, 5.356332),
    ('Random Forest Regressor', 0.989685, 0.664289, 1.085133),
    ('Gradient Boosting Regressor', 0.997908, 0.349442, 0.220067),
]

class_columns = [
    'Model', 'Accuracy',
    'Precision_0', 'Recall_0', 'F1_0', 'Support_0',
    'Precision_1', 'Recall_1', 'F1_1', 'Support_1',
]
class_rows = [
    ('Logistic Regression', 0.97775,
     0.975995, 0.968045, 0.972004, 1596.0,
     0.978899, 0.984193, 0.981539, 2404.0),
    ('Random Forest Classifier', 0.96625,
     0.994584, 0.920426, 0.956069, 1596.0,
     0.949663, 0.996672, 0.972600, 2404.0),
    ('XGBoost Classifier', 0.99275,
     0.995572, 0.986216, 0.990872, 1596.0,
     0.990905, 0.997088, 0.993987, 2404.0),
]

# Transpose the per-model rows into column -> list-of-values dictionaries.
reg_data = {
    column: list(values)
    for column, values in zip(reg_columns, zip(*reg_rows))
}
class_data = {
    column: list(values)
    for column, values in zip(class_columns, zip(*class_rows))
}

reg_df = pd.DataFrame(reg_data)
class_df = pd.DataFrame(class_data)

# Set Seaborn style for better aesthetics
sns.set(style="whitegrid", palette="muted")

# --- Regression Visualizations ---

# 1. Plotly Bar Plot for Regression Metrics (Saved as PNG)
# One subplot per metric, described as
# (column in reg_df, panel title / trace name, bar colour, y-axis title).
reg_panels = [
    ('R2 Score', "R² Score", 'rgb(55, 83, 109)', "Score"),
    ('MAE', "MAE", 'rgb(26, 118, 255)', "Error"),
    ('MSE', "MSE", 'rgb(50, 171, 96)', "Error"),
]

fig_reg = make_subplots(
    rows=1, cols=len(reg_panels),
    subplot_titles=tuple(panel[1] for panel in reg_panels),
    specs=[[{"type": "bar"} for panel in reg_panels]]
)

for panel_col, (metric_column, panel_title, bar_colour, axis_title) in enumerate(reg_panels, start=1):
    fig_reg.add_trace(
        go.Bar(x=reg_df['Model'], y=reg_df[metric_column], name=panel_title, marker_color=bar_colour),
        row=1, col=panel_col
    )

fig_reg.update_layout(
    title_text="Regression Model Performance Comparison",
    showlegend=False,
    height=500,
    width=1200,
    template="plotly_white"
)
for panel_col, (metric_column, panel_title, bar_colour, axis_title) in enumerate(reg_panels, start=1):
    fig_reg.update_yaxes(title_text=axis_title, row=1, col=panel_col)

# scale=3 triples the pixel dimensions of the exported image.
fig_reg.write_image("regression_bar_plot.png", format="png", scale=3)  # 300 DPI equivalent
print("Saved regression bar plot to 'regression_bar_plot.png'")

# 2. Seaborn Heatmap for Regression Metrics
# Models as rows, metrics as columns, every cell annotated with its value.
reg_heatmap_data = reg_df.set_index('Model')[['R2 Score', 'MAE', 'MSE']]
fig_reg_heatmap, ax_reg_heatmap = plt.subplots(figsize=(8, 6))
sns.heatmap(
    reg_heatmap_data,
    annot=True,
    cmap='coolwarm',
    fmt=".4f",
    linewidths=0.5,
    ax=ax_reg_heatmap
)
ax_reg_heatmap.set_title("Regression Metrics Heatmap")
fig_reg_heatmap.tight_layout()
fig_reg_heatmap.savefig("regression_heatmap.png", dpi=300)
plt.close(fig_reg_heatmap)
print("Saved regression heatmap to 'regression_heatmap.png'")

# --- Classification Visualizations ---

# 1. Plotly Grouped Bar Plot for Classification Metrics (Saved as PNG)
# One bar series per metric, grouped by model along the x-axis.
metrics = ['Accuracy', 'Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1']
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692']

fig_class = go.Figure(data=[
    go.Bar(
        x=class_df['Model'],
        y=class_df[metric_name],
        name=metric_name,
        marker_color=bar_color
    )
    for metric_name, bar_color in zip(metrics, colors)
])

# Horizontal legend centred just above the plotting area.
class_legend = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
fig_class.update_layout(
    title_text="Classification Model Performance Comparison",
    barmode='group',
    height=600,
    width=1200,
    template="plotly_white",
    yaxis_title="Score",
    legend_title="Metrics",
    legend=class_legend
)
fig_class.write_image("classification_bar_plot.png", format="png", scale=3)  # 300 DPI equivalent
print("Saved classification bar plot to 'classification_bar_plot.png'")

# 2. Plotly Radar Chart for Classification Metrics (Saved as PNG)
# Maps each metric column in class_df to the label shown on its radar axis.
radar_axes = {
    'Accuracy': 'Accuracy',
    'Precision_0': 'Precision (0)',
    'Recall_0': 'Recall (0)',
    'F1_0': 'F1 (0)',
    'Precision_1': 'Precision (1)',
    'Recall_1': 'Recall (1)',
    'F1_1': 'F1 (1)',
}
radar_scores = class_df.set_index('Model')[list(radar_axes)]

fig_radar = go.Figure()
# One filled polygon per model.
for model_name, model_scores in radar_scores.iterrows():
    fig_radar.add_trace(
        go.Scatterpolar(
            r=model_scores.tolist(),
            theta=list(radar_axes.values()),
            fill='toself',
            name=model_name
        )
    )

fig_radar.update_layout(
    title_text="Classification Model Performance (Radar Chart)",
    # The radial axis is zoomed to 0.9-1.0; scores below 0.9 would fall outside it.
    polar=dict(radialaxis=dict(visible=True, range=[0.9, 1.0])),
    showlegend=True,
    height=600,
    width=800,
    template="plotly_white"
)
fig_radar.write_image("classification_radar_chart.png", format="png", scale=3)  # 300 DPI equivalent
print("Saved classification radar chart to 'classification_radar_chart.png'")

# 3. Seaborn Heatmap for Classification Metrics
# Models as rows, metrics as columns (the support counts are left out),
# every cell annotated with its value.
class_heatmap_columns = [
    'Accuracy',
    'Precision_0', 'Recall_0', 'F1_0',
    'Precision_1', 'Recall_1', 'F1_1',
]
class_heatmap_data = class_df.set_index('Model')[class_heatmap_columns]
fig_class_heatmap, ax_class_heatmap = plt.subplots(figsize=(10, 6))
sns.heatmap(
    class_heatmap_data,
    annot=True,
    cmap='YlGnBu',
    fmt=".4f",
    linewidths=0.5,
    ax=ax_class_heatmap
)
ax_class_heatmap.set_title("Classification Metrics Heatmap")
fig_class_heatmap.tight_layout()
fig_class_heatmap.savefig("classification_heatmap.png", dpi=300)
plt.close(fig_class_heatmap)
print("Saved classification heatmap to 'classification_heatmap.png'")

Saved regression bar plot to 'regression_bar_plot.png'
Saved regression heatmap to 'regression_heatmap.png'
Saved classification bar plot to 'classification_bar_plot.png'
Saved classification radar chart to 'classification_radar_chart.png'
Saved classification heatmap to 'classification_heatmap.png'
