# Phase 3: Baseline Modeling - TechNova Partners Turnover Analysis

**Objective**: Establish baseline model performance using simple train/test split and fundamental algorithms.

**Models to evaluate**:
1. **Dummy Classifier** (baseline)
2. **Logistic Regression** (linear model)
3. **Random Forest** (non-linear tree model)

**Metrics**: Confusion matrix, precision, recall, F1-score, AUC-ROC, classification report, ROC and precision-recall curves

---

## 1. Environment Setup & Data Loading

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# System libraries
from pathlib import Path
import pickle

# Global plotting defaults and a fixed NumPy seed for the whole notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rc('figure', figsize=(12, 8))  # default size for every figure created below
np.random.seed(42)  # seeds NumPy's global random generator

print("Environment setup complete")

In [None]:
# Locate the project root, load the modeling data and run basic sanity checks
import sys
from pathlib import Path

# Walk upwards from the working directory and stop at the first directory that
# contains one of the marker files. If no marker is found, the search ends at
# the filesystem root and that directory is used.
root_markers = ('pyproject.toml', 'hr_analytics_utils.py')
current_dir = Path.cwd()
project_root = current_dir
for candidate in (current_dir, *current_dir.parents):
    project_root = candidate
    if any((candidate / marker).exists() for marker in root_markers):
        break

# Make the project utilities importable from the notebook
sys.path.insert(0, str(project_root))

from hr_analytics_utils import (
    load_modeling_data_from_db,
    print_database_status,
    setup_notebook_environment,
)

env_info = setup_notebook_environment()

# Report the database status before attempting to read from it
print_database_status()

# Features and target come from the database written by the previous notebook
X, y = load_modeling_data_from_db()

if X is None or y is None:
    raise ValueError("Could not load data from database. Please ensure notebook 2 has been executed.")

print(
    "\nData loaded successfully:",
    f"   Features shape: {X.shape}",
    f"   Target shape: {y.shape}",
    f"   Target distribution: {y.value_counts().to_dict()}",
    sep="\n",
)

# Column names grouped by dtype
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(
    "\nFeature types:",
    f"   Numeric features: {len(numeric_features)}",
    f"   Categorical features: {len(categorical_features)}",
    sep="\n",
)

# Quick data-quality overview: missing cells and duplicated feature rows
print(
    "\nData quality check:",
    f"   Missing values in X: {X.isnull().sum().sum()}",
    f"   Missing values in y: {y.isnull().sum()}",
    f"   Duplicate rows: {X.duplicated().sum()}",
    sep="\n",
)

## 2. Data Splitting & Preprocessing

In [None]:
# Hold out 20% of the rows as a test set
print(" Splitting data into train/test sets:")
print("=" * 40)

# stratify=y keeps the Stay/Leave proportions equal in both splits
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_features = X_train.shape
n_test = X_test.shape[0]
print(
    "Data split completed:",
    f"   Training set: {n_train} samples",
    f"   Test set: {n_test} samples",
    f"   Features: {n_features}",
    sep="\n",
)

# Per-split class counts followed by the positive-class (Leave) rate
named_targets = (("Training", y_train), ("Test", y_test))
print("\nClass distribution:")
for split_name, target in named_targets:
    print(f"   {split_name} - Stay: {(target == 0).sum()}, Leave: {(target == 1).sum()}")
for split_name, target in named_targets:
    print(f"   {split_name} turnover rate: {target.mean():.2%}")

In [None]:
# Preprocessing - one-hot encode categorical variables consistently across splits
print("Preprocessing data:")
print("=" * 20)

# Separate numerical and categorical features
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Handle categorical variables with one-hot encoding if any exist
if categorical_features:
    print("\n📝 Encoding categorical variables...")

    # Work on copies so the raw splits stay untouched
    X_train_cat = X_train.copy()
    X_test_cat = X_test.copy()
    for col in categorical_features:
        # The category levels are taken from the training split only and
        # applied to both splits. Encoding each split independently with
        # drop_first=True can drop a *different* reference level in the test
        # split when its first level is absent there, which silently changes
        # the meaning of the encoded test rows.
        levels = pd.Categorical(X_train[col]).categories
        X_train_cat[col] = pd.Categorical(X_train[col], categories=levels)
        # Test values not seen in training become NaN and therefore encode as
        # all zeros, i.e. they fall back to the reference level.
        X_test_cat[col] = pd.Categorical(X_test[col], categories=levels)

    X_train_encoded = pd.get_dummies(X_train_cat, columns=categorical_features, drop_first=True)
    X_test_encoded = pd.get_dummies(X_test_cat, columns=categorical_features, drop_first=True)

    # Both frames now share the same dummy columns; the alignment is kept as a
    # safeguard so the test columns always follow the training layout.
    X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)
else:
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

print("\nPreprocessing complete:")
print(f"   Final feature count: {X_train_encoded.shape[1]}")
print(f"   Training set shape: {X_train_encoded.shape}")
print(f"   Test set shape: {X_test_encoded.shape}")

# Store processed data under the names used by the modeling cells
X_train_final = X_train_encoded
X_test_final = X_test_encoded

## 3. Baseline Models

In [None]:
# 1. Dummy Classifier (Baseline)
# Fits one DummyClassifier per strategy, scores each on the test set and keeps
# the one with the highest F1-score as the baseline for the later cells.
print("Dummy Classifier (Baseline):")
print("=" * 35)

dummy_strategies = ['most_frequent', 'prior', 'stratified']
dummy_results = {}
dummy_models = {}
dummy_predictions = {}

for strategy in dummy_strategies:
    dummy_clf = DummyClassifier(strategy=strategy, random_state=42)
    dummy_clf.fit(X_train_final, y_train)
    dummy_pred = dummy_clf.predict(X_test_final)

    # A strategy that never predicts the positive class has no defined
    # precision; zero_division=0 reports 0 explicitly instead of emitting an
    # UndefinedMetricWarning for every such strategy.
    precision = precision_score(y_test, dummy_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, dummy_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, dummy_pred, average='binary', zero_division=0)

    dummy_results[strategy] = {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    # Keep the fitted model and its predictions so the winner is not refit
    dummy_models[strategy] = dummy_clf
    dummy_predictions[strategy] = dummy_pred

    print(f"\n{strategy.capitalize()} strategy:")
    print(f"   Precision: {precision:.4f}")
    print(f"   Recall: {recall:.4f}")
    print(f"   F1-score: {f1:.4f}")

# Use the best dummy classifier (by F1-score) as baseline
best_dummy_strategy = max(dummy_results, key=lambda name: dummy_results[name]['f1'])
dummy_baseline = dummy_models[best_dummy_strategy]
dummy_pred = dummy_predictions[best_dummy_strategy]

print(f"\nBest dummy strategy: {best_dummy_strategy}")
print("   This will be our baseline to beat.")

In [None]:
# 2. Logistic Regression (Linear Model)
print("Logistic Regression:")
print("=" * 25)

# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Fit the model on the scaled training data
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Hard labels plus the predicted probability of the positive class
log_reg_pred = log_reg.predict(X_test_scaled)
log_reg_pred_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

# Test-set metrics
log_reg_precision, log_reg_recall, log_reg_f1 = (
    metric(y_test, log_reg_pred)
    for metric in (precision_score, recall_score, f1_score)
)
log_reg_auc = roc_auc_score(y_test, log_reg_pred_proba)

print(
    "\nLogistic Regression Results:",
    f"   Precision: {log_reg_precision:.4f}",
    f"   Recall: {log_reg_recall:.4f}",
    f"   F1-score: {log_reg_f1:.4f}",
    f"   AUC-ROC: {log_reg_auc:.4f}",
    sep="\n",
)

# Rank features by the magnitude of their coefficient
coefficients = log_reg.coef_[0]
feature_importance = (
    pd.DataFrame({'feature': X_train_final.columns, 'coefficient': coefficients})
    .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print("\nTop 10 most important features (by coefficient magnitude):")
top_coefficients = feature_importance.head(10)
for name, value in zip(top_coefficients['feature'], top_coefficients['coefficient']):
    print(f"   {name:<30} {value:>8.4f}")

In [None]:
# 3. Random Forest (Non-linear Tree Model)
print("Random Forest:")
print("=" * 17)

# Fit a 100-tree forest on the unscaled features, using all available cores
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_final, y_train)

# Hard labels plus the predicted probability of the positive class
rf_pred = rf.predict(X_test_final)
rf_pred_proba = rf.predict_proba(X_test_final)[:, 1]

# Test-set metrics
rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_pred)
    for metric in (precision_score, recall_score, f1_score)
)
rf_auc = roc_auc_score(y_test, rf_pred_proba)

print(
    "\nRandom Forest Results:",
    f"   Precision: {rf_precision:.4f}",
    f"   Recall: {rf_recall:.4f}",
    f"   F1-score: {rf_f1:.4f}",
    f"   AUC-ROC: {rf_auc:.4f}",
    sep="\n",
)

# Rank features by the forest's feature_importances_
rf_feature_importance = pd.DataFrame(
    {'feature': X_train_final.columns, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

print("\nTop 10 most important features (by Random Forest importance):")
top_importances = rf_feature_importance.head(10)
for name, value in zip(top_importances['feature'], top_importances['importance']):
    print(f"   {name:<30} {value:>8.4f}")

## 4. Model Comparison & Evaluation

In [None]:
# Compare all models side by side (table + one bar chart per metric)
print(" Model Comparison:")
print("=" * 20)

# AUC of the selected dummy baseline, computed from its own probability
# output rather than assumed. Constant-score strategies give exactly 0.5, but
# the 'stratified' strategy produces random labels whose AUC only fluctuates
# around 0.5.
dummy_auc = roc_auc_score(y_test, dummy_baseline.predict_proba(X_test_final)[:, 1])

# One record per model; key order fixes the column order of the table
baseline_scores = dummy_results[best_dummy_strategy]
results_df = pd.DataFrame([
    {
        'Model': 'Dummy (Baseline)',
        'Precision': baseline_scores['precision'],
        'Recall': baseline_scores['recall'],
        'F1-Score': baseline_scores['f1'],
        'AUC-ROC': dummy_auc,
    },
    {
        'Model': 'Logistic Regression',
        'Precision': log_reg_precision,
        'Recall': log_reg_recall,
        'F1-Score': log_reg_f1,
        'AUC-ROC': log_reg_auc,
    },
    {
        'Model': 'Random Forest',
        'Precision': rf_precision,
        'Recall': rf_recall,
        'F1-Score': rf_f1,
        'AUC-ROC': rf_auc,
    },
])

print(results_df.to_string(index=False, float_format='%.4f'))

# Visualize comparison: a 2x2 grid with one panel per metric
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

metrics = ['Precision', 'Recall', 'F1-Score', 'AUC-ROC']
for ax, metric in zip(axes.flat, metrics):
    bars = ax.bar(results_df['Model'], results_df[metric])
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.set_ylim(0, 1)  # all four metrics live in [0, 1]

    # Add value labels just above each bar
    for bar, value in zip(bars, results_df[metric]):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                f'{value:.3f}', ha='center', va='bottom')

    # Rotate x-axis labels for better readability
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices for all models: one heatmap per model, then the raw counts
print("Confusion Matrices:")
print("=" * 22)

dummy_cm = confusion_matrix(y_test, dummy_pred)
log_reg_cm = confusion_matrix(y_test, log_reg_pred)
rf_cm = confusion_matrix(y_test, rf_pred)

class_labels = ['Stay', 'Leave']
heatmap_panels = [
    ('Dummy Classifier', dummy_cm),
    ('Logistic Regression', log_reg_cm),
    ('Random Forest', rf_cm),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (panel_title, matrix) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

plt.tight_layout()
plt.show()

# Print detailed confusion matrix statistics
models = ['Dummy', 'Logistic Regression', 'Random Forest']
cms = [dummy_cm, log_reg_cm, rf_cm]

for model, cm in zip(models, cms):
    # ravel() flattens the 2x2 matrix row by row: [[tn, fp], [fn, tp]]
    tn, fp, fn, tp = cm.ravel()
    total = tp + tn + fp + fn
    print(
        f"\n{model}:",
        f"  True Negatives (Stay correctly predicted): {tn}",
        f"  False Positives (Stay predicted as Leave): {fp}",
        f"  False Negatives (Leave predicted as Stay): {fn}",
        f"  True Positives (Leave correctly predicted): {tp}",
        f"  Accuracy: {(tp + tn) / total:.4f}",
        sep="\n",
    )

In [None]:
# Per-class precision / recall / F1 report for each model
print("Classification Reports:")
print("=" * 27)

predictions_by_model = {
    'Dummy Classifier': dummy_pred,
    'Logistic Regression': log_reg_pred,
    'Random Forest': rf_pred,
}
models = list(predictions_by_model)
predictions = list(predictions_by_model.values())

class_names = ['Stay', 'Leave']
for model, pred in predictions_by_model.items():
    print(f"\n{model}:", "=" * 50, sep="\n")
    print(classification_report(y_test, pred, target_names=class_names))

In [None]:
from sklearn.metrics import precision_recall_curve

# ROC curves for the two probabilistic models
print("ROC Curves:")
print("=" * 13)

fpr_lr, tpr_lr, _ = roc_curve(y_test, log_reg_pred_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_pred_proba)

roc_series = (
    (fpr_lr, tpr_lr, f'Logistic Regression (AUC = {log_reg_auc:.3f})'),
    (fpr_rf, tpr_rf, f'Random Forest (AUC = {rf_auc:.3f})'),
)

plt.figure(figsize=(12, 8))
for fpr, tpr, curve_label in roc_series:
    plt.plot(fpr, tpr, label=curve_label, linewidth=2)

# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.500)', linewidth=1)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

# Precision-Recall curves for the same two models
precision_lr, recall_lr, _ = precision_recall_curve(y_test, log_reg_pred_proba)
precision_rf, recall_rf, _ = precision_recall_curve(y_test, rf_pred_proba)

pr_series = (
    (recall_lr, precision_lr, 'Logistic Regression'),
    (recall_rf, precision_rf, 'Random Forest'),
)

plt.figure(figsize=(12, 8))
for recall_values, precision_values, curve_label in pr_series:
    plt.plot(recall_values, precision_values, label=curve_label, linewidth=2)

# Horizontal reference line at the share of positive labels in the test set
baseline_precision = y_test.mean()
plt.axhline(
    y=baseline_precision,
    color='k',
    linestyle='--',
    label=f'Baseline ({baseline_precision:.3f})',
    linewidth=1,
)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves - Model Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 5. Model Insights & Feature Importance

In [None]:
# Side-by-side view of feature importance from both fitted models
print("Feature Importance Analysis:")
print("=" * 32)

# One row per feature: absolute logistic coefficient and forest importance
importance_comparison = pd.DataFrame({
    'feature': X_train_final.columns,
    'logistic_coef': np.abs(log_reg.coef_[0]),
    'rf_importance': rf.feature_importances_,
})

# Rescale each score by its own maximum so the two are comparable
for score_column in ('logistic_coef', 'rf_importance'):
    scores = importance_comparison[score_column]
    importance_comparison[f'{score_column}_norm'] = scores / scores.max()

# Order the table by random forest importance, highest first
importance_comparison = importance_comparison.sort_values('rf_importance', ascending=False)

top_features = importance_comparison.head(15)

print("Top 15 features by Random Forest importance:")
print(
    top_features[['feature', 'rf_importance', 'logistic_coef']]
    .to_string(index=False, float_format='%.4f')
)

# Both panels show the same 15 features (the forest's top 15); the right-hand
# panel plots their absolute logistic coefficients.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
bar_positions = range(len(top_features))
panel_specs = (
    ('rf_importance', 'Random Forest Importance', 'Top 15 Features - Random Forest'),
    ('logistic_coef', 'Absolute Coefficient Value', 'Top 15 Features - Logistic Regression'),
)
for ax, (score_column, x_label, panel_title) in zip(axes, panel_specs):
    ax.barh(bar_positions, top_features[score_column])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Overall model evaluation summary: dataset facts, ranking by F1-score,
# best model, improvement over the dummy baseline and data-driven insights.
print("MODEL EVALUATION SUMMARY:")
print("=" * 40)

print("\n1. DATASET CHARACTERISTICS:")
print(f"   - Total samples: {len(y)}")
print(f"   - Training samples: {len(y_train)}")
print(f"   - Test samples: {len(y_test)}")
print(f"   - Features: {X_train_final.shape[1]}")
print(f"   - Class imbalance: {(y == 0).sum()}:{(y == 1).sum()} (Stay:Leave)")

print("\n2. MODEL PERFORMANCE RANKING (by F1-Score):")
performance_ranking = results_df.sort_values('F1-Score', ascending=False)
for rank, (_, row) in enumerate(performance_ranking.iterrows(), start=1):
    print(f"   {rank}. {row['Model']}: F1={row['F1-Score']:.4f}, Precision={row['Precision']:.4f}, Recall={row['Recall']:.4f}")

print("\n3. BEST MODEL ANALYSIS:")
best_model = performance_ranking.iloc[0]
print(f"   - Best performing model: {best_model['Model']}")
print(f"   - F1-Score: {best_model['F1-Score']:.4f}")
print(f"   - Precision: {best_model['Precision']:.4f}")
print(f"   - Recall: {best_model['Recall']:.4f}")
print(f"   - AUC-ROC: {best_model['AUC-ROC']:.4f}")

print("\n4. IMPROVEMENT OVER BASELINE:")
baseline_f1 = dummy_results[best_dummy_strategy]['f1']
best_f1 = best_model['F1-Score']
# A relative improvement is undefined when the baseline F1 is 0 (the dummy
# never hit a true positive); report that instead of dividing by zero.
if baseline_f1 > 0:
    improvement = (best_f1 - baseline_f1) / baseline_f1 * 100
    improvement_text = f"{improvement:.1f}%"
else:
    improvement = float('nan')
    improvement_text = "n/a (baseline F1-Score is 0)"
print(f"   - Baseline F1-Score: {baseline_f1:.4f}")
print(f"   - Best F1-Score: {best_f1:.4f}")
print(f"   - Improvement: {improvement_text}")

print("\n5. KEY INSIGHTS:")

# State only what the scores support rather than asserting a win unconditionally
ml_f1_scores = {'Logistic Regression': log_reg_f1, 'Random Forest': rf_f1}
models_beating_baseline = [name for name, score in ml_f1_scores.items() if score > baseline_f1]
if len(models_beating_baseline) == len(ml_f1_scores):
    print("   - Both ML models outperform the dummy baseline on F1-Score")
elif models_beating_baseline:
    print(f"   - Only {models_beating_baseline[0]} outperforms the dummy baseline on F1-Score")
else:
    print("   - Neither ML model outperforms the dummy baseline on F1-Score")

# Check the "similar" band first: otherwise any positive gap, however small,
# would be reported as "better".
f1_gap = rf_f1 - log_reg_f1
if abs(f1_gap) < 0.01:
    rf_vs_log_reg = 'similar'
elif f1_gap > 0:
    rf_vs_log_reg = 'better'
else:
    rf_vs_log_reg = 'worse'
print(f"   - Random Forest shows {rf_vs_log_reg} performance than Logistic Regression")

best_auc = max(rf_auc, log_reg_auc)
if best_auc > 0.8:
    auc_quality = 'good'
elif best_auc > 0.7:
    auc_quality = 'moderate'
else:
    auc_quality = 'fair'
print(f"   - AUC-ROC values indicate {auc_quality} discriminative ability")
print("   - Class imbalance may be affecting performance - consider addressing in next phase")

In [None]:
# Persist the baseline tables (metrics, feature importance, predictions)
from hr_analytics_utils import save_model_results_to_db, save_feature_importance_to_db

print("SAVING RESULTS TO DATABASE")
print("=" * 35)

# All three tables are written to the same database file
results_db_path = "../results/technova_hr.db"

# Model comparison metrics
save_model_results_to_db(results_df, 'baseline_model_results', results_db_path)

# Combined feature importance table
save_feature_importance_to_db(importance_comparison, 'baseline_feature_importance', results_db_path)

# Per-sample test-set predictions: true label, each model's predicted label and
# the positive-class probabilities of the two probabilistic models
prediction_columns = {
    'actual': y_test,
    'dummy_pred': dummy_pred,
    'logistic_pred': log_reg_pred,
    'rf_pred': rf_pred,
    'logistic_proba': log_reg_pred_proba,
    'rf_proba': rf_pred_proba,
}
predictions_df = pd.DataFrame(prediction_columns)

save_model_results_to_db(predictions_df, 'baseline_predictions', results_db_path)

print("\nAll baseline modeling results saved to database!")