# Data Preparation

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Standard-library modules (used below to put the local `src` directory on the import path)
import os
import sys

# Make the project's `src` directory importable so that `utils` can be found.
# The path is only appended when it is not already on sys.path.
src_path = os.path.abspath(os.path.join('..', '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

# Import feature engineering utilities from our custom module
from utils import ( # type: ignore
    identify_feature_types,
    create_correlation_matrix,
    apply_binary_encoding,
    apply_onehot_encoding,
    apply_ordinal_encoding,
)

# Set visualization style: matplotlib's 'default' style, then seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Display settings: no limit on displayed columns, at most 100 displayed rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# ============================================================================
# CONFIGURATION: TARGET VARIABLE
# ============================================================================
# Name of the column to predict. It is excluded from the feature analysis and
# split off as `y` later in this notebook.
TARGET_VARIABLE = 'a_quitte_l_entreprise'  # Employee turnover prediction (0=stayed, 1=left)

# Confirmation messages for the notebook output
print("✅ Libraries imported successfully!")
print(f"🎯 Target variable set to: {TARGET_VARIABLE}")

## 2. Load the Cleaned Dataset

Load the merged and cleaned dataset from the EDA phase.

In [None]:
# Load the cleaned, merged dataset written by the EDA notebook
data_path = '../../data/processed/employee_data_merged_clean.csv'

try:
    # Only the read is guarded: it is the one statement expected to raise
    # FileNotFoundError.
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Best-effort: report the problem instead of raising. `df` is not
    # (re)assigned here, so the cells below need the file to exist.
    print(f"❌ File not found: {data_path}")
    print("Please ensure the EDA notebook has been run and the cleaned dataset exists.")
else:
    print("✅ Dataset loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")
    print(f"🔍 Columns: {len(df.columns)} features")

    # Display basic info
    print("\n" + "="*60)
    print("DATASET OVERVIEW")
    print("="*60)
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    df.info()

## 3. Feature Type Analysis

Analyze the dataset to identify different types of features automatically.

In [None]:
# Classify the dataset's columns by feature type. The project helper
# `identify_feature_types` is preferred; a simple dtype/cardinality heuristic
# is used when that name is not defined.
print("FEATURE TYPE ANALYSIS:")
print("="*50)

try:
    # Only the helper call is guarded, so a NameError raised while printing
    # the report further down is not mistaken for a failed import.
    feature_types = identify_feature_types(df, exclude_cols=[TARGET_VARIABLE])
except NameError:
    # NOTE: an undefined `df` or TARGET_VARIABLE also lands here; the fallback
    # below needs both names and will fail in that case too.
    print("⚠️ Utils functions not available. Please run the import cell first.")
    print("If the import fails, you may need to:")
    print("1. Check that src/utils.py exists")
    print("2. Restart the kernel and run cells in order")
    print("3. Use absolute import path")

    # Create a basic feature_types dictionary as fallback.
    # 'numerical_discrete' and 'id_columns' are never filled by the heuristic
    # below; they are present only so the dictionary has every key.
    feature_types = {
        'numerical_continuous': [],
        'numerical_discrete': [],
        'categorical_ordinal': [],
        'categorical_nominal': [],
        'binary': [],
        'id_columns': []
    }

    # Manual feature type identification as fallback
    for col in df.columns:
        if col == TARGET_VARIABLE:
            continue
        if not pd.api.types.is_numeric_dtype(df[col]):
            feature_types['categorical_nominal'].append(col)
            continue
        # Numeric columns are split by their number of distinct values:
        # exactly 2 -> binary, up to 10 -> ordinal, anything above -> continuous.
        n_unique = df[col].nunique()
        if n_unique == 2:
            feature_types['binary'].append(col)
        elif n_unique <= 10:
            feature_types['categorical_ordinal'].append(col)
        else:
            feature_types['numerical_continuous'].append(col)

    print("📋 Fallback feature type analysis:")
    for feature_type, features in feature_types.items():
        if features:
            print(f"  • {feature_type.replace('_', ' ').title()}: {len(features)} features")
else:
    print("✅ Feature type analysis complete!")
    print("Feature type breakdown:")
    for feature_type, features in feature_types.items():
        print(f"  • {feature_type.replace('_', ' ').title()}: {len(features)} features")
        # Short groups are listed in full; longer ones show their first five names
        if features and len(features) <= 8:
            print(f"    - {', '.join(features)}")
        elif features:
            print(f"    - {', '.join(features[:5])} ... and {len(features)-5} more")

## 4. Feature Encoding and Preprocessing

Apply appropriate encoding methods for categorical features and prepare data for modeling.

In [None]:
# Columns that are neither numeric nor boolean; these are the ones left to encode
categorical_features = list(
    df.select_dtypes(exclude=['number', 'boolean']).columns
)
print(categorical_features)

In [None]:
# One-hot encoding for 'departement', 'statut_marital' and 'poste' using our utils function
one_hot_encoding_features = ['departement', 'statut_marital', 'poste']

# The helper is called once per column. `encoding_info` is overwritten on every
# pass, so after the loop it holds only what the last call returned.
for feature in one_hot_encoding_features:
    df, encoding_info = apply_onehot_encoding(df, [feature])

df.head()
    

In [None]:
# Binary encoding for 'genre' column (mapping passed to the helper: M -> 0, F -> 1)
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df["genre"].value_counts())
df, encoding_info = apply_binary_encoding(df, ['genre'], {'genre': {'M': 0, 'F': 1}})
df.head()

In [None]:
# Binary encoding for 'heure_supplementaires' column (mapping passed to the helper: Non -> 0, Oui -> 1)
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df["heure_supplementaires"].value_counts())
df, encoding_info = apply_binary_encoding(df, ["heure_supplementaires"], {"heure_supplementaires": {"Non": 0, "Oui": 1}})
df.head()

In [None]:
# Binary encoding for the target column TARGET_VARIABLE (mapping passed to the helper: Non -> 0, Oui -> 1)
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df[TARGET_VARIABLE].value_counts())
df, encoding_info = apply_binary_encoding(df, [TARGET_VARIABLE], {TARGET_VARIABLE: {"Non": 0, "Oui": 1}})
df.head()

In [None]:
# Convert 'augementation_salaire_precedente' from percentage string to float
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df["augementation_salaire_precedente"].value_counts())

# Convert only while the column is still text: the `.str` accessor raises
# AttributeError on a numeric column, which is what a second run of this cell
# would otherwise hit.
if not pd.api.types.is_numeric_dtype(df["augementation_salaire_precedente"]):
    # Trailing spaces and percent signs are stripped before the float cast
    df["augementation_salaire_precedente"] = (
        df["augementation_salaire_precedente"].str.rstrip(" %").astype(float)
    )
df.head()

In [None]:
# One-hot encoding for 'domaine_etude' column
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df["domaine_etude"].value_counts())
df, encoding_info = apply_onehot_encoding(df, ["domaine_etude"])
df.head()

In [None]:
# Ordinal encoding for 'frequence_deplacement' column
# (category order passed to the helper: Aucun, Occasionnel, Frequent)
# The distribution is printed explicitly: a bare expression that is not the
# last line of a notebook cell is evaluated but never displayed.
print(df["frequence_deplacement"].value_counts())
df, encoding_info = apply_ordinal_encoding(
    df,
    ["frequence_deplacement"],
    {"frequence_deplacement": ["Aucun", "Occasionnel", "Frequent"]},
)
df.head()

In [None]:
# Check that all categorical features have been encoded: any column that is
# neither numeric nor boolean at this point still needs an encoding step.
remaining_categorical_features = df.select_dtypes(exclude=['number', 'boolean']).columns.tolist()
if not remaining_categorical_features:
    print("✅ All categorical features have been encoded.")
else:
    # Report the leftovers instead of staying silent, so a missed column is
    # noticed here rather than later in the pipeline.
    print(
        f"⚠️ {len(remaining_categorical_features)} feature(s) still need encoding: "
        f"{remaining_categorical_features}"
    )

## 5. Prepare Features and Target Variable



In [None]:
# Split the encoded frame into the target vector and a first feature matrix
print("PREPARING FINAL FEATURES AND TARGET:")
print("=" * 50)

# y is an independent copy of the target column; X_temp is every other column
# except the `id_employee` identifier.
y = df[TARGET_VARIABLE].copy()
X_temp = df.drop([TARGET_VARIABLE, "id_employee"], axis=1)

print(f"Features shape before correlation filtering: {X_temp.shape}")
print(f"Target shape: {y.shape}")
print(f"Target variable: {TARGET_VARIABLE}")

## 5.1. Feature Engineering

Apply feature engineering transformations to create new features and improve model performance.

In [None]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add tenure-ratio and evaluation-delta features to a feature table.

    The input DataFrame is left untouched: all work happens on a copy, so the
    function can be called repeatedly on the same input with the same result.

    Parameters:
    df (pd.DataFrame): Input DataFrame. Must contain the columns
        `annees_dans_le_poste_actuel`, `annees_dans_l_entreprise`,
        `nombre_experiences_precedentes`, `note_evaluation_actuelle` and
        `note_evaluation_precedente`.

    Returns:
    pd.DataFrame: A new DataFrame with `mobilite_interne_ratio`,
        `ratio_anciennete` and `delta_evaluation` added and
        `note_evaluation_precedente` removed.

    Raises:
    KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's DataFrame is never modified in place
    engineered = df.copy()

    # Small constant added to every denominator so that a zero denominator
    # yields a finite ratio instead of a division by zero.
    epsilon = 1e-6

    # Internal mobility ratio: years in current position / years in the company
    engineered["mobilite_interne_ratio"] = engineered["annees_dans_le_poste_actuel"] / (
        engineered["annees_dans_l_entreprise"] + epsilon
    )

    # Seniority ratio: years in the company /
    # (years in the company + number of previous experiences)
    engineered["ratio_anciennete"] = engineered["annees_dans_l_entreprise"] / (
        engineered["annees_dans_l_entreprise"]
        + engineered["nombre_experiences_precedentes"]
        + epsilon
    )

    # Change in evaluation score: current minus previous. The previous score is
    # dropped afterwards; `annees_dans_le_poste_actuel` is deliberately kept.
    engineered["delta_evaluation"] = (
        engineered["note_evaluation_actuelle"] - engineered["note_evaluation_precedente"]
    )
    engineered = engineered.drop(columns=["note_evaluation_precedente"])

    # TODO: salary gap = employee income / mean income for the same position
    # (the original note was cut off mid-sentence; this feature is not implemented).

    return engineered

# Run the feature-engineering step on the candidate feature matrix and report
# its shape before and after.
print("APPLYING FEATURE ENGINEERING:")
print("=" * 50)

print(f"Features shape before feature engineering: {X_temp.shape}")
X_temp = feature_engineering(X_temp)
print(f"Features shape after feature engineering: {X_temp.shape}")

print("\n✅ Feature engineering completed!")

# Cell output: number of missing values in the new `ratio_anciennete` column
X_temp["ratio_anciennete"].isna().sum()

## 6. Feature Correlation Analysis (Post Feature Engineering)

Analyze correlations between the final engineered features (X only) to identify multicollinearity issues.

In [None]:
# Select the numeric columns of X_temp; they are the input of the correlation study
print("FEATURE CORRELATION ANALYSIS ON FINAL X FEATURES:")
print("=" * 60)

# Keep every column whose dtype pandas reports as numeric
numerical_cols_in_X = [
    name for name in X_temp.columns if pd.api.types.is_numeric_dtype(X_temp[name])
]
print(f"Analyzing correlations for {len(numerical_cols_in_X)} numerical features:")

# List every name when there are at most 15 of them, otherwise only the first
# 10 followed by a count of the rest.
for col in numerical_cols_in_X[:15 if len(numerical_cols_in_X) <= 15 else 10]:
    print(f"  - {col}")
if len(numerical_cols_in_X) > 15:
    print(f"  ... and {len(numerical_cols_in_X) - 10} more features")

print(f"\nNote: Excluding target variable '{TARGET_VARIABLE}' from correlation analysis")
print("Features include encoded categorical variables (one-hot encoded columns)")

# Independent copy restricted to the numeric columns
X_numerical_for_corr = X_temp.loc[:, numerical_cols_in_X].copy()
print(f"\nCorrelation analysis dataset shape: {X_numerical_for_corr.shape}")

In [None]:
# Features to potentially drop based on correlation analysis
features_to_potentially_drop = [
    # "niveau_hierarchique_poste", # Not really relevant in ESN context
    # "annees_dans_le_poste_actuel",
    # "annes_sous_responsable_actuel",
    "departement_Consulting",
    "departement_Ressources Humaines",
]

# Drop from X_numerical_for_corr. Only columns that are actually present are
# dropped, so re-running this cell (after they are already gone) does not raise
# KeyError; names that are not found are reported rather than silently ignored.
columns_missing = [
    name for name in features_to_potentially_drop
    if name not in X_numerical_for_corr.columns
]
if columns_missing:
    print(f"⚠️ Not found in X_numerical_for_corr (already dropped or misspelled): {columns_missing}")

X_numerical_for_corr = X_numerical_for_corr.drop(
    columns=[name for name in features_to_potentially_drop if name not in columns_missing]
)

In [None]:
# Spearman correlations between the final numerical features: compute, list
# the pairs at or above the 0.7 threshold, then draw the matrix as a heatmap.
print("SPEARMAN CORRELATION ANALYSIS (FINAL FEATURES):")
print("=" * 50)

spearman_corr, spearman_high_corr = create_correlation_matrix(
    X_numerical_for_corr,
    method='spearman',
    threshold=0.7,
)

print("Highly correlated feature pairs (|correlation| >= 0.7):")
if not spearman_high_corr:
    print("  - No highly correlated pairs found")
else:
    for pair in spearman_high_corr:
        print(f"  - {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")

# Heatmap of the matrix. The mask is True on the upper triangle including the
# diagonal, so only the cells below the diagonal are drawn.
plt.figure(figsize=(16, 12))
mask = np.triu(np.ones_like(spearman_corr, dtype=bool), k=0)
sns.heatmap(
    spearman_corr,
    mask=mask,
    annot=True,
    annot_kws={'size': 8},
    fmt='.1f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Spearman Correlation Matrix (Final Encoded Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"\n📊 Spearman correlation matrix shape: {spearman_corr.shape}")


In [None]:
# Pearson correlation analysis on final features: compute, list the pairs at
# or above the 0.7 threshold, then draw the matrix as a heatmap.
print("PEARSON CORRELATION ANALYSIS (FINAL FEATURES):")
print("="*50)

pearson_corr, pearson_high_corr = create_correlation_matrix(
    X_numerical_for_corr, method='pearson', threshold=0.7
)

print("Highly correlated feature pairs (|correlation| >= 0.7):")
if pearson_high_corr:
    for pair in pearson_high_corr:
        print(f"  - {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")
else:
    print("  - No highly correlated pairs found")

# Visualize Pearson correlation matrix. The mask is True on the upper triangle
# including the diagonal, so only the cells below the diagonal are drawn.
plt.figure(figsize=(14, 12))
mask = np.triu(np.ones_like(pearson_corr, dtype=bool))
sns.heatmap(pearson_corr, mask=mask, annot=True, cmap='coolwarm',
            center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
            fmt='.2f', annot_kws={'size': 8})
# The title and the message below name the method, as the Spearman cell does,
# so the two figures and shape lines can be told apart.
plt.title('Pearson Correlation Matrix (Final Encoded Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"\n📊 Pearson correlation matrix shape: {pearson_corr.shape}")

In [None]:
# All high correlation pairs from both analyses. This is a plain concatenation:
# a pair flagged by both methods appears twice, once per method.
all_high_corr_pairs = spearman_high_corr + pearson_high_corr
if all_high_corr_pairs:
    print(
        "\nTotal highly correlated feature pairs from both analyses "
        f"(duplicates included): {len(all_high_corr_pairs)}"
    )
    for pair in all_high_corr_pairs:
        print(f"  - {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")

In [None]:
# Combined correlation analysis summary on engineered features: merges the
# Spearman and Pearson high-correlation pairs, de-duplicates them, and reports
# the strongest off-diagonal correlation of each matrix.


def _max_offdiag_abs_corr(corr):
    """Return the largest absolute off-diagonal value of `corr`, or None if there is none.

    The diagonal is masked by position, not by value, so a perfect correlation
    between two different features is still reported.
    """
    values = corr.abs().to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, np.nan)
    if np.isnan(values).all():
        return None
    return float(np.nanmax(values))


print("FINAL CORRELATION ANALYSIS (POST FEATURE ENGINEERING):")
print("="*60)
print("Analyzing correlations on the engineered feature set...")

numerical_cols_in_X_temp = [
    col for col in X_temp.columns if pd.api.types.is_numeric_dtype(X_temp[col])
]
print(f"Features after engineering: {len(numerical_cols_in_X_temp)} numerical features")

if len(numerical_cols_in_X_temp) > 1:
    # NOTE(review): this counts the numerical columns of X_temp, whereas the
    # Spearman/Pearson cells ran on X_numerical_for_corr, from which some
    # columns were dropped; confirm which count should be reported below.
    total_features = len(numerical_cols_in_X_temp)

    # Combine high correlation pairs from both methods. Each pair is copied
    # before being tagged, so the dictionaries held in spearman_high_corr and
    # pearson_high_corr are left unmodified.
    all_high_corr_pairs = []

    # Add Spearman pairs if available
    if 'spearman_high_corr' in locals() and spearman_high_corr:
        all_high_corr_pairs.extend(
            {**pair, 'method': 'Spearman'} for pair in spearman_high_corr
        )

    # Add Pearson pairs if available
    if 'pearson_high_corr' in locals() and pearson_high_corr:
        all_high_corr_pairs.extend(
            {**pair, 'method': 'Pearson'} for pair in pearson_high_corr
        )

    # Remove duplicates (same feature pair found by both methods), keeping the
    # entry with the higher absolute correlation. The key is order-independent.
    unique_pairs = {}
    for pair in all_high_corr_pairs:
        key = tuple(sorted([pair['feature1'], pair['feature2']]))
        best = unique_pairs.get(key)
        if best is None or abs(pair['correlation']) > abs(best['correlation']):
            unique_pairs[key] = pair

    # Strongest pairs first, so that the "top 10" below really is the top 10
    unique_high_corr_pairs = sorted(
        unique_pairs.values(), key=lambda p: abs(p['correlation']), reverse=True
    )

    print(f"📊 Total features analyzed: {total_features}")
    print("🔗 High correlation pairs found (≥0.7):")
    print(f"   - Spearman method: {sum(p['method'] == 'Spearman' for p in all_high_corr_pairs)}")
    print(f"   - Pearson method: {sum(p['method'] == 'Pearson' for p in all_high_corr_pairs)}")
    print(f"   - Unique pairs (combined): {len(unique_high_corr_pairs)}")

    if unique_high_corr_pairs:
        print("\n⚠️  MULTICOLLINEARITY DETECTED:")
        print(f"   Found {len(unique_high_corr_pairs)} unique pairs of highly correlated features")
        print("   These may cause issues in linear models")
        print("   Consider manually removing one feature from each pair")

        print("\n📋 Highly correlated feature pairs:")
        for pair in unique_high_corr_pairs[:10]:  # Show top 10
            print(f"   - {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f} ({pair['method']})")

        if len(unique_high_corr_pairs) > 10:
            print(f"   ... and {len(unique_high_corr_pairs) - 10} more pairs")

        # Count how many unique pairs each feature takes part in
        feature_counts = {}
        for pair in unique_high_corr_pairs:
            feature_counts[pair['feature1']] = feature_counts.get(pair['feature1'], 0) + 1
            feature_counts[pair['feature2']] = feature_counts.get(pair['feature2'], 0) + 1

        if feature_counts:
            print("\n🎯 Features involved in multiple correlations:")
            sorted_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
            for feature, count in sorted_features[:5]:  # Top 5
                print(f"   - {feature}: {count} correlation(s)")
    else:
        print("\n✅ NO MULTICOLLINEARITY ISSUES:")
        print("   All feature correlations are below 0.7 threshold")
        print("   Features are suitable for linear models")

    # Calculate max correlation from both methods. A matrix with no usable
    # off-diagonal value is skipped instead of raising on an empty selection.
    max_corrs = []
    if 'spearman_corr' in locals():
        spearman_max = _max_offdiag_abs_corr(spearman_corr)
        if spearman_max is not None:
            max_corrs.append(('Spearman', spearman_max))

    if 'pearson_corr' in locals():
        pearson_max = _max_offdiag_abs_corr(pearson_corr)
        if pearson_max is not None:
            max_corrs.append(('Pearson', pearson_max))

    if max_corrs:
        print("\n📈 Maximum absolute correlations:")
        for method, max_corr in max_corrs:
            print(f"   - {method}: {max_corr:.3f}")
else:
    print("❌ Could not perform correlation analysis")

print("\n💡 Next step: Manually review and decide on feature selection if needed")
print("   Both Spearman and Pearson analyses provide complementary insights")

In [None]:
# Fix the modelling inputs: X becomes an independent copy of the engineered
# features, y is the target extracted earlier.
print("PREPARING FINAL X AND Y:")
print("=" * 50)

# No feature is removed automatically on the basis of correlation; X_temp is
# taken as it is.
X = X_temp.copy(deep=True)

print("Final dataset dimensions:")
print(f"  - X (features): {X.shape}")
print(f"  - y (target): {y.shape}")
print(f"  - Total features: {X.shape[1]}")

print("\n✅ X and y ready for modeling!")

## 7. Export Processed Data

Save the final preprocessed features (X) and target (y) for modeling.

In [None]:
# Write the final feature matrix and target to CSV for the modelling notebook
print("EXPORTING PROCESSED DATA:")
print("=" * 50)

# Make sure the output folder exists (nothing happens if it already does)
import os
processed_data_path = "../../data/processed"
os.makedirs(processed_data_path, exist_ok=True)

# One file for the features (X), one for the target (y)
X_file = os.path.join(processed_data_path, "X_features.csv")
y_file = os.path.join(processed_data_path, "y_target.csv")

# The index is left out of both files
X.to_csv(X_file, index=False)
y.to_csv(y_file, index=False)

print(f"✅ Features saved to: {X_file}")
print(f"   Shape: {X.shape}")
print(f"   Columns: {X.columns.tolist()}")

print(f"\n✅ Target saved to: {y_file}")
print(f"   Shape: {y.shape}")
print(f"   Target variable: {TARGET_VARIABLE}")

print("\n📊 Final dataset summary:")
print(f"   Total samples: {len(X)}")
print(f"   Total features: {X.shape[1]}")
print(f"   Target classes: {sorted(y.unique())}")

print("\n🎯 Ready for modeling in next notebook!")