# Customer Churn Prediction - Feature Engineering

This notebook implements feature engineering techniques to create predictive variables for the customer churn prediction model.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler

# Set up plotting
%matplotlib inline
plt.style.use('seaborn-whitegrid')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Create directories for saving outputs
os.makedirs('../data/processed', exist_ok=True)
os.makedirs('../models', exist_ok=True)

## 1. Load the Cleaned Data

In [None]:
# Read the cleaned churn dataset from disk
df_cleaned = pd.read_csv('../data/cleaned/churn_cleaned.csv')

# Summarise what was loaded: dimensions first, then column names,
# and finally a preview of the first rows as the cell output
print("Dataset shape: {}".format(df_cleaned.shape))
print("\nColumns: {}".format(df_cleaned.columns.tolist()))
print("\nSample data:")
df_cleaned.head()

## 2. Create Base Features

In [None]:
# Create a copy of the dataframe for feature engineering so df_cleaned stays untouched
df_engineered = df_cleaned.copy()

# One-hot encode categorical variables.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 would otherwise emit bool
# columns, which are inconsistent with the other 0/1 flags built in this notebook
# (all created via .astype(int)) and would be written to CSV as True/False.
df_engineered = pd.get_dummies(
    df_engineered,
    columns=['Geography', 'Gender'],
    drop_first=False,
    dtype=int,
)

## 3. Create Age-related Features

In [None]:
# Create age-related features
# AgeGroup uses left-closed bins: [0, 30), [30, 40), [40, 50), [50, 60), [60, inf).
# pd.cut defaults to right-closed bins, which would put Age 30 in group 0 and Age 50
# in group 2, contradicting IsYoung (< 30) and IsSenior (>= 50) below.
# The open-ended last bin means no non-negative age maps to NaN, which would make
# .astype(int) raise.
age_bin_edges = [0, 30, 40, 50, 60, np.inf]
df_engineered['AgeGroup'] = pd.cut(
    df_engineered['Age'],
    bins=age_bin_edges,
    labels=[0, 1, 2, 3, 4],
    right=False,
).astype(int)
df_engineered['IsYoung'] = (df_engineered['Age'] < 30).astype(int)
df_engineered['IsMiddleAged'] = ((df_engineered['Age'] >= 30) & (df_engineered['Age'] < 50)).astype(int)
df_engineered['IsSenior'] = (df_engineered['Age'] >= 50).astype(int)
df_engineered['IsRetirementAge'] = (df_engineered['Age'] >= 65).astype(int)

# Display the new features
age_features = ['Age', 'AgeGroup', 'IsYoung', 'IsMiddleAged', 'IsSenior', 'IsRetirementAge']
df_engineered[age_features].head(10)

## 4. Create Geography-related Features

In [None]:
# Interaction terms: the Germany indicator multiplied by age, balance and the senior flag
germany_indicator = df_engineered['Geography_Germany']
germany_interactions = [
    ('GermanyXAge', 'Age'),
    ('GermanyXBalance', 'Balance'),
    ('GermanyXSenior', 'IsSenior'),
]
for interaction_name, partner_column in germany_interactions:
    df_engineered[interaction_name] = germany_indicator * df_engineered[partner_column]

# Preview the geography indicators next to the derived interaction columns
geography_features = [
    'Geography_France',
    'Geography_Germany',
    'Geography_Spain',
    'GermanyXAge',
    'GermanyXBalance',
    'GermanyXSenior',
]
df_engineered[geography_features].head(10)

## 5. Create Balance-related Features

In [None]:
# Balance-derived features: zero-balance flag, balance relative to salary, high-balance flag
high_balance_threshold = 100000  # This is an approximation, should be based on domain knowledge
account_balance = df_engineered['Balance']
df_engineered = df_engineered.assign(
    HasZeroBalance=account_balance.eq(0).astype(int),
    # +1 in the denominator guards against division by zero when EstimatedSalary is 0
    BalanceToSalaryRatio=account_balance / (df_engineered['EstimatedSalary'] + 1),
    HasHighBalance=account_balance.gt(high_balance_threshold).astype(int),
)

# Preview the raw balance alongside the derived columns
balance_features = ['Balance', 'HasZeroBalance', 'BalanceToSalaryRatio', 'HasHighBalance']
df_engineered[balance_features].head(10)

## 6. Create Product-related Features

In [None]:
# Product-count features: two threshold flags plus interactions with age, balance and tenure
product_count = df_engineered['NumOfProducts']
df_engineered['HasMultipleProducts'] = product_count.gt(1).astype(int)
df_engineered['HasManyProducts'] = product_count.ge(3).astype(int)
for partner_column in ('Age', 'Balance', 'Tenure'):
    # Produces ProductsXAge, ProductsXBalance and ProductsXTenure, in that order
    df_engineered[f'ProductsX{partner_column}'] = product_count * df_engineered[partner_column]

# Preview the raw product count alongside the derived columns
product_features = [
    'NumOfProducts',
    'HasMultipleProducts',
    'HasManyProducts',
    'ProductsXAge',
    'ProductsXBalance',
    'ProductsXTenure',
]
df_engineered[product_features].head(10)

## 7. Create Tenure-related Features

In [None]:
# Tenure-derived features: new / long-term flags, a squared term, and tenure-weighted balance
tenure_years = df_engineered['Tenure']
df_engineered = df_engineered.assign(
    IsNewCustomer=tenure_years.le(1).astype(int),
    IsLongTermCustomer=tenure_years.ge(8).astype(int),
    TenureSquared=tenure_years.pow(2),
    # Divided by 1000 purely to keep the magnitude readable
    CustomerValue=tenure_years * df_engineered['Balance'] / 1000,
)

# Preview the raw tenure alongside the derived columns
tenure_features = ['Tenure', 'IsNewCustomer', 'IsLongTermCustomer', 'TenureSquared', 'CustomerValue']
df_engineered[tenure_features].head(10)

## 8. Create Engagement-related Features

In [None]:
# Engagement features built around the IsActiveMember flag
active_flag = df_engineered['IsActiveMember']

# Weighted blend: activity (0.5), credit card ownership (0.3), product count scaled by 4 (0.2)
activity_component = active_flag * 0.5
card_component = df_engineered['HasCrCard'] * 0.3
products_component = (df_engineered['NumOfProducts'] / 4) * 0.2
df_engineered['EngagementScore'] = activity_component + card_component + products_component

# Interactions of the activity flag with tenure, product count and age
active_interactions = [
    ('ActiveXTenure', 'Tenure'),
    ('ActiveXProducts', 'NumOfProducts'),
    ('ActiveXAge', 'Age'),
]
for interaction_name, partner_column in active_interactions:
    df_engineered[interaction_name] = active_flag * df_engineered[partner_column]

# Flag customers who are both inactive and senior
df_engineered['InactiveSenior'] = (active_flag.eq(0) & df_engineered['IsSenior'].eq(1)).astype(int)

# Preview the source flags alongside the derived columns
engagement_features = [
    'IsActiveMember',
    'HasCrCard',
    'EngagementScore',
    'ActiveXTenure',
    'ActiveXProducts',
    'ActiveXAge',
    'InactiveSenior',
]
df_engineered[engagement_features].head(10)

## 9. Create Risk Score Features

In [None]:
# Hand-weighted risk scores; each is a fixed linear combination of 0/1 indicators.
# Name the shared indicator terms once so the three formulas below are easy to compare.
senior_flag = df_engineered['IsSenior']
germany_flag = df_engineered['Geography_Germany']
inactive_flag = 1 - df_engineered['IsActiveMember']
many_products_flag = df_engineered['HasManyProducts']
not_male_flag = (df_engineered['Gender_Male'] == 0).astype(int)
no_card_flag = 1 - df_engineered['HasCrCard']

# Overall score: seniority, Germany, inactivity, many products, not-male
df_engineered['ChurnRiskScore'] = (
    senior_flag * 0.25
    + germany_flag * 0.20
    + inactive_flag * 0.25
    + many_products_flag * 0.20
    + not_male_flag * 0.10
)

# Demographic-only score
df_engineered['DemographicRiskScore'] = (
    senior_flag * 0.4
    + germany_flag * 0.4
    + not_male_flag * 0.2
)

# Product / engagement-only score
df_engineered['ProductRiskScore'] = (
    many_products_flag * 0.5
    + inactive_flag * 0.3
    + no_card_flag * 0.2
)

# Preview the three scores
risk_features = ['ChurnRiskScore', 'DemographicRiskScore', 'ProductRiskScore']
df_engineered[risk_features].head(10)

## 10. Feature Scaling

In [None]:
# Continuous columns to standardise; the 0/1 flags and hand-built scores are left as they are
numerical_features_to_scale = [
    # raw numeric inputs
    'CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary',
    # engineered ratios and interactions
    'BalanceToSalaryRatio', 'ProductsXAge', 'ProductsXBalance', 'ProductsXTenure',
    'TenureSquared', 'CustomerValue', 'GermanyXAge', 'GermanyXBalance',
]

# Work on a copy so df_engineered keeps the unscaled values
df_scaled = df_engineered.copy()

# Fit the scaler on the selected columns and write the standardised values back.
# NOTE(review): the scaler is fitted on every row of the frame, i.e. before any
# train/test split is made in this notebook.
scaler = StandardScaler()
standardised_values = scaler.fit_transform(df_scaled[numerical_features_to_scale])
df_scaled[numerical_features_to_scale] = standardised_values

# Preview the scaled columns
df_scaled[numerical_features_to_scale].head()

## 11. Feature Selection

In [None]:
# Split the scaled frame into the target column and the candidate feature matrix
y = df_scaled['Exited']
X = df_scaled.drop(columns='Exited')

# Every remaining column is a candidate feature
all_features = X.columns.tolist()
print("Total number of features: {}".format(len(all_features)))

### 11.1 ANOVA F-value Feature Selection

In [None]:
# Score every feature against the target with the ANOVA F-test (selector keeps the best 20)
selector_f = SelectKBest(score_func=f_classif, k=20).fit(X, y)

# Tabulate the per-feature F statistic and p-value, highest F first
f_scores = (
    pd.DataFrame(
        {
            'Feature': all_features,
            'F_Score': selector_f.scores_,
            'P_Value': selector_f.pvalues_,
        }
    )
    .sort_values('F_Score', ascending=False)
)

# Show the 20 highest-scoring features
print("Top 20 features selected by ANOVA F-value:")
f_scores.head(20)

In [None]:
# Horizontal bar chart of the 20 highest F-scores, saved to disk and shown inline
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x='F_Score', y='Feature', data=f_scores.head(20), ax=ax)
ax.set_title('Top 20 Features by F-Score', fontsize=15)
ax.set_xlabel('F-Score', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
fig.tight_layout()
fig.savefig('../docs/plots/top_features_f_score.png', dpi=300, bbox_inches='tight')
plt.show()

### 11.2 Mutual Information Feature Selection

In [None]:
# Select top features using Mutual Information
# mutual_info_classif is stochastic (it adds small random noise to continuous
# features), so without a fixed seed the ranking and the top-20 list can change
# from one run to the next. Pin the seed to make the notebook reproducible.
MI_RANDOM_STATE = 42


def seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random_state.

    SelectKBest calls score_func(X, y) with no extra arguments, so the seed is
    bound here rather than passed through the selector.
    """
    return mutual_info_classif(features, target, random_state=MI_RANDOM_STATE)


selector_mi = SelectKBest(score_func=seeded_mutual_info, k=20)
selector_mi.fit(X, y)

# Get feature scores
mi_scores = pd.DataFrame({
    'Feature': all_features,
    'MI_Score': selector_mi.scores_
})

# Sort by MI-score
mi_scores = mi_scores.sort_values('MI_Score', ascending=False)

# Display top features
print("Top 20 features selected by Mutual Information:")
mi_scores.head(20)

In [None]:
# Horizontal bar chart of the 20 highest mutual-information scores, saved and shown inline
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x='MI_Score', y='Feature', data=mi_scores.head(20), ax=ax)
ax.set_title('Top 20 Features by Mutual Information', fontsize=15)
ax.set_xlabel('Mutual Information Score', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
fig.tight_layout()
fig.savefig('../docs/plots/top_features_mi_score.png', dpi=300, bbox_inches='tight')
plt.show()

### 11.3 Compare Feature Selection Methods

In [None]:
# Collect the top-20 feature names from each ranking as sets
top_f_features = set(f_scores['Feature'].head(20))
top_mi_features = set(mi_scores['Feature'].head(20))

# Features that both methods agree on
common_features = top_f_features & top_mi_features
print("Number of common features: {}".format(len(common_features)))
print("Common features: {}".format(sorted(common_features)))

# Features picked by only one of the two methods
unique_f_features = top_f_features.difference(top_mi_features)
unique_mi_features = top_mi_features.difference(top_f_features)

print("\nFeatures unique to F-score: {}".format(sorted(unique_f_features)))
print("Features unique to MI-score: {}".format(sorted(unique_mi_features)))

## 12. Create Feature Sets

In [None]:
# Create different feature sets (name -> list of column names); these are later
# serialised to JSON, so every value must be a plain list.
feature_sets = {
    'all': all_features,
    'original': ['CreditScore', 'Geography_France', 'Geography_Germany', 'Geography_Spain', 
                'Gender_Female', 'Gender_Male', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                'HasCrCard', 'IsActiveMember', 'EstimatedSalary'],
    'f_score_top20': f_scores.head(20)['Feature'].tolist(),
    'mi_score_top20': mi_scores.head(20)['Feature'].tolist(),
    # common_features is a set; sort it so the saved list has a stable order
    # across runs (set iteration order of strings varies with hash randomisation).
    'common': sorted(common_features),
    'selected_top': [
        # Top features from both methods
        'Age', 'IsSenior', 'IsActiveMember', 'Geography_Germany', 'InactiveSenior',
        'ChurnRiskScore', 'DemographicRiskScore', 'ProductRiskScore',
        'HasManyProducts', 'HasMultipleProducts', 'NumOfProducts',
        'ActiveXProducts', 'GermanyXSenior', 'Balance', 'HasZeroBalance',
        # Additional features based on domain knowledge
        'Gender_Male', 'Tenure', 'IsNewCustomer', 'EngagementScore'
    ]
}

# Fail fast if a hand-written set names a column that is not in the feature matrix;
# otherwise the typo would only surface when the saved JSON is consumed.
available_columns = set(all_features)
for name, features in feature_sets.items():
    missing = [feature for feature in features if feature not in available_columns]
    assert not missing, f"Feature set '{name}' references unknown columns: {missing}"

# Print feature set sizes
for name, features in feature_sets.items():
    print(f"{name}: {len(features)} features")

## 13. Save Engineered Data

In [None]:
# Write both versions of the engineered frame (unscaled first, then scaled) as CSV without the index
csv_outputs = [
    (df_engineered, '../data/processed/churn_engineered.csv'),
    (df_scaled, '../data/processed/churn_engineered_scaled.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

# Persist the named feature sets as indented JSON
with open('../models/feature_sets.json', 'w') as f:
    f.write(json.dumps(feature_sets, indent=4))

print("Engineered data and feature sets saved successfully.")