# Customer Churn Prediction - Data Exploration

This notebook explores the customer churn dataset, performs initial analysis, and prepares data for model training.

## Learning Objectives

- Load and inspect the dataset
- Perform exploratory data analysis (EDA)
- Identify patterns and relationships
- Handle missing values and outliers
- Create visualizations
- Prepare data for preprocessing

In [None]:
# Import required libraries
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Plot styling: matplotlib style sheet plus seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# pandas display options
for option_name, option_value in (('display.max_columns', 50), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Default figure size and font size for every matplotlib figure below
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

# Import our custom modules
# NOTE: '../src' is a relative entry, so it is resolved against the kernel's
# working directory; the project imports below only resolve when the notebook
# is run from its own folder. The path must be appended before those imports.
import sys
sys.path.append('../src')

# create_sample_dataset is used by the data-loading cell; ChurnDataPreprocessor
# is imported here but not referenced by the cells visible in this notebook.
from data.preprocessing import ChurnDataPreprocessor, create_sample_dataset
from utils.logging import get_logger

# Logger obtained from the project's logging helper
logger = get_logger(__name__)

print("Libraries imported successfully!")

## 1. Load and Inspect Data

In [None]:
# Single definition of the raw data location (was repeated as a literal three times)
RAW_DATA_PATH = Path('../data/raw/customer_churn.csv')

# Create sample dataset if it doesn't exist
if not RAW_DATA_PATH.exists():
    print("Creating sample dataset...")
    # Make sure the target folder exists before anything is written into it,
    # so the cell also works on a fresh checkout without a data/raw directory.
    RAW_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    # The helper is still given a plain string path, exactly as before.
    sample_df = create_sample_dataset(5000, str(RAW_DATA_PATH))
    print(f"Sample dataset created with {len(sample_df)} rows")
else:
    print("Dataset already exists")

# Load the dataset
df = pd.read_csv(RAW_DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nDataset columns: {list(df.columns)}")

In [None]:
# Display first few rows
print("First 5 rows of the dataset:")
display(df.head())

# Display dataset information
print("\nDataset information:")
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in display() (that rendered a stray "None" under the summary).
df.info()

In [None]:
# Summary statistics of the numeric columns, followed by the dtype of every column
overview_sections = (
    ("Descriptive statistics:", df.describe()),
    ("\nData types:", df.dtypes),
)

for heading, table in overview_sections:
    print(heading)
    display(table)

## 2. Target Variable Analysis

In [None]:
# Analyze target variable (churn)
CHURN_LABELS = {0: 'No Churn', 1: 'Churn'}

# value_counts() orders by frequency, so sort by class value to get a stable
# 0, 1 order that does not depend on which class happens to be the majority.
churn_counts = df['churn'].value_counts().sort_index()
churn_percentage = churn_counts / churn_counts.sum() * 100

print("Churn distribution:")
# .get() keeps the summary working when one of the classes is absent
print(f"No Churn (0): {churn_counts.get(0, 0)} ({churn_percentage.get(0, 0.0):.1f}%)")
print(f"Churn (1): {churn_counts.get(1, 0)} ({churn_percentage.get(1, 0.0):.1f}%)")

# Visualize churn distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Count plot
sns.countplot(data=df, x='churn', ax=ax1)
ax1.set_title('Churn Count Distribution')
ax1.set_xlabel('Churn (0=No, 1=Yes)')
ax1.set_ylabel('Count')

# Pie chart: derive each wedge label from the class value it represents
# instead of assuming the first wedge is always "No Churn".
pie_labels = [CHURN_LABELS.get(value, str(value)) for value in churn_counts.index]
ax2.pie(churn_counts, labels=pie_labels, autopct='%1.1f%%', startangle=90)
ax2.set_title('Churn Percentage Distribution')

plt.tight_layout()
plt.show()

## 3. Missing Values Analysis

In [None]:
# Count missing entries per column and express them as a share of all rows
null_mask = df.isnull()
missing_values = null_mask.sum()
missing_percentage = (missing_values / len(df)) * 100

missing_info = pd.concat(
    [missing_values.rename('Count'), missing_percentage.rename('Percentage')],
    axis=1,
)

print("Missing values analysis:")
has_missing = missing_info['Count'] > 0
display(missing_info.loc[has_missing].sort_values('Count', ascending=False))

# Heatmap of the null mask: one row per record, one column per feature
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
fig.tight_layout()
plt.show()

## 4. Numerical Features Analysis

In [None]:
# Identify numerical features (the target and the senior_citizen column are left out)
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features = [col for col in numerical_features if col not in ['churn', 'senior_citizen']]

print(f"Numerical features: {numerical_features}")

# Plot distributions of numerical features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Legend entries are renamed by the class value they show, not by position:
# a label-only ax.legend([...]) call pairs labels with artists in drawing
# order, which does not have to match the 0/1 order of the hue levels.
churn_legend_labels = {'0': 'No Churn', '1': 'Churn'}
plotted_features = numerical_features[:4]

for ax, feature in zip(axes, plotted_features):
    sns.histplot(data=df, x=feature, hue='churn', kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature.replace("_", " ").title()}')

    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Churn')
        for entry in legend.get_texts():
            entry.set_text(churn_legend_labels.get(entry.get_text(), entry.get_text()))

# Hide grid cells that received no plot (fewer than four numerical features)
for ax in axes[len(plotted_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numerical features and the churn target
correlation_columns = [*numerical_features, 'churn']
correlation_matrix = df.loc[:, correlation_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
fig.tight_layout()
plt.show()

# Rank every column by its correlation with churn, strongest positive first
print("Correlation with churn:")
churn_correlation = correlation_matrix['churn'].sort_values(ascending=False)
display(churn_correlation)

## 5. Categorical Features Analysis

In [None]:
# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

print(f"Categorical features: {categorical_features}")

# Plot categorical features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Rename legend entries by the class value they show rather than by position,
# so the labels stay correct regardless of the order artists were drawn in.
churn_legend_labels = {'0': 'No Churn', '1': 'Churn'}
plotted_features = categorical_features[:4]

for ax, feature in zip(axes, plotted_features):
    sns.countplot(data=df, x=feature, hue='churn', ax=ax)
    ax.set_title(f'{feature.replace("_", " ").title()} by Churn')
    ax.tick_params(axis='x', rotation=45)

    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Churn')
        for entry in legend.get_texts():
            entry.set_text(churn_legend_labels.get(entry.get_text(), entry.get_text()))

# Hide grid cells that received no plot (fewer than four categorical features)
for ax in axes[len(plotted_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# One bar chart per categorical feature: mean churn per category, highest first
for feature in categorical_features[:3]:
    churn_rate = df.groupby(feature)['churn'].mean().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=churn_rate.index, y=churn_rate.values, ax=ax)
    ax.set_title(f'Churn Rate by {feature.replace("_", " ").title()}')
    ax.set_ylabel('Churn Rate')
    plt.setp(ax.get_xticklabels(), rotation=45)

    # Write the rate as a percentage just above each bar
    for position, rate in enumerate(churn_rate.values):
        ax.text(position, rate + 0.01, f'{rate:.1%}', ha='center', va='bottom')

    fig.tight_layout()
    plt.show()

## 6. Advanced Analysis

In [None]:
# Age vs Tenure analysis: colour encodes churn, marker size encodes monthly charges
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='age', y='tenure', hue='churn', size='monthly_charges',
                alpha=0.7, sizes=(20, 200), ax=ax)
ax.set_title('Age vs Tenure colored by Churn')
ax.set_xlabel('Age')
ax.set_ylabel('Tenure (months)')
# The legend lists both the hue and the size encoding, so name both in its
# title (same "A / B" convention as the monthly-vs-total charges plot).
ax.legend(title='Churn / Monthly Charges')
fig.tight_layout()
plt.show()

In [None]:
# Monthly vs total charges: colour encodes churn, marker shape encodes contract type
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='monthly_charges',
    y='total_charges',
    hue='churn',
    style='contract_type',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Monthly Charges vs Total Charges')
ax.set_xlabel('Monthly Charges ($)')
ax.set_ylabel('Total Charges ($)')
ax.legend(title='Churn / Contract')
fig.tight_layout()
plt.show()

In [None]:
# Service usage analysis
service_features = ['phone_service', 'online_security', 'tech_support']

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, feature in zip(axes, service_features):
    if feature not in df.columns:
        continue

    # Convert to string on a throw-away copy for plotting only. Writing the
    # string version back into df would change the column's dtype for every
    # later cell that still treats it as numeric (quantiles, row sums).
    service_view = df[[feature, 'churn']].astype({feature: str})

    # Churn rate and row count per category; the groupby order is reused as
    # the bar order so each label lands on the bar it describes.
    churn_rate = service_view.groupby(feature)['churn'].mean()
    category_counts = service_view[feature].value_counts()
    category_order = list(churn_rate.index)

    sns.countplot(data=service_view, x=feature, hue='churn',
                  order=category_order, ax=ax)
    ax.set_title(f'{feature.replace("_", " ").title()}')
    ax.set_ylabel('Count')

    # Calculate and display churn rate
    for position, category in enumerate(category_order):
        ax.text(position, category_counts[category] * 0.9,
                f'{churn_rate[category]:.1%}', ha='center', va='top', fontweight='bold')

plt.tight_layout()
plt.show()

## 7. Outlier Detection

In [None]:
# One box plot per numerical feature (first four) on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, feature in zip(axes, numerical_features[:4]):
    sns.boxplot(data=df, y=feature, ax=ax)
    ax.set_title(f'Box Plot of {feature.replace("_", " ").title()}')

fig.tight_layout()
plt.show()

# Function to detect outliers using IQR method
def detect_outliers_iqr(df, feature, multiplier=1.5):
    """Return the rows of ``df`` whose ``feature`` value lies outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Name of the column to inspect.
    multiplier : float, default 1.5
        Width of the fences in units of the interquartile range. The default
        reproduces the previous fixed behaviour.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        sub-frame of rows strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    values = df[feature]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Strict comparisons: values exactly on a fence are not outliers
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Report, per numerical feature, how many rows fall outside the IQR fences
print("Outlier Analysis:")
total_rows = len(df)

for feature in numerical_features:
    outliers, lower, upper = detect_outliers_iqr(df, feature)
    outlier_count = len(outliers)
    outlier_percentage = (outlier_count / total_rows) * 100

    summary_lines = [
        f"\n{feature}:",
        f"  Outliers: {outlier_count} ({outlier_percentage:.2f}%)",
        f"  Range: [{lower:.2f}, {upper:.2f}]",
    ]
    print("\n".join(summary_lines))

## 8. Feature Relationships

In [None]:
# Pair plot for key features
key_features = ['age', 'tenure', 'monthly_charges', 'total_charges', 'churn']

# pairplot is a figure-level function: it creates its own figure, so no
# plt.figure() call precedes it (that only produced an extra empty figure).
pair_grid = sns.pairplot(df[key_features], hue='churn', diag_kind='kde',
                         plot_kws={'alpha': 0.6}, corner=True)
# Title the grid's own figure explicitly instead of relying on "current figure"
pair_grid.figure.suptitle('Pair Plot of Key Features', y=1.02)
plt.show()

In [None]:
# Advanced: Create interaction features
# The +1 in both denominators avoids division by zero.
df['monthly_to_total_ratio'] = df['monthly_charges'] / (df['total_charges'] + 1)
df['tenure_to_age_ratio'] = df['tenure'] / (df['age'] + 1)

# Coerce the service flags to numbers before adding them up: summing string
# columns row-wise would concatenate text instead of counting services.
# NOTE(review): this assumes the flags are 0/1 (possibly stored as text);
# values that cannot be parsed become NaN and are ignored by the sum - confirm
# against the dataset schema.
service_columns = ['phone_service', 'online_security', 'tech_support']
service_flags = df[service_columns].apply(pd.to_numeric, errors='coerce')
df['services_count'] = service_flags.sum(axis=1)

# Plot new features
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

new_features = ['monthly_to_total_ratio', 'tenure_to_age_ratio', 'services_count']

for ax, feature in zip(axes, new_features):
    sns.boxplot(data=df, x='churn', y=feature, ax=ax)
    ax.set_title(f'{feature.replace("_", " ").title()} by Churn')
    ax.set_xlabel('Churn (0=No, 1=Yes)')

plt.tight_layout()
plt.show()

## 9. Data Quality Assessment

In [None]:
# Data quality report
def data_quality_report(df):
    """Collect basic quality metrics of ``df`` into a dictionary.

    Keys of the returned dict:
      - 'shape': ``df.shape``
      - 'columns': list of column labels
      - 'data_types': mapping dtype -> number of columns with that dtype
      - 'missing_values': dict with 'total_missing' (overall null count) and
        'columns_with_missing' (null count per column, only columns with nulls)
      - 'duplicates': number of rows flagged by ``df.duplicated()``
      - 'unique_values': distinct-value count per column (``df.nunique()``)
      - 'memory_usage': total bytes from ``df.memory_usage(deep=True)``
    """
    null_counts = df.isnull().sum()

    return {
        'shape': df.shape,
        'columns': list(df.columns),
        'data_types': df.dtypes.value_counts().to_dict(),
        'missing_values': {
            'total_missing': null_counts.sum(),
            'columns_with_missing': null_counts[null_counts > 0].to_dict(),
        },
        'duplicates': df.duplicated().sum(),
        'unique_values': df.nunique().to_dict(),
        'memory_usage': df.memory_usage(deep=True).sum(),
    }

quality_report = data_quality_report(df)
missing_summary = quality_report['missing_values']
memory_mb = quality_report['memory_usage'] / 1024 / 1024

# Headline numbers
print("Data Quality Report:")
print(f"Dataset shape: {quality_report['shape']}")
print(f"Total missing values: {missing_summary['total_missing']}")
print(f"Duplicate rows: {quality_report['duplicates']}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Per-column null counts, listed only when at least one column has nulls
columns_with_missing = missing_summary['columns_with_missing']
if columns_with_missing:
    print("\nColumns with missing values:")
    for col, count in columns_with_missing.items():
        print(f"  {col}: {count} missing")

# Cardinality of every column
print("\nUnique values per column:")
for col, unique_count in quality_report['unique_values'].items():
    print(f"  {col}: {unique_count} unique values")

## 10. Summary and Insights

In [None]:
# Key insights summary
# Only the first two lines are computed from the data; the remaining lines are
# fixed text and should be re-checked against the plots above whenever the
# dataset changes.
n_rows, n_cols = df.shape
churn_rate_pct = df['churn'].mean() * 100

insights = [
    f"1. Dataset contains {n_rows} customers with {n_cols} features",
    f"2. Churn rate is {churn_rate_pct:.1f}%",
    "3. Key features correlated with churn:",
    "   - Tenure (negative correlation)",
    "   - Monthly charges (positive correlation)",
    "   - Contract type (month-to-month has higher churn)",
    "   - Age (younger customers churn more)",
    "4. Data quality is good with minimal missing values",
    "5. Some outliers detected in numerical features",
    "6. Interaction features show promising patterns",
]

print("Key Insights:")
print("\n".join(insights))

# Recommendations for modeling
recommendations = [
    "1. Handle outliers in numerical features",
    "2. Create more interaction features",
    "3. Consider feature scaling for distance-based algorithms",
    "4. Use ensemble methods to capture complex patterns",
    "5. Address class imbalance if present",
    "6. Perform feature selection to reduce dimensionality",
]

print("\nRecommendations:")
print("\n".join(recommendations))

## 11. Save Processed Data

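Let's save a cleaned version of our data for the next steps. Missing values are imputed (median for numerical features, mode for categorical features) and numerical features are capped at the 1.5×IQR bounds before the file is written. The saved file also contains the interaction features created in section 8.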

In [None]:
# Clean the data (work on a copy so the explored frame stays untouched)
df_clean = df.copy()

# Only columns that are numeric at this point can be imputed with a median or
# capped by quantiles; anything else in the list is skipped.
numeric_columns = [
    col for col in numerical_features
    if pd.api.types.is_numeric_dtype(df_clean[col])
]

# Handle missing values (simple imputation)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to update df_clean itself.
for col in numeric_columns:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

for col in categorical_features:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        # mode() is empty for an all-missing column; nothing to impute with
        if not modes.empty:
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

# Handle outliers (capping at the 1.5 * IQR fences)
for col in numeric_columns:
    q1 = df_clean[col].quantile(0.25)
    q3 = df_clean[col].quantile(0.75)
    iqr = q3 - q1

    # With a zero IQR both fences coincide, and clipping would overwrite every
    # value in the column with that single number (e.g. a skewed 0/1 flag).
    if iqr == 0:
        continue

    df_clean[col] = df_clean[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

# Save cleaned data
output_path = '../data/processed/customer_churn_cleaned.csv'
# Create the output folder if needed so to_csv does not fail on a fresh checkout
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)

print(f"Cleaned data saved to: {output_path}")
print(f"Final dataset shape: {df_clean.shape}")
print(f"Missing values remaining: {df_clean.isnull().sum().sum()}")

## 12. Next Steps

This data exploration provides a solid foundation for the next phases:

1. **Feature Engineering**: Create more sophisticated features based on patterns discovered
2. **Data Preprocessing**: Scale features, handle categorical variables, split data
3. **Model Training**: Train and evaluate different ML algorithms
4. **Model Evaluation**: Compare performance and select best model
5. **Model Deployment**: Create API for predictions

The insights gained from this exploration will guide feature engineering and model selection decisions.