# Customer Churn Prediction - Data Exploration

This notebook covers:
- Generating, saving, and loading a synthetic telecom customer dataset
- Descriptive statistics and data quality checks
- Churn rate analysis
- Distribution visualizations
- Correlation analysis

In [None]:
# Import libraries
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Make the project's src/ directory importable before loading local modules
sys.path.append('../src')

from data_loader import TelecomDataGenerator, DataLoader, save_dataset

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

print("✓ Libraries imported successfully")

## 1. Generate and Load Dataset

In [None]:
# Settings for the synthetic telecom dataset and where its CSV copy is written
N_SAMPLES = 7043
RANDOM_STATE = 42
data_path = Path('../data/raw/telecom_churn.csv')

# Build the synthetic frame with a fixed seed, then persist it to data_path
generator = TelecomDataGenerator(n_samples=N_SAMPLES, random_state=RANDOM_STATE)
df = generator.generate_dataset()
save_dataset(df, data_path)

# Quick sanity summary of what was generated
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

In [None]:
# Preview the top of the frame; as the cell's last expression it renders as a table
N_PREVIEW_ROWS = 10
df.head(N_PREVIEW_ROWS)

## 2. Data Quality Assessment

In [None]:
# Validate data
# NOTE(review): the loader is constructed with the CSV path, but validation is run
# on the in-memory `df` rather than on a frame re-read from that file — confirm
# this is intentional.
loader = DataLoader(data_path)
# The returned report is stored but not displayed by this cell; presumably
# validate_data prints its own summary — TODO confirm.
validation_report = loader.validate_data(df)

In [None]:
# Column dtypes, printed beneath a plain-text heading
print("Data Types:", df.dtypes, sep="\n")

In [None]:
# Descriptive statistics for numeric features
# (by default describe() summarises the numeric columns: count, mean, std, min,
# quartiles and max)
df.describe()

In [None]:
# Descriptive statistics for categorical features
# (restricting to object-dtype columns yields count, unique, top and freq per column)
df.describe(include=['object'])

## 3. Churn Rate Analysis

In [None]:
# Overall churn rate.
# value_counts() ignores missing labels, so counts and percentages cover only rows
# that actually carry a Churn value.
churn_counts = df['Churn'].value_counts()
churn_pct = df['Churn'].value_counts(normalize=True) * 100

# .get() keeps the summary printable even if one class is absent from the data
print("Churn Distribution:")
print(f"No Churn: {churn_counts.get('No', 0)} ({churn_pct.get('No', 0.0):.2f}%)")
print(f"Churn: {churn_counts.get('Yes', 0)} ({churn_pct.get('Yes', 0.0):.2f}%)")

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Use one explicit class order for both panels. Without it the count plot orders
# classes by first appearance in the data while the pie follows value_counts()
# order, so the same class could be drawn in different colors in the two panels.
class_order = churn_counts.index.tolist()

# Count plot
sns.countplot(data=df, x='Churn', order=class_order, palette='Set2', ax=ax1)
ax1.set_title('Churn Distribution (Count)', fontsize=14, fontweight='bold')
ax1.set_ylabel('Count')
# Annotate every bar with its count
for container in ax1.containers:
    ax1.bar_label(container)

# Pie chart (wedges follow churn_counts order, i.e. class_order)
colors = sns.color_palette('Set2')
ax2.pie(churn_counts, labels=churn_counts.index, autopct='%1.1f%%',
        colors=colors, startangle=90)
ax2.set_title('Churn Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Feature Distributions

In [None]:
# Numeric features distribution
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
n_numeric = len(numeric_cols)

# Grid layout: the first row holds one histogram per feature, the second row the
# matching box plot split by churn status. The column count is derived from
# numeric_cols so adding or removing a feature cannot collide with another panel
# or index past the grid.
fig, axes = plt.subplots(2, n_numeric, figsize=(16, 10))
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):
    hist_ax = axes[idx]
    box_ax = axes[idx + n_numeric]  # same column, second row

    # Histogram (missing values are dropped before binning)
    hist_ax.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'{col} Distribution', fontweight='bold')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')

    # Box plot by churn
    sns.boxplot(data=df, x='Churn', y=col, palette='Set2', ax=box_ax)
    box_ax.set_title(f'{col} by Churn Status', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Categorical features vs Churn
categorical_cols = ['Contract', 'InternetService', 'PaymentMethod', 'gender']

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

# One panel per feature, pairing each axis with its column
for ax, col in zip(axes, categorical_cols):
    # Row-normalised crosstab scaled to percent: within each category the
    # churn / no-churn shares add up to 100
    ct = pd.crosstab(df[col], df['Churn'], normalize='index').mul(100)
    ct.plot.bar(ax=ax, color=['#2ecc71', '#e74c3c'])
    ax.set_title(f'Churn Rate by {col}', fontweight='bold', fontsize=12)
    ax.set(xlabel=col, ylabel='Percentage (%)')
    ax.legend(title='Churn', labels=['No', 'Yes'])
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Prepare data for correlation (encode categorical variables).
# Work on a copy so the original frame `df` is left untouched for other cells.
df_encoded = df.copy()

# Label encode binary variables.
# LabelEncoder assigns integer codes in sorted label order (e.g. 'No' -> 0,
# 'Yes' -> 1), which determines the sign of each correlation shown below.
le = LabelEncoder()
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

for col in binary_cols:
    # Columns missing from the frame are skipped rather than raising
    if col in df_encoded.columns:
        # Cast to str first so mixed or missing values are encoded as labels too
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

# Select numeric columns for correlation (non-encoded text columns are left out)
numeric_features = df_encoded.select_dtypes(include=[np.number]).columns.tolist()

# Calculate correlation matrix
correlation_matrix = df_encoded[numeric_features].corr()

# Plot heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Correlation with Churn, strongest positive association first
churn_correlation = correlation_matrix['Churn'].sort_values(ascending=False)
print("Features Correlation with Churn:", churn_correlation, sep="\n")

# Horizontal bars for every feature except Churn itself, drawn on explicit axes
fig, ax = plt.subplots(figsize=(10, 8))
churn_correlation.drop('Churn').plot.barh(color='steelblue', ax=ax)
ax.set_title('Feature Correlation with Churn', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
# Dashed zero line separates positive from negative correlations
ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

## 6. Key Insights

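Based on the exploratory analysis of the synthetic dataset generated in Section 1 (re-check these points against the outputs above if the generator settings change):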

1. **Churn Rate**: Approximately 26-27% of customers churn
2. **Contract Type**: Month-to-month contracts have significantly higher churn rates
3. **Tenure**: Customers with shorter tenure are more likely to churn
4. **Internet Service**: Fiber optic customers show higher churn rates
5. **Payment Method**: Electronic check users have higher churn
6. **Monthly Charges**: Higher charges correlate with increased churn

These insights will guide our feature engineering and modeling approach.

In [None]:
# Closing summary: completion notice, where the dataset was saved, and what follows
closing_lines = (
    "✓ Data exploration completed successfully!",
    f"\nDataset saved to: {data_path}",
    "\nNext steps:",
    "1. Feature Engineering (Notebook 02)",
    "2. Model Training (Notebook 03)",
)
print(*closing_lines, sep="\n")