# Customer Churn Prediction - Data Exploration

**Author:** Shashank Lodhi  
**Date:** November 2025  
**Objective:** Explore and understand the telecom customer churn dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning so library chatter does not clutter the cell output.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and data libraries; narrow the filter if those need to stay visible.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The check mark had been stored as mis-decoded UTF-8 bytes; restored here.
print('✅ Libraries imported successfully!')

## 1. Load Dataset

In [None]:
# Read the raw Telco churn CSV (path is relative to the notebook's folder),
# report its (rows, columns) shape, then preview the first rows as cell output.
raw_csv_path = '../data/raw/Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)

print('Dataset shape: {}'.format(df.shape))
print('\nFirst few rows:')
df.head()

## 2. Basic Information

In [None]:
# Structural overview first (df.info() prints its own report), then a ruled
# separator and the summary statistics as the cell's displayed result.
print('Dataset Info:')
df.info()

separator = '=' * 50
print('\n' + separator)
print('Statistical Summary:')
df.describe()

## 3. Missing Values Analysis

In [None]:
# Count NaN entries per column and keep only the columns that have at least
# one, largest count first.
# NOTE(review): isnull() does not flag blank or whitespace-only strings, so
# non-numeric text entries (such as the TotalCharges values that a later cell
# coerces with pd.to_numeric(..., errors='coerce')) are not counted here.
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

print('Missing Values:')
if missing.empty:
    # Nothing to chart: a bar plot of an empty Series is blank at best and
    # can raise inside pandas' plotting code, so report and skip the figure.
    print('No missing values found.')
else:
    print(missing)

    # Horizontal bar chart of the NaN count for each affected column.
    plt.figure(figsize=(10, 4))
    missing.plot(kind='barh', color='#FF5459')
    plt.title('Missing Values by Column')
    plt.xlabel('Count')
    plt.show()

## 4. Churn Distribution

In [None]:
# Count customers per Churn label and derive the overall churn rate (%).
churn_counts = df['Churn'].value_counts()
# .get() avoids a KeyError when the data contains no 'Yes' rows at all.
churn_rate = (churn_counts.get('Yes', 0) / len(df)) * 100

print(f'Churn Rate: {churn_rate:.2f}%')
print('\nChurn Distribution:')
print(churn_counts)

# value_counts() orders labels by frequency, so the label order is data
# dependent. Look colours up by label (instead of by position) so that each
# bar/wedge keeps its colour and its real tick label whatever the order is.
churn_palette = {'No': '#21808D', 'Yes': '#FF5459'}
churn_colors = [
    churn_palette.get(label, '#999999') for label in churn_counts.index
]

# Side-by-side view: absolute counts (bar) and proportions (pie).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart; rot=0 keeps the tick labels taken from the index horizontal.
churn_counts.plot(kind='bar', ax=ax1, color=churn_colors, rot=0)
ax1.set_title('Churn Distribution')
ax1.set_xlabel('Churn')
ax1.set_ylabel('Count')

# Pie chart with percentage annotations on each wedge.
churn_counts.plot(
    kind='pie',
    ax=ax2,
    autopct='%1.1f%%',
    colors=churn_colors,
    startangle=90,
)
ax2.set_title('Churn Percentage')
ax2.set_ylabel('')

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# For each selected categorical feature: compute the churn rate (%) within
# every category, chart it as a bar plot, then print the same numbers.
categorical_cols = ['Contract', 'PaymentMethod', 'InternetService']


def _churn_percentage(labels):
    """Return the share of 'Yes' entries in *labels*, as a percentage."""
    return (labels == 'Yes').sum() / len(labels) * 100


for feature in categorical_cols:
    per_category = df.groupby(feature)['Churn'].apply(_churn_percentage)
    churn_by_cat = per_category.sort_values(ascending=False)

    # One figure per feature, categories ordered from highest churn rate down.
    fig, ax = plt.subplots(figsize=(10, 4))
    churn_by_cat.plot(kind='bar', color='#32B8C6', ax=ax)
    ax.set_title(f'Churn Rate by {feature}')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel(feature)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Text version of the chart, closed by a ruled separator.
    print(f'\nChurn Rate by {feature}:')
    print(churn_by_cat)
    print('=' * 50)

## 6. Numerical Features Analysis

In [None]:
# Compare the distribution of each numerical feature between churned and
# retained customers, then print per-group descriptive statistics.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Convert TotalCharges to numeric; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Row selectors for the two groups, built once and reused for every feature.
churned_mask = df['Churn'] == 'Yes'
retained_mask = df['Churn'] == 'No'

for col in numerical_cols:
    plt.figure(figsize=(12, 4))

    # Use the same 30 bin edges (computed over the whole column, NaN removed)
    # for both groups. Letting each histogram pick its own bins would give
    # the two overlays different bin widths, so their bar heights could not
    # be compared directly.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=30)

    # Distribution by churn
    df.loc[churned_mask, col].hist(
        bins=bin_edges, alpha=0.7, label='Churned', color='#FF5459'
    )
    df.loc[retained_mask, col].hist(
        bins=bin_edges, alpha=0.7, label='Retained', color='#21808D'
    )

    plt.title(f'{col} Distribution by Churn Status')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

    print(f'\n{col} Statistics:')
    print(df.groupby('Churn')[col].describe())
    print('=' * 80)

## 7. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns only, drawn as an
# annotated heatmap on a diverging colour scale centred at zero.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

## 8. Key Insights

### Summary of Findings:

1. **Churn Rate**: 26.5% of customers churn
2. **Contract Type**: Month-to-month contracts have highest churn
3. **Tenure**: New customers (<12 months) churn more
4. **Payment Method**: Electronic check users churn more
5. **Monthly Charges**: Higher charges correlate with higher churn

### Next Steps:

- Feature engineering
- Handle class imbalance
- Model training and evaluation