# Telco Customer Churn - Exploratory Data Analysis (EDA)

**Author:** Ngo Anh Hieu

This notebook contains **only EDA**. For model training, see `src/train_models.py`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning for the rest of the session (no category or module filter
# is given), which also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline

## 1. Load Data

In [None]:
# Read the churn CSV into `df`. The relative path is resolved against the
# kernel's current working directory.
csv_path = '../data/Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first rows as the cell output.
print(f'Dataset shape: {df.shape}')
df.head()

## 2. Data Overview

In [None]:
# Per-column dtype and non-null count, plus overall memory usage of the frame.
df.info()

In [None]:
# Summary statistics. By default only numeric columns are described, so any column
# still stored as text at this point does not appear in the table.
df.describe()

In [None]:
# Check missing values
print('Missing values:')
print(df.isnull().sum())

# isnull() only flags real NaN/None values. Blank or whitespace-only strings are
# not treated as NA, so they are counted separately here. Section 5 has to coerce
# TotalCharges with errors='coerce', which suggests that column contains such
# placeholders -- confirm against the output below.
blank_counts = df.apply(lambda col: col.astype(str).str.strip().eq('').sum())
print('\nBlank / whitespace-only values:')
print(blank_counts[blank_counts > 0])

## 3. Target Variable Analysis

In [None]:
# Class counts are computed once and shared by both panels.
churn_counts = df['Churn'].value_counts()

# value_counts() orders labels by frequency, so colours are looked up per label
# rather than by position: 'Yes' (churned) is always drawn in red/salmon, whatever
# the class balance. 'No' is assumed to be the other label; any unexpected label
# falls back to grey instead of silently borrowing a churn colour.
bar_palette = {'No': 'green', 'Yes': 'red'}
pie_palette = {'No': 'lightgreen', 'Yes': 'salmon'}
bar_colors = [bar_palette.get(label, 'grey') for label in churn_counts.index]
pie_colors = [pie_palette.get(label, 'lightgrey') for label in churn_counts.index]

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Count plot
churn_counts.plot(kind='bar', ax=ax[0], color=bar_colors)
ax[0].set_title('Churn Distribution')
ax[0].set_xlabel('Churn')
ax[0].set_ylabel('Count')

# Pie chart
churn_counts.plot(kind='pie', ax=ax[1], autopct='%1.1f%%', colors=pie_colors)
ax[1].set_title('Churn Percentage')
ax[1].set_ylabel('')  # remove the Series-name label pandas places beside the pie

plt.tight_layout()
plt.savefig('../models/churn_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Overall share of rows labelled 'Yes', as a percentage.
print(f'Churn Rate: {(df["Churn"] == "Yes").mean()*100:.1f}%')

## 4. Categorical Features vs Churn

In [None]:
# Categorical columns to profile; each one gets its own panel in a 2 x 4 grid.
cat_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'InternetService', 'Contract', 'PaymentMethod',
]

# Boolean churn flag, reused for every column below.
churned = df['Churn'].eq('Yes')

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

for panel, col in zip(axes, cat_cols):
    # Mean of the flag within each category level, expressed as a percentage.
    churn_rates = churned.groupby(df[col]).mean() * 100
    churn_rates.plot(kind='bar', ax=panel, color='coral')
    panel.set_title(f'Churn Rate by {col}')
    panel.set_ylabel('Churn Rate (%)')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../models/churn_by_categories.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Numeric Features vs Churn

In [None]:
# Convert TotalCharges to numeric. Entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['TotalCharges']: that form is chained in-place
# assignment, which raises a FutureWarning on pandas 2.x and does not modify `df`
# at all once Copy-on-Write is active.
# NOTE(review): filling with 0 presumes the unparseable rows are customers with no
# charges billed yet -- confirm against the data.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
assert df['TotalCharges'].notna().all(), 'TotalCharges still contains NaN after cleaning'

num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One box plot per numeric column, split by churn label.
for i, col in enumerate(num_cols):
    df.boxplot(column=col, by='Churn', ax=axes[i])
    axes[i].set_title(f'{col} by Churn')

# boxplot(by=...) adds a figure-level "grouped by" title; blank it so that only
# the per-panel titles remain.
plt.suptitle('')
plt.tight_layout()
plt.savefig('../models/numeric_by_churn.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Correlation Heatmap

In [None]:
# Work on a copy so `df` keeps its original 'Yes'/'No' labels; in the copy, Churn
# is encoded as 1 for 'Yes' and 0 otherwise so it can enter the correlation matrix.
df_corr = df.assign(Churn=df['Churn'].eq('Yes').astype(int))

# Columns included in the correlation matrix. TotalCharges must already be
# numeric here, i.e. the Section 5 cell has to run first.
corr_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen', 'Churn']
num_for_corr = df_corr[corr_columns]

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    num_for_corr.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    ax=corr_ax,
)
corr_ax.set_title('Correlation Heatmap')
plt.savefig('../models/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Key Insights

1. **Churn Rate:** ~26.5% of customers churned.
2. **Contract:** Month-to-month contracts have the highest churn rate (~43%).
3. **Tenure:** New customers (0-12 months) have the highest churn rate.
4. **Payment:** Customers paying by electronic check have the highest churn rate (~45%).
5. **Monthly Charges:** Churners tend to have higher monthly charges than customers who stayed.

---

**For model training:** `python src/train_models.py`