In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Global plotting defaults: the matplotlib 'ggplot' style sheet is applied
# first, then seaborn's "whitegrid" theme is set afterwards.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# Fallback canvas size (inches) for figures created without an explicit figsize.
plt.rcParams.update({'figure.figsize': (12, 8)})

# Load the churn CSV from the processed-data directory into `df`.
print("Loading data...")
data_path = Path('../data/processed/telco_customer_churn_combined.csv')
df = pd.read_csv(data_path)

# Display basic information
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
# A bare `df.head()` is rendered only when it is the final expression of a
# notebook cell (and never in a plain script); print it so the preview
# actually appears under its heading.
print(df.head())

# Report null counts, restricted to the columns that have at least one null.
print("\nMissing values per column:")
missing_values = df.isna().sum()
missing_values = missing_values.loc[missing_values.gt(0)]
if missing_values.empty:
    print("No missing values found.")
else:
    print(missing_values)

# Data types and basic statistics.
# Both summaries are printed explicitly: a bare expression in the middle of a
# notebook cell (or anywhere in a script) is evaluated and then discarded, so
# the headings would otherwise be followed by no output.
print("\nData types:")
print(df.dtypes)

print("\nBasic statistics:")
print(df.describe())

# Target variable overview: absolute counts per 'Churn Label' value, then the
# share (in percent) of rows labelled 'Yes'.
print("\nChurn distribution:")
churn_counts = df['Churn Label'].value_counts()
print(churn_counts)
churn_percentage = df['Churn Label'].value_counts(normalize=True).mul(100)
# Raises KeyError if no row carries the 'Yes' label.
print(f"\nChurn percentage: {churn_percentage.loc['Yes']:.2f}%")

# Bar chart of how many rows fall under each 'Churn Label' value.
plt.figure(figsize=(10, 6))
# countplot returns the Axes it drew on; title and axis labels are set on it
# directly.
sns.countplot(x='Churn Label', data=df, palette='viridis').set(
    title='Churn Distribution',
    xlabel='Churn',
    ylabel='Count',
)
plt.show()

# Horizontal bar chart of 'Churn Reason' frequencies, restricted to rows whose
# 'Churn Label' is 'Yes'.
plt.figure(figsize=(14, 8))
churn_reason_counts = df.loc[df['Churn Label'] == 'Yes', 'Churn Reason'].value_counts()
sns.barplot(
    x=churn_reason_counts.values,
    y=churn_reason_counts.index,
    palette='viridis',
).set(title='Reasons for Customer Churn', xlabel='Count')
plt.tight_layout()
plt.show()

# Churn split by demographic attributes, drawn as a 2x2 grid of count plots.
plt.figure(figsize=(16, 12))

# (column to put on the x axis, panel title), in grid order.
demographic_panels = [
    ('Gender', 'Churn by Gender'),
    ('Senior Citizen', 'Churn by Senior Citizen Status'),
    ('Married', 'Churn by Marital Status'),
    ('Dependents', 'Churn by Dependents'),
]

for panel_no, (panel_column, panel_title) in enumerate(demographic_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.countplot(x=panel_column, hue='Churn Label', data=df, palette='viridis')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Churn split by service-related attributes, drawn as a 3x3 grid of count plots.
plt.figure(figsize=(20, 15))

# (column on the x axis, panel title, rotate x tick labels by 45 degrees?),
# listed in grid order. Only the first row of panels rotates its tick labels.
service_panels = [
    ('Contract', 'Churn by Contract Type', True),
    ('Internet Service', 'Churn by Internet Service', True),
    ('Payment Method', 'Churn by Payment Method', True),
    ('Online Security', 'Churn by Online Security', False),
    ('Online Backup', 'Churn by Online Backup', False),
    ('Device Protection Plan', 'Churn by Device Protection', False),
    ('Premium Tech Support', 'Churn by Tech Support', False),
    ('Streaming TV', 'Churn by Streaming TV', False),
    ('Paperless Billing', 'Churn by Paperless Billing', False),
]

for panel_no, (panel_column, panel_title, rotate_ticks) in enumerate(service_panels, start=1):
    plt.subplot(3, 3, panel_no)
    sns.countplot(x=panel_column, hue='Churn Label', data=df, palette='viridis')
    plt.title(panel_title)
    if rotate_ticks:
        plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Box plots of each numerical feature, grouped by 'Churn Label'.
numerical_features = [
    'Age',
    'Tenure in Months',
    'Monthly Charge',
    'Total Charges',
    'Churn Score',
    'CLTV',
    'Satisfaction Score',
]

# The grid is fixed at 3x3, so the seven features leave two cells empty.
plt.figure(figsize=(20, 15))
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(3, 3, i)
    sns.boxplot(
        x='Churn Label',
        y=feature,
        data=df,
        palette='viridis',
    ).set_title(f'Churn by {feature}')

plt.tight_layout()
plt.show()

# Correlation analysis: pairwise correlations between the numerical features
# and the 'Churn Value' target, drawn as an annotated heatmap.
correlation_features = ['Age', 'Tenure in Months', 'Monthly Charge', 'Total Charges',
                        'Churn Score', 'CLTV', 'Satisfaction Score', 'Churn Value']

correlation_df = df[correlation_features]
correlation_matrix = correlation_df.corr()

plt.figure(figsize=(12, 10))
# Pin the colour scale to the full correlation range and centre it on zero.
# With the default data-driven limits, the neutral colour of the diverging
# 'coolwarm' map would sit at the midpoint of the observed values rather than
# at "no correlation", and colours would not be comparable between runs.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# Histogram (24 bins, with KDE overlay) of 'Tenure in Months', coloured by
# 'Churn Label'.
plt.figure(figsize=(14, 7))
sns.histplot(
    data=df,
    x='Tenure in Months',
    hue='Churn Label',
    bins=24,
    kde=True,
    palette='viridis',
).set(
    title='Distribution of Tenure by Churn Status',
    xlabel='Tenure in Months',
    ylabel='Count',
)
plt.show()

# Histogram (30 bins, with KDE overlay) of 'Monthly Charge', coloured by
# 'Churn Label'.
plt.figure(figsize=(14, 7))
sns.histplot(
    data=df,
    x='Monthly Charge',
    hue='Churn Label',
    bins=30,
    kde=True,
    palette='viridis',
).set(
    title='Distribution of Monthly Charges by Churn Status',
    xlabel='Monthly Charge',
    ylabel='Count',
)
plt.show()

# Row counts per 'Satisfaction Score' value, split by 'Churn Label'.
plt.figure(figsize=(12, 6))
sns.countplot(
    x='Satisfaction Score',
    hue='Churn Label',
    data=df,
    palette='viridis',
).set(
    title='Churn by Satisfaction Score',
    xlabel='Satisfaction Score',
    ylabel='Count',
)
plt.show()

# Churn by city: mean of 'Churn Value' per 'City', highest first, one bar per
# city.
# NOTE(review): every distinct city gets its own bar and tick label. If 'City'
# is high-cardinality this chart will be unreadable and slow to draw; consider
# limiting it to the top-N cities with a minimum customer count. Confirm
# against the data.
plt.figure(figsize=(12, 6))
city_churn = (
    df.groupby('City')['Churn Value']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sns.barplot(x='City', y='Churn Value', data=city_churn, palette='viridis').set(
    title='Churn Rate by City',
    xlabel='City',
    ylabel='Churn Rate',
)
plt.show()

# Key insights summary.
# These statements are fixed text; they are not derived from the values
# computed above, so they must be revisited if the data changes.
key_insights = (
    "1. Contract type has a strong relationship with churn - month-to-month contracts have higher churn rates.",
    "2. Customers with shorter tenure are more likely to churn.",
    "3. Higher monthly charges correlate with increased churn probability.",
    "4. Customers without additional services (security, backup, tech support) churn more frequently.",
    "5. Payment method shows correlation with churn - electronic payments have different patterns than mailed checks.",
    "6. Lower satisfaction scores strongly predict churn.",
    "7. Senior citizens appear to have higher churn rates.",
)
print("\nKey Insights from Exploratory Data Analysis:")
print(*key_insights, sep="\n")

# Export a copy of the dataset for the modeling step.
print("\nPreparing data for modeling...")

# Columns intended for binary (0/1) encoding.
# NOTE(review): nothing in the visible code reads this list, so no conversion
# is actually applied before the export below. Either encode these columns
# here or do it in the modeling step.
binary_vars = [
    'Gender',
    'Senior Citizen',
    'Married',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Online Security',
    'Online Backup',
    'Device Protection Plan',
    'Premium Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Streaming Music',
    'Unlimited Data',
    'Paperless Billing',
]

# Independent copy of `df` with the target exposed as 'Churn', taken from
# 'Churn Value' (0/1). All original columns are kept as they are.
model_df = df.assign(Churn=df['Churn Value'])

# Write the modeling table next to the input file, without the index column.
model_df.to_csv('../data/processed/telco_churn_for_modeling.csv', index=False)
print("Data prepared and saved for modeling.")