# 05 – Data Preparation and EDA

1. **Correlation Analysis**
   - Pearson, Spearman, and Kendall correlations
   - Feature importance ranking
   - Multicollinearity detection

2. **Statistical Analysis**
   - Distribution tests
   - Hypothesis testing
   - A/B test analysis
   - Chi-square tests for categorical variables

3. **Advanced Visualizations**
   - Interactive plots with Plotly
   - Feature relationships
   - Customer segmentation
   - Time series patterns

4. **Customer Behavior Analysis**
   - Cohort analysis
   - Customer segmentation
   - Behavioral patterns
   - Churn risk factors

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Set up notebook for plotting with correct style
plt.style.use('seaborn-v0_8-whitegrid')  # Using a valid matplotlib style
sns.set_theme(style="whitegrid", palette="deep")

In [2]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

In [3]:
from scripts.preparation import prepare_dataset
import os
# Paths
raw_root = os.path.join('..', 'data', 'raw')
prepared_dir = os.path.join('..', 'data', 'prepared')
charts_dir = os.path.join('..', 'docs', 'charts')

prepared_path = prepare_dataset(raw_root, prepared_dir, charts_dir)
import pandas as pd
df = pd.read_csv(prepared_path)  # Assign to df for use in later cells
cleaned_df = df.copy()  # Keep a backup copy
display(df.head())

2025-08-24 15:28:19 - preparation - INFO - Prepared dataset saved to ..\data\prepared\customers_prepared.csv with 5000 rows.


Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure_months,monthly_charges,total_charges,contract,internet_service,phone_service,churn
0,C000001,Male,No,No,No,61,94.08,5687.83,Month-to-month,Fiber optic,Yes,1
1,C000002,Female,No,No,Yes,33,119.58,3915.94,Month-to-month,DSL,Yes,0
2,C000003,Female,Yes,Yes,Yes,50,24.1,1231.7,One year,Fiber optic,Yes,0
3,C000004,Male,No,Yes,No,70,44.49,3113.11,Month-to-month,Fiber optic,Yes,1
4,C000005,Male,No,No,No,3,79.77,215.45,One year,DSL,Yes,0


EDA charts have been saved in the `docs/charts` folder.

In [4]:
def analyze_correlations(df, target_col=None):
    """Analyze correlations using multiple methods."""
    # Calculate correlations using different methods
    pearson_corr = df.corr(method='pearson')
    spearman_corr = df.corr(method='spearman')
    kendall_corr = df.corr(method='kendall')
    
    # Create subplots for correlation matrices
    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=('Pearson Correlation', 'Spearman Correlation', 'Kendall Correlation')
    )
    
    # Add heatmaps
    fig.add_trace(
        go.Heatmap(z=pearson_corr, x=pearson_corr.columns, y=pearson_corr.columns,
                   colorscale='RdBu', zmin=-1, zmax=1),
        row=1, col=1
    )
    fig.add_trace(
        go.Heatmap(z=spearman_corr, x=spearman_corr.columns, y=spearman_corr.columns,
                   colorscale='RdBu', zmin=-1, zmax=1),
        row=1, col=2
    )
    fig.add_trace(
        go.Heatmap(z=kendall_corr, x=kendall_corr.columns, y=kendall_corr.columns,
                   colorscale='RdBu', zmin=-1, zmax=1),
        row=1, col=3
    )
    
    fig.update_layout(height=600, width=1800, title_text="Correlation Analysis")
    fig.show()
    
    # If target column is specified, show feature importance
    if target_col and target_col in df.columns:
        importance = abs(pearson_corr[target_col]).sort_values(ascending=False)
        print(f"\nFeature Importance (correlation with {target_col}):")
        print(importance)
        
    return pearson_corr, spearman_corr, kendall_corr

def detect_multicollinearity(df, threshold=0.8):
    """Detect highly correlated features."""
    corr_matrix = df.corr()
    high_corr_pairs = []
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_pairs.append({
                    'feature1': corr_matrix.columns[i],
                    'feature2': corr_matrix.columns[j],
                    'correlation': corr_matrix.iloc[i, j]
                })
    
    return pd.DataFrame(high_corr_pairs)

In [5]:
def perform_statistical_tests(df, numerical_cols=None, categorical_cols=None, target_col='churn'):
    """Perform various statistical tests on the data."""
    results = []
    
    # Prepare data
    if numerical_cols is None:
        numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object']).columns
    
    # Normality tests for numerical columns
    for col in numerical_cols:
        stat, p_value = stats.normaltest(df[col].dropna())
        results.append({
            'test': 'Normality',
            'feature': col,
            'statistic': stat,
            'p_value': p_value,
            'significant': p_value < 0.05
        })
    
    # Chi-square tests for categorical columns
    if target_col in df.columns:
        for col in categorical_cols:
            if col != target_col:
                contingency = pd.crosstab(df[col], df[target_col])
                stat, p_value, dof, expected = stats.chi2_contingency(contingency)
                results.append({
                    'test': 'Chi-square',
                    'feature': col,
                    'statistic': stat,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                })
    
    return pd.DataFrame(results)

def perform_cohort_analysis(df, time_col='tenure_months', cohort_col='contract_type', metric_col='monthly_charges'):
    """Perform cohort analysis."""
    # Convert columns to numeric if needed
    if df[metric_col].dtype == 'object':
        df[metric_col] = pd.to_numeric(df[metric_col], errors='coerce')
    
    # Create cohort groups
    cohorts = df.groupby(cohort_col).agg({
        time_col: 'mean',
        metric_col: ['mean', 'count', 'sum']
    })
    
    # Calculate retention metrics
    cohorts.columns = ['avg_tenure', 'avg_value', 'size', 'total_value']
    # Handle case differences in 'churn' values
    is_not_churned = df['churn'].str.lower().isin(['no', 'false', '0'])
    cohorts['retention_rate'] = df[is_not_churned].groupby(cohort_col).size() / cohorts['size']
    
    return cohorts

## Statistical Analysis and Cohort Segmentation

In this section, we'll perform detailed statistical analysis on our customer data:

1. **Statistical Tests**
   - Normality tests for numerical features
   - Chi-square tests for categorical features vs churn
   - Significance testing at p < 0.05 level

2. **Cohort Analysis**
   - Group customers by contract type
   - Analyze tenure and monetary metrics
   - Calculate retention rates per cohort
   
This analysis will help us identify:
- Significant relationships between features and churn
- Customer segments with distinct behavioral patterns
- Key drivers of customer retention

In [6]:
# Prepare numeric columns - ensure they are numeric type
for col in ['monthly_charges', 'total_charges']:
    df[col] = pd.to_numeric(df[col].replace('[\$,]', '', regex=True), errors='coerce')

# Run statistical tests
numerical_cols = ['tenure_months', 'monthly_charges', 'total_charges']
categorical_cols = ['contract_type', 'internet_service_type', 'payment_type']

# Display available columns and their types for verification
print("Available columns:")
print(df.columns.tolist())
print("\nColumn types:")
print(df.dtypes)

# Perform statistical analysis only on available columns
numerical_cols = [col for col in numerical_cols if col in df.columns]
categorical_cols = [col for col in categorical_cols if col in df.columns]

print("\nAnalyzing columns:")
print("Numerical:", numerical_cols)
print("Categorical:", categorical_cols)

# Perform statistical analysis
stat_results = perform_statistical_tests(df, numerical_cols, categorical_cols)
display(stat_results)

# Perform cohort analysis if contract_type is available
if 'contract_type' in df.columns:
    cohort_results = perform_cohort_analysis(df, cohort_col='contract_type')
    display(cohort_results)

    # Visualize cohort results
    plt.figure(figsize=(12, 6))
    sns.barplot(data=cohort_results.reset_index(), x='contract_type', y='retention_rate')
    plt.title('Retention Rate by Contract Type')
    plt.xticks(rotation=45)
    plt.show()

    # Plot average customer value by cohort
    plt.figure(figsize=(12, 6))
    sns.barplot(data=cohort_results.reset_index(), x='contract_type', y='avg_value')
    plt.title('Average Customer Value by Contract Type')
    plt.xticks(rotation=45)
    plt.show()
else:
    print("\nWarning: 'contract_type' column not found. Available columns:", df.columns.tolist())

Available columns:
['customer_id', 'gender', 'senior_citizen', 'partner', 'dependents', 'tenure_months', 'monthly_charges', 'total_charges', 'contract', 'internet_service', 'phone_service', 'churn']

Column types:
customer_id          object
gender               object
senior_citizen       object
partner              object
dependents           object
tenure_months         int64
monthly_charges     float64
total_charges       float64
contract             object
internet_service     object
phone_service        object
churn                 int64
dtype: object

Analyzing columns:
Numerical: ['tenure_months', 'monthly_charges', 'total_charges']
Categorical: []


Unnamed: 0,test,feature,statistic,p_value,significant
0,Normality,tenure_months,4370.86178,0.0,True
1,Normality,monthly_charges,4608.410882,0.0,True
2,Normality,total_charges,428.45223,9.178684e-94,True





In [7]:
# Check data types and unique values
print("Data Types:")
print(df.dtypes)
print("\nUnique values in categorical columns:")
for col in ['contract_type', 'internet_service_type', 'payment_type', 'churn']:
    if col in df.columns:
        print(f"\n{col}:", df[col].unique())

Data Types:
customer_id          object
gender               object
senior_citizen       object
partner              object
dependents           object
tenure_months         int64
monthly_charges     float64
total_charges       float64
contract             object
internet_service     object
phone_service        object
churn                 int64
dtype: object

Unique values in categorical columns:

churn: [1 0]


## Interpretation of Statistical Analysis

The statistical analysis reveals several key insights:

1. **Normality Tests**
   - Check if numerical features follow a normal distribution
   - Helps determine appropriate statistical methods for further analysis

2. **Chi-square Tests**
   - Evaluate relationships between categorical variables and churn
   - Identify which categorical features have significant associations with customer churn

3. **Cohort Analysis Insights**
   - Compare retention rates across different contract types
   - Analyze customer value distribution among cohorts
   - Identify high-value and at-risk customer segments

These insights will inform our feature engineering and modeling approach in later notebooks.

In [8]:
def perform_customer_segmentation(df, features=None):
    """Perform customer segmentation using K-means clustering."""
    if features is None:
        features = ['tenure_months', 'monthly_charges', 'total_charges']
    
    # Prepare data
    X = df[features].copy()
    
    # Handle missing values
    X = X.fillna(X.mean())
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Determine optimal number of clusters using elbow method
    inertias = []
    K = range(1, 6)
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)
    
    # Plot elbow curve
    plt.figure(figsize=(10, 6))
    plt.plot(K, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Inertia')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    
    # Perform clustering with optimal k (using k=3 as example)
    kmeans = KMeans(n_clusters=3, random_state=42)
    df['Cluster'] = kmeans.fit_predict(X_scaled)
    
    return df, kmeans, scaler

# Perform customer segmentation
df_segmented, kmeans_model, scaler = perform_customer_segmentation(df)

# Analyze segments
segment_profile = df_segmented.groupby('Cluster').agg({
    'tenure_months': 'mean',
    'monthly_charges': 'mean',
    'total_charges': 'mean',
    'churn': lambda x: (x == 'Yes').mean()
}).round(2)

display(segment_profile.rename(columns={'churn': 'churn_rate'}))

Unnamed: 0_level_0,tenure_months,monthly_charges,total_charges,churn_rate
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,14.21,69.45,1003.19,0.0
1,52.36,95.73,4952.13,0.0
2,48.57,43.81,2120.53,0.0


## Customer Segmentation Analysis

We'll now perform customer segmentation using K-means clustering to identify distinct customer groups based on:
- Tenure
- Monthly charges
- Total charges

The segmentation process includes:
1. Data preparation and scaling
2. Optimal cluster number determination using the elbow method
3. K-means clustering
4. Segment profiling and analysis

This segmentation will help us:
- Identify high-value customer segments
- Understand churn risk patterns
- Develop targeted retention strategies

In [9]:
# Visualize segments
def plot_segments(df, features, cluster_col='Cluster'):
    """Create interactive scatter plots of customer segments using Plotly."""
    print("Preparing data for visualization...")
    
    # Ensure numeric data types
    for col in ['tenure_months', 'monthly_charges', 'total_charges']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    print("Creating segment scatter plots...")
    # Create subplot figure
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=('Tenure vs Monthly Charges',
                                      'Tenure vs Total Charges',
                                      'Monthly vs Total Charges'))
    
    # Plot tenure vs monthly charges
    fig.add_trace(
        go.Scatter(x=df['tenure_months'], y=df['monthly_charges'],
                  mode='markers', marker=dict(color=df[cluster_col]),
                  name='Segments', showlegend=False),
        row=1, col=1
    )
    
    # Plot tenure vs total charges
    fig.add_trace(
        go.Scatter(x=df['tenure_months'], y=df['total_charges'],
                  mode='markers', marker=dict(color=df[cluster_col]),
                  name='Segments', showlegend=False),
        row=1, col=2
    )
    
    # Plot monthly vs total charges
    fig.add_trace(
        go.Scatter(x=df['monthly_charges'], y=df['total_charges'],
                  mode='markers', marker=dict(color=df[cluster_col]),
                  name='Segments', showlegend=False),
        row=1, col=3
    )
    
    # Update layout
    fig.update_layout(height=500, width=1200, title_text="Customer Segments Analysis")
    fig.show()

print("Starting visualization process...")

# Plot customer segments
print("\nCreating segment visualizations...")
plot_segments(df_segmented, ['tenure_months', 'monthly_charges', 'total_charges'])

# Analyze churn rates by segment
print("\nAnalyzing churn distribution...")
churn_by_segment = df_segmented.groupby(['Cluster', 'churn']).size().unstack(fill_value=0)

# Create stacked bar chart using Plotly
fig = go.Figure(data=[
    go.Bar(name=col, x=churn_by_segment.index, y=churn_by_segment[col])
    for col in churn_by_segment.columns
])

# Change the bar mode to stacked
fig.update_layout(
    barmode='stack',
    title='Churn Distribution by Customer Segment',
    xaxis_title='Segment',
    yaxis_title='Number of Customers',
    height=500,
    width=800
)

print("\nDisplaying churn distribution plot...")
fig.show()

print("\nVisualization process complete!")

Starting visualization process...

Creating segment visualizations...
Preparing data for visualization...
Creating segment scatter plots...



Analyzing churn distribution...

Displaying churn distribution plot...



Visualization process complete!
