In [1]:
# =====================================================
# TELCO CUSTOMER CHURN - COMPLETE DATA ANALYSIS
# =====================================================
# Dataset: Telco Customer Churn Data
# Purpose: Data Loading, Cleaning, and Preparation
# =====================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# =====================================================
# 1. LOAD DATA
# =====================================================

# Announce the load step with a 60-character rule above and below the title.
print("=" * 60, "LOADING DATASET", "=" * 60, sep="\n")

# Read the raw CSV from the working directory, then snapshot it so the final
# summary can compare the cleaned frame against the untouched original.
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df_original = df.copy()

print("✓ Dataset loaded successfully!")
print(f"✓ Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")

LOADING DATASET
✓ Dataset loaded successfully!
✓ Shape: 7043 rows × 21 columns



In [2]:
# =====================================================
# 2. INITIAL DATA EXPLORATION
# =====================================================

# Plain-text preview of the first five rows.
print("=" * 60, "DATASET PREVIEW (First 5 Rows)", "=" * 60, sep="\n")
print(df.head())

# Column dtypes and non-null counts; DataFrame.info() writes to stdout itself.
print("", "=" * 60, "DATASET INFORMATION", "=" * 60, sep="\n")
df.info()

# Descriptive statistics as produced by DataFrame.describe().
print("", "=" * 60, "STATISTICAL SUMMARY", "=" * 60, sep="\n")
print(df.describe())

# Numbered list of column names, numbering starts at 1.
print("", "=" * 60, "COLUMN NAMES", "=" * 60, sep="\n")
for idx, col in enumerate(df.columns, start=1):
    print(f"{idx:2d}. {col}")


DATASET PREVIEW (First 5 Rows)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Strea

In [3]:
# =====================================================
# 3. VISUALIZATION: FIRST 5 ROWS
# =====================================================

def visualize_first_rows(dataframe, n=5, output_path='first_5_rows.png'):
    """Render the first ``n`` rows of ``dataframe`` as a styled table image.

    Args:
        dataframe: pandas DataFrame to preview.
        n: number of leading rows requested (passed to ``DataFrame.head``).
        output_path: file the figure is written to. Defaults to the name the
            notebook has always used, so existing calls behave as before.

    Returns:
        None. The figure is saved to ``output_path`` and then closed.
    """

    first_n = dataframe.head(n)

    # Nothing to draw: skip instead of handing matplotlib an empty table.
    if first_n.empty:
        print("\n⚠ No rows to display - preview image not created")
        return

    # head(n) may return fewer than n rows, so size every loop from the
    # frame that is actually drawn rather than from the requested n.
    row_count, col_count = first_n.shape

    fig, ax = plt.subplots(figsize=(20, 4))
    ax.axis('off')

    table = ax.table(
        cellText=first_n.values,
        colLabels=first_n.columns,
        cellLoc='center',
        loc='center',
        bbox=[0, 0, 1, 1]
    )

    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 2.5)

    # Style header (table row 0 holds the column labels)
    for j in range(col_count):
        table[(0, j)].set_facecolor('#4CAF50')
        table[(0, j)].set_text_props(weight='bold', color='white')

    # Style data rows with alternating colors (table rows 1..row_count)
    for i in range(1, row_count + 1):
        shade = '#f9f9f9' if i % 2 == 0 else 'white'
        for j in range(col_count):
            table[(i, j)].set_facecolor(shade)

    fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    print(f"\n✓ Saved: {output_path}")
    plt.close(fig)

# Save a table image of the first five rows of the loaded dataset (default n=5).
visualize_first_rows(df)


✓ Saved: first_5_rows.png


In [4]:
# =====================================================
# 4. VISUALIZATION: COLUMN NAMES
# =====================================================

def visualize_column_names(dataframe):
    """Draw a numbered, colour-banded list of the column names.

    The figure is written to 'column_names.png' and closed; nothing is
    returned.
    """

    names = dataframe.columns.tolist()
    total = len(names)

    fig, ax = plt.subplots(figsize=(12, max(8, total * 0.4)))
    ax.axis('off')

    # Heading across the top of the axes
    ax.text(
        0.5, 0.95, f'Dataset Columns ({total} total)',
        ha='center', va='top', fontsize=18, fontweight='bold',
        transform=ax.transAxes,
    )

    # Vertical layout in axes coordinates: first entry at 0.88, the rest
    # spaced evenly below it.
    top_y = 0.88
    spacing = 0.85 / total
    band_fill = ('#4CAF50', '#66BB6A')

    for position, name in enumerate(names):
        row_y = top_y - (position * spacing)

        # Rounded background band, alternating between the two greens
        band = mpatches.FancyBboxPatch(
            (0.05, row_y - 0.015), 0.9, 0.03,
            boxstyle="round,pad=0.01",
            facecolor=band_fill[position % 2],
            edgecolor='#2E7D32',
            transform=ax.transAxes,
            linewidth=1.5
        )
        ax.add_patch(band)

        # Ordinal (1-based) in bold, then the column name beside it
        ax.text(
            0.08, row_y, f"{position + 1}.",
            ha='left', va='center', fontsize=11, fontweight='bold',
            color='white', transform=ax.transAxes,
        )
        ax.text(
            0.13, row_y, name,
            ha='left', va='center', fontsize=11,
            color='white', transform=ax.transAxes,
        )

    # Footer line with the overall count
    ax.text(
        0.5, 0.02, f'Total Features: {total}',
        ha='center', va='bottom', fontsize=10, style='italic',
        transform=ax.transAxes, color='#555',
    )

    plt.tight_layout()
    fig.savefig('column_names.png', dpi=300, bbox_inches='tight', facecolor='white')
    print("✓ Saved: column_names.png")
    plt.close(fig)

# Save the numbered column-name list for the loaded dataset to column_names.png.
visualize_column_names(df)

✓ Saved: column_names.png


In [5]:
# =====================================================
# 5. MISSING VALUES ANALYSIS
# =====================================================

def analyze_missing_values(dataframe, blank_as_missing=False):
    """Summarise missing values per column and print a short report.

    Args:
        dataframe: pandas DataFrame to inspect.
        blank_as_missing: when True, string cells that are empty or contain
            only whitespace are counted as missing in addition to NaN/None.
            Defaults to False, which counts NaN/None only (the original
            behaviour).

    Returns:
        DataFrame with the columns 'Column', 'Missing_Count',
        'Missing_Percentage' and 'Data_Type', restricted to columns that have
        at least one missing value and sorted by 'Missing_Percentage' in
        descending order. Empty when nothing is missing.
    """

    # Count once and reuse for both the count and the percentage columns.
    missing_counts = dataframe.isnull().sum()

    if blank_as_missing:
        # isnull() does not flag '' or '   ', so tally those separately.
        # Columns are walked by position so duplicate labels stay distinct.
        blank_counts = pd.Series(
            [
                sum(
                    isinstance(value, str) and not value.strip()
                    for value in dataframe.iloc[:, position]
                )
                for position in range(dataframe.shape[1])
            ],
            index=missing_counts.index,
            dtype='int64',
        )
        missing_counts = missing_counts + blank_counts

    missing_data = pd.DataFrame({
        'Column': dataframe.columns,
        'Missing_Count': missing_counts,
        'Missing_Percentage': (missing_counts / len(dataframe)) * 100,
        'Data_Type': dataframe.dtypes.astype(str)
    })

    missing_data = missing_data.sort_values('Missing_Percentage', ascending=False)
    missing_data_filtered = missing_data[missing_data['Missing_Count'] > 0]

    print("\n" + "="*60)
    print("MISSING VALUES SUMMARY")
    print("="*60)
    print(f"Total Rows: {len(dataframe)}")
    print(f"Total Columns: {len(dataframe.columns)}")
    print(f"Columns with Missing Values: {len(missing_data_filtered)}")

    if len(missing_data_filtered) > 0:
        print("\nMissing Values Details:")
        print(missing_data_filtered.to_string(index=False))
    else:
        print("\n✓ No missing values detected!")

    return missing_data_filtered

# Keep the per-column missing-value report; the visualization cell takes it as input.
missing_values = analyze_missing_values(df)


MISSING VALUES SUMMARY
Total Rows: 7043
Total Columns: 21
Columns with Missing Values: 0

✓ No missing values detected!


In [6]:
# =====================================================
# 6. VISUALIZATION: MISSING VALUES
# =====================================================

def visualize_missing_values(dataframe, missing_df):
    """Save a missing-values figure to 'missing_values_analysis.png'.

    Args:
        dataframe: the DataFrame that was analysed; only its column count is
            used, for the "all columns complete" message.
        missing_df: report with the columns 'Column', 'Missing_Count',
            'Missing_Percentage' and 'Data_Type', one row per affected column.

    Returns:
        None. When ``missing_df`` is empty a simple "no missing values" card
        is saved; otherwise a bar chart (all rows) next to a detail table
        (first 15 rows) is saved. The figure is closed afterwards.
    """

    def _severity(pct, high, medium, low):
        """Pick a colour by threshold: above 50 -> high, above 20 -> medium, else low."""
        return high if pct > 50 else medium if pct > 20 else low

    if len(missing_df) == 0:
        # No missing values - success message
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.axis('off')

        ax.text(0.5, 0.5, '✓ NO MISSING VALUES FOUND!',
                ha='center', va='center', fontsize=24, fontweight='bold',
                color='#27ae60', transform=ax.transAxes)

        ax.text(0.5, 0.3, f'All {len(dataframe.columns)} columns are complete',
                ha='center', va='center', fontsize=14,
                color='#555', transform=ax.transAxes)

        fig.savefig('missing_values_analysis.png', dpi=300, bbox_inches='tight', facecolor='white')
        print("\n✓ Saved: missing_values_analysis.png")
        plt.close(fig)
        return

    # Create dual visualization for missing values
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, max(6, len(missing_df) * 0.5)))

    # Color coding by severity
    colors = [
        _severity(pct, '#e74c3c', '#f39c12', '#3498db')
        for pct in missing_df['Missing_Percentage']
    ]

    # Left: Horizontal bar chart
    bars = ax1.barh(
        missing_df['Column'],
        missing_df['Missing_Percentage'],
        color=colors,
        edgecolor='black',
        linewidth=1.2
    )

    ax1.set_xlabel('Missing Percentage (%)', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Columns', fontsize=12, fontweight='bold')
    ax1.set_title('Missing Values by Column', fontsize=14, fontweight='bold', pad=20)
    ax1.grid(axis='x', alpha=0.3, linestyle='--')
    ax1.set_xlim(0, 100)

    # Add percentage labels just past the end of each bar
    for bar, pct in zip(bars, missing_df['Missing_Percentage']):
        ax1.text(
            bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
            f'{pct:.2f}%',
            ha='left', va='center', fontsize=9, fontweight='bold'
        )

    # Right: Detailed table (limited to the first 15 report rows)
    ax2.axis('off')

    detail_rows = missing_df.head(15)
    table_data = [
        [row['Column'], f"{int(row['Missing_Count'])}",
         f"{row['Missing_Percentage']:.2f}%", str(row['Data_Type'])]
        for _, row in detail_rows.iterrows()
    ]

    table = ax2.table(
        cellText=table_data,
        colLabels=['Column', 'Count', 'Missing %', 'Type'],
        cellLoc='left',
        loc='center',
        bbox=[0, 0, 1, 1]
    )

    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 2)

    # Style header
    for j in range(4):
        table[(0, j)].set_facecolor('#e74c3c')
        table[(0, j)].set_text_props(weight='bold', color='white')

    # Style rows with color coding. Use the unrounded percentages so a row
    # always lands in the same severity band as its bar (re-parsing the
    # 2-decimal label would put e.g. 50.004 in a different band).
    for i, pct in enumerate(detail_rows['Missing_Percentage'], start=1):
        row_color = _severity(pct, '#ffcccc', '#ffe6cc', '#cce6ff')
        for j in range(4):
            table[(i, j)].set_facecolor(row_color)

    ax2.set_title('Missing Values Details', fontsize=14, fontweight='bold', pad=20)

    fig.suptitle(
        f'Missing Values Analysis - {len(missing_df)} Columns Affected',
        fontsize=16, fontweight='bold', y=0.98
    )

    plt.tight_layout()
    fig.savefig('missing_values_analysis.png', dpi=300, bbox_inches='tight', facecolor='white')
    print("\n✓ Saved: missing_values_analysis.png")
    plt.close(fig)

# Save the missing-values figure built from the report computed above.
visualize_missing_values(df, missing_values)



✓ Saved: missing_values_analysis.png


In [7]:
# =====================================================
# 7. DATA TYPES ANALYSIS
# =====================================================

def analyze_data_types(dataframe):
    """Tabulate each column's dtype and print the table plus a per-dtype tally.

    Returns:
        A tuple ``(dtype_df, dtype_counts)``: ``dtype_df`` has one row per
        column with 'Column' and 'Data_Type' (dtype name as a string);
        ``dtype_counts`` is the value_counts() of 'Data_Type'.
    """

    rule = "=" * 60

    type_names = dataframe.dtypes.astype(str)
    dtype_df = pd.DataFrame({"Column": dataframe.columns, "Data_Type": type_names})
    dtype_counts = dtype_df["Data_Type"].value_counts()

    # Full column -> dtype listing
    print("\n" + rule, "DATA TYPES SUMMARY", rule, sep="\n")
    print(dtype_df.to_string(index=False))

    # How many columns share each dtype
    print("\n" + rule, "DATA TYPE DISTRIBUTION", rule, sep="\n")
    for type_name, n_columns in dtype_counts.items():
        print(f"{type_name:15s} : {n_columns} columns")

    return dtype_df, dtype_counts

# Keep both results: the per-column dtype table and the per-dtype column counts.
dtype_info, dtype_distribution = analyze_data_types(df)


DATA TYPES SUMMARY
          Column Data_Type
      customerID    object
          gender    object
   SeniorCitizen     int64
         Partner    object
      Dependents    object
          tenure     int64
    PhoneService    object
   MultipleLines    object
 InternetService    object
  OnlineSecurity    object
    OnlineBackup    object
DeviceProtection    object
     TechSupport    object
     StreamingTV    object
 StreamingMovies    object
        Contract    object
PaperlessBilling    object
   PaymentMethod    object
  MonthlyCharges   float64
    TotalCharges    object
           Churn    object

DATA TYPE DISTRIBUTION
object          : 18 columns
int64           : 2 columns
float64         : 1 columns


In [8]:
# =====================================================
# 8. VISUALIZATION: DATA TYPES
# =====================================================

def visualize_data_types(dtype_df, dtype_counts):
    """Save a two-panel dtype figure to 'data_types_analysis.png'.

    The left panel is a table of the rows of ``dtype_df`` (column name and
    dtype); the right panel is a bar chart of ``dtype_counts``. The figure is
    closed after saving and nothing is returned.
    """

    n_rows = len(dtype_df)
    n_types = len(dtype_counts)

    fig, (table_ax, chart_ax) = plt.subplots(
        1, 2, figsize=(18, max(8, n_rows * 0.4))
    )

    # ---- Left panel: one table row per dataset column ----
    table_ax.axis('off')
    table = table_ax.table(
        cellText=dtype_df.values.tolist(),
        colLabels=["Column Name", "Data Type"],
        cellLoc='left',
        loc='center',
        bbox=[0, 0, 1, 1]
    )

    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.5)

    # Row 0 is the header (dark with bold white text); the rest get a light fill
    for row in range(n_rows + 1):
        for col in range(2):
            cell = table[(row, col)]
            if row == 0:
                cell.set_facecolor('#34495e')
                cell.set_text_props(color='white', weight='bold')
            else:
                cell.set_facecolor('#ecf0f1')

    table_ax.set_title("Column Data Types", fontsize=14, fontweight='bold', pad=20)

    # ---- Right panel: number of columns per dtype ----
    positions = range(n_types)
    colors_palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']
    bars = chart_ax.bar(
        positions,
        dtype_counts.values,
        color=colors_palette[:n_types],
        edgecolor='black',
        linewidth=1.2
    )

    chart_ax.set_xticks(positions)
    chart_ax.set_xticklabels(dtype_counts.index, rotation=45, ha='right')
    chart_ax.set_title("Data Type Distribution", fontsize=14, fontweight='bold')
    chart_ax.set_xlabel("Data Type", fontsize=12, fontweight='bold')
    chart_ax.set_ylabel("Number of Columns", fontsize=12, fontweight='bold')
    chart_ax.grid(axis='y', linestyle='--', alpha=0.4)

    # Write each bar's count, centred, just above the bar
    for bar in bars:
        top = bar.get_height()
        centre_x = bar.get_x() + bar.get_width()/2
        chart_ax.text(
            centre_x, top, int(top),
            ha='center', va='bottom',
            fontsize=10, fontweight='bold'
        )

    fig.suptitle(
        f"Data Types Analysis ({n_rows} Columns)",
        fontsize=16, fontweight='bold', y=0.98
    )

    plt.tight_layout()
    fig.savefig('data_types_analysis.png', dpi=300, bbox_inches='tight', facecolor='white')
    print("\n✓ Saved: data_types_analysis.png")
    plt.close(fig)

# Save the dtype table and distribution chart built from the analysis results above.
visualize_data_types(dtype_info, dtype_distribution)


✓ Saved: data_types_analysis.png


In [9]:
# =====================================================
# 9. DATA CLEANING (IF NEEDED)
# =====================================================

print("\n" + "="*60)
print("DATA CLEANING")
print("="*60)

# A NaN-based check cannot see placeholder text (such as blank strings) held
# in a text column, so cleaning still looks for duplicates and for columns
# stored with the wrong type.

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates()
    print(f"✓ Removed {duplicates} duplicate rows")

# TotalCharges should be numeric. Test for "not numeric" rather than for the
# literal 'object' dtype so the conversion also runs when pandas loads text
# columns with a dedicated string dtype.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    print(f"\n⚠ Found 'TotalCharges' as {df['TotalCharges'].dtype} type - converting to numeric")
    # errors='coerce' turns every value that cannot be parsed into NaN.
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    print("✓ Converted 'TotalCharges' to numeric")

    # Fill any NaN created during conversion
    missing_count = df['TotalCharges'].isnull().sum()
    if missing_count > 0:
        # NOTE(review): median imputation is kept as-is; confirm it is the
        # right choice for the rows that failed to parse.
        df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
        print(f"✓ Filled {missing_count} missing values in 'TotalCharges' with median")

print("\n✓ Data cleaning completed!")



DATA CLEANING
Duplicate rows: 0

⚠ Found 'TotalCharges' as object type - converting to numeric
✓ Converted 'TotalCharges' to numeric
✓ Filled 11 missing values in 'TotalCharges' with median

✓ Data cleaning completed!


In [10]:
# =====================================================
# 10. SAVE CLEANED DATA
# =====================================================

print("", "=" * 60, "SAVING CLEANED DATA", "=" * 60, sep="\n")

# Persist the cleaned frame without the index column.
df.to_csv('telco_customer_churn_cleaned.csv', index=False)
print("✓ Saved: telco_customer_churn_cleaned.csv")

# =====================================================
# 11. FINAL SUMMARY
# =====================================================

print("", "=" * 60, "FINAL SUMMARY", "=" * 60, sep="\n")

# Before/after comparison against the snapshot taken at load time.
print(f"Original Shape    : {df_original.shape}")
print(f"Cleaned Shape     : {df.shape}")
print(f"Rows Removed      : {df_original.shape[0] - df.shape[0]}")
print(f"Columns Removed   : {df_original.shape[1] - df.shape[1]}")

# Remaining quality indicators on the cleaned frame.
print(f"Missing Values    : {df.isnull().sum().sum()}")
print(f"Duplicate Rows    : {df.duplicated().sum()}")
print("\n✓ Data analysis and cleaning completed successfully!", "=" * 60, sep="\n")

# Static list of the artifacts the notebook writes.
print(
    "",
    "=" * 60,
    "FILES CREATED",
    "=" * 60,
    "1. first_5_rows.png              - Visual preview of data",
    "2. column_names.png              - List of all columns",
    "3. missing_values_analysis.png   - Missing values report",
    "4. data_types_analysis.png       - Data types distribution",
    "5. telco_customer_churn_cleaned.csv - Cleaned dataset",
    "=" * 60,
    sep="\n",
)


SAVING CLEANED DATA
✓ Saved: telco_customer_churn_cleaned.csv

FINAL SUMMARY
Original Shape    : (7043, 21)
Cleaned Shape     : (7043, 21)
Rows Removed      : 0
Columns Removed   : 0
Missing Values    : 0
Duplicate Rows    : 0

✓ Data analysis and cleaning completed successfully!

FILES CREATED
1. first_5_rows.png              - Visual preview of data
2. column_names.png              - List of all columns
3. missing_values_analysis.png   - Missing values report
4. data_types_analysis.png       - Data types distribution
5. telco_customer_churn_cleaned.csv - Cleaned dataset
