In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------
# Load cleaned dataset (relative path)
# ------------------------------
# The path is relative, so it is resolved against the kernel's working directory.
CLEAN_PATH = "data/processed/Telco-Customer-Churn-Clean.csv"
df = pd.read_csv(CLEAN_PATH)

# A bare expression is rendered only when it is the LAST statement of a cell.
# This preview sits mid-cell, so it must go through display() to be shown.
# (display is injected into the namespace by the IPython/Jupyter kernel.)
display(df.head())

# ------------------------------
# Basic dataset overview
# ------------------------------
print("Shape:", df.shape)
# Last expression of the cell -> rendered as the cell's rich output.
df.describe(include='all')


In [None]:
# =====================================================
# 1. OVERALL CHURN RATE
# =====================================================
# Relative frequency of each Churn label, scaled to a percentage.
# Kept as a label-indexed Series so a single label can be looked up later.
churn_rate = (
    df['Churn']
    .value_counts(normalize=True)
    .mul(100)
)

print("Churn rate (%):")
print(churn_rate)

# Author's interpretation (re-check against the printed values if the data changes):
# → Churn is around 26%, which is high for a subscription business.
# → That makes retention strategy a priority.



In [None]:
# =====================================================
# 2. CHURN BY CONTRACT TYPE
# =====================================================
# Within each Contract group, the share of every Churn label.
churn_share_by_contract = df.groupby("Contract")["Churn"].value_counts(normalize=True)

# Scale to percent, name the value column "Rate", and flatten the
# (Contract, Churn) index into ordinary columns for plotting.
churn_by_contract = (churn_share_by_contract * 100).rename("Rate").reset_index()

churn_by_contract

# Author's observation (verify against the table above):
# → Month-to-month customers churn heavily.
# → One-year, and especially two-year, contracts retain customers well.



In [None]:
# -----------------------------------------------------
# 3. PLOT A — Overall Churn Count
# -----------------------------------------------------
# Absolute number of rows per Churn label.
churn_counts = df['Churn'].value_counts()

plt.figure(figsize=(5, 4))
# The pandas plotter draws on the current figure and hands back its Axes.
ax = churn_counts.plot.bar(color=['skyblue', 'salmon'])
ax.set_title("Overall Churn Count")
ax.set_xlabel("Churn")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

# Author's insight (verify against the chart):
# → Churn volume is significant (~1,800 customers).
# → The classes are imbalanced, which matters for modeling.



In [None]:
# -----------------------------------------------------
# 4. PLOT B — Churn Rate by Contract Type
# -----------------------------------------------------
# Grouped bars: one cluster per Contract, one bar per Churn label,
# bar height taken from the "Rate" column of churn_by_contract.
plt.figure(figsize=(7, 5))
ax = sns.barplot(
    x="Contract",
    y="Rate",
    hue="Churn",
    palette="husl",
    data=churn_by_contract,
)
ax.set_title("Churn Rate by Contract Type")
ax.set_ylabel("Churn Rate (%)")
plt.tight_layout()
plt.show()

# Author's insight (verify against the chart):
# → Contract type is one of the strongest churn predictors.
# → Month-to-month churn is extremely high.



In [None]:
# -----------------------------------------------------
# 5. PLOT C — Distribution of Monthly Charges
# -----------------------------------------------------
# 30-bin histogram of MonthlyCharges with a KDE curve overlaid.
plt.figure(figsize=(7, 5))
ax = sns.histplot(data=df, x='MonthlyCharges', bins=30, kde=True)
ax.set_title("Distribution of Monthly Charges")
ax.set_xlabel("Monthly Charges")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

# Author's insight (verify against the chart):
# → Two pricing groups are visible: a low-cost and a high-cost plan cluster.
# → Higher-charge clusters often correlate with churn (to explore in Tableau).



In [None]:
# -----------------------------------------------------
# 6. SUMMARY INSIGHTS
# -----------------------------------------------------
# Only the overall churn figure is computed (from churn_rate, built in the
# churn-rate cell); the remaining bullet points are fixed narrative text.
overall_churn_pct = round(churn_rate['Yes'], 2)

print(
    "\n--- KEY EARLY INSIGHTS ---",
    f"Overall churn rate: {overall_churn_pct}%",
    "- Highest churn among month-to-month customers",
    "- Higher monthly charges correlate with churn risk",
    "- Low-tenure customers likely churn more (to explore in Tableau)",
    sep="\n",
)
