# Customer Churn Analysis — Keras Models (Jupyter Notebook)
**Purpose:** End-to-end churn prediction project: data cleaning, EDA, visualizations, and three Keras models (including dropout and multi-feature).
**Instructions:** Upload your dataset to the same folder and set the `DATA_PATH` variable in the first code cell. The notebook will run on a standard Python 3 environment with pandas, scikit-learn, matplotlib, seaborn, and tensorflow installed.
Date: 27 November 2025


In [None]:
# === Setup: location of the dataset (CSV or Excel) ===
# Edit the value below when your file lives elsewhere or has another name,
# e.g. DATA_PATH = 'data/customer_churn.csv'.
# Excel works too: DATA_PATH = 'customer_churn.xlsx' is handled by the loader.
DATA_PATH = "customer_churn.csv"

# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import os

# Report the interpreter version for reproducibility. The previous form was
# wrapped in `... if False else None`, so the print never ran, and `sys` was
# never imported; import it locally and print unconditionally.
import sys

print('Python version:', sys.version)


In [None]:
# === Load dataset (auto-detect CSV / Excel) ===
# Stop immediately with a helpful message when the file is not there.
dataset_missing = not os.path.exists(DATA_PATH)
if dataset_missing:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Upload your dataset or change DATA_PATH.")

# Choose the reader from the (case-insensitive) file suffix.
path_lower = DATA_PATH.lower()
if path_lower.endswith('.csv'):
    df = pd.read_csv(DATA_PATH)
elif path_lower.endswith('.xls') or path_lower.endswith('.xlsx'):
    df = pd.read_excel(DATA_PATH)
else:
    # Unknown suffix: attempt CSV first and fall back to Excel on any failure.
    try:
        df = pd.read_csv(DATA_PATH)
    except Exception:
        df = pd.read_excel(DATA_PATH)

# Quick overview: dimensions, first rows, then per-column dtypes.
print('Dataset shape:', df.shape)
display(df.head())
print('\nColumn dtypes:', df.dtypes, sep='\n')


In [None]:
# === Basic cleaning ===
# Strip stray whitespace from column names. Non-string labels (e.g. integer
# headers) are kept as they are instead of raising AttributeError on .strip().
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

# Ensure TotalCharges numeric
if 'TotalCharges' in df.columns:
    # errors='coerce' turns every value that cannot be parsed into NaN.
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    n_missing = total_charges.isna().sum()
    print('TotalCharges - coerced to numeric. Missing values:', n_missing)
    # Fill or drop - here we fill with 0 (you can choose median or drop).
    # The filled column is assigned back explicitly: calling
    # fillna(..., inplace=True) on a column selection is deprecated in
    # pandas 2.x and does not update df under Copy-on-Write.
    df['TotalCharges'] = total_charges.fillna(0)

# Quick look at target column
if 'Churn' in df.columns:
    print('\nChurn value counts:')
    print(df['Churn'].value_counts())
else:
    print('\nWARNING: No column named "Churn" found. Please ensure target column exists.')

In [None]:
# === A) Data Manipulation tasks ===
# Each task first checks that the columns it needs exist, and prints a
# notice instead of failing when they do not.
available_columns = set(df.columns)

# A.a - How many customers are male (case-insensitive match)
if 'gender' not in available_columns:
    print('Column "gender" not found.')
else:
    is_male = df['gender'].str.lower() == 'male'
    male_count = len(df[is_male])
    print('Total male customers:', male_count)

# A.b - How many customers use DSL as InternetService
if 'InternetService' not in available_columns:
    print('Column "InternetService" not found.')
else:
    uses_dsl = df['InternetService'].str.lower() == 'dsl'
    dsl_count = len(df[uses_dsl])
    print('Customers with DSL InternetService:', dsl_count)

# A.c - Female senior citizens paying by Mailed check
new_customer_female_senior_mailed = pd.DataFrame()
if {'gender', 'SeniorCitizen', 'PaymentMethod'} <= available_columns:
    is_female = df['gender'].str.lower() == 'female'
    is_senior = df['SeniorCitizen'] == 1
    pays_by_mailed_check = df['PaymentMethod'].str.lower() == 'mailed check'
    new_customer_female_senior_mailed = df[is_female & is_senior & pays_by_mailed_check]
    print('Female senior citizens with Mailed check:', len(new_customer_female_senior_mailed))
    display(new_customer_female_senior_mailed.head())
else:
    print('One of the columns gender/SeniorCitizen/PaymentMethod not found.')

# A.d - Customers with tenure below 10 OR TotalCharges below 500
new_customer_tenure_total = pd.DataFrame()
if {'tenure', 'TotalCharges'} <= available_columns:
    short_tenure = df['tenure'] < 10
    low_total = df['TotalCharges'] < 500
    new_customer_tenure_total = df[short_tenure | low_total]
    print('Customers with tenure<10 OR TotalCharges<500:', len(new_customer_tenure_total))
    display(new_customer_tenure_total.head())
else:
    print('Columns tenure and/or TotalCharges not found.')

In [None]:
# === B) Visualizations ===
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# B.a - Churn distribution as a pie chart
if 'Churn' not in df.columns:
    print('Column Churn not found - cannot plot churn distribution.')
else:
    plt.figure(figsize=(6, 6))
    churn_counts = df['Churn'].value_counts()
    churn_counts.plot.pie(autopct='%1.1f%%', startangle=90)
    plt.title('Churn Distribution')
    # Clear the y label that the pie plot would otherwise show.
    plt.ylabel('')
    plt.show()

# B.b - InternetService distribution as a bar plot
if 'InternetService' not in df.columns:
    print('InternetService column not found - cannot plot distribution.')
else:
    plt.figure(figsize=(8, 4))
    # Bars follow the order returned by value_counts().
    service_order = df['InternetService'].value_counts().index
    sns.countplot(x='InternetService', data=df, order=service_order)
    plt.title('Internet Service Distribution')
    plt.xticks(rotation=20)
    plt.show()

In [None]:
# === Preprocessing for Modeling ===
# The target column is mandatory: stop right away when it is absent.
if 'Churn' not in df.columns:
    raise ValueError('Target column Churn not found.')

# Encode the target labels as integers and show the label order used.
le = LabelEncoder()
df['Churn_enc'] = le.fit_transform(df['Churn'])
print('Classes (Churn):', le.classes_)

# Prepare a helper function to split and scale
def prepare_xy(features, *, test_size=0.2, random_state=42, stratify=False):
    """Build scaled train/test inputs and targets from the notebook's ``df``.

    The columns listed in *features* are taken from the module-level ``df``
    as model inputs and ``df['Churn_enc']`` is used as the target. The inputs
    are one-hot encoded with ``pd.get_dummies(drop_first=True)``, split, and
    then standardized with a ``StandardScaler`` that is fitted on the training
    part only, so no test-set statistics leak into training.

    Args:
        features: List of column names in ``df`` to use as model inputs.
        test_size: Passed to ``train_test_split``; defaults to 0.2.
        random_state: Seed passed to ``train_test_split``; defaults to 42.
        stratify: When True, the split is stratified on the target so both
            parts keep the class ratio. Defaults to False, which reproduces
            the original unstratified split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. ``X_*`` are the outputs
        of the scaler's ``fit_transform`` / ``transform``; ``y_*`` are the
        target parts as returned by ``train_test_split``.

    Raises:
        KeyError: If a name in *features* or ``'Churn_enc'`` is not a column
            of ``df``.
    """
    X = pd.get_dummies(df[features].copy(), drop_first=True)
    y = df['Churn_enc'].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [None]:
# === C.a Model 1: tenure -> Churn ===
features = ['tenure']
if not set(features).issubset(df.columns):
    raise ValueError('Required features for Model 1 not found in dataframe: ' + str(features))

X_train, X_test, y_train, y_test = prepare_xy(features)

# The input width is declared with an explicit Input layer. Passing
# `input_dim` to the first Dense layer is a legacy argument that Keras 3
# only accepts with a warning.
model1 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(12, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training rows are held out by Keras for per-epoch validation.
history1 = model1.fit(X_train, y_train, epochs=150, validation_split=0.2, verbose=1)

# Predictions and metrics: sigmoid outputs above 0.5 count as class 1.
y_pred1 = (model1.predict(X_test) > 0.5).astype('int32')
print('Confusion Matrix Model 1:')
print(confusion_matrix(y_test, y_pred1))
print('\nClassification Report:')
print(classification_report(y_test, y_pred1))

In [None]:
# Accuracy vs Epochs - Model 1
# Training accuracy is required; the validation series falls back to an
# empty list when it was not recorded.
plt.figure(figsize=(8, 4))
model1_curves = (
    (history1.history['accuracy'], 'train_acc'),
    (history1.history.get('val_accuracy', []), 'val_acc'),
)
for curve_values, curve_label in model1_curves:
    plt.plot(curve_values, label=curve_label)
plt.title('Model 1 - Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# === C.b Model 2: with Dropout ===
features = ['tenure']
# Same up-front feature check as the other model cells, so a missing column
# is reported with a clear message instead of a KeyError from the helper.
if not set(features).issubset(df.columns):
    raise ValueError('Required features for Model 2 not found in dataframe: ' + str(features))

X_train, X_test, y_train, y_test = prepare_xy(features)

# The input width is declared with an explicit Input layer. Passing
# `input_dim` to the first Dense layer is a legacy argument that Keras 3
# only accepts with a warning.
model2 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(12, activation='relu'),
    Dropout(0.3),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training rows are held out by Keras for per-epoch validation.
history2 = model2.fit(X_train, y_train, epochs=150, validation_split=0.2, verbose=1)

# Sigmoid outputs above 0.5 count as class 1.
y_pred2 = (model2.predict(X_test) > 0.5).astype('int32')
print('Confusion Matrix Model 2:')
print(confusion_matrix(y_test, y_pred2))
print('\nClassification Report:')
print(classification_report(y_test, y_pred2))

In [None]:
# Accuracy vs Epochs - Model 2
# Training accuracy is required; the validation series falls back to an
# empty list when it was not recorded.
plt.figure(figsize=(8, 4))
model2_curves = (
    (history2.history['accuracy'], 'train_acc'),
    (history2.history.get('val_accuracy', []), 'val_acc'),
)
for curve_values, curve_label in model2_curves:
    plt.plot(curve_values, label=curve_label)
plt.title('Model 2 - Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# === C.c Model 3: tenure, MonthlyCharges, TotalCharges ===
features = ['tenure','MonthlyCharges','TotalCharges']
if not set(features).issubset(df.columns):
    missing = set(features) - set(df.columns)
    raise ValueError('Required features for Model 3 missing: ' + str(missing))

X_train, X_test, y_train, y_test = prepare_xy(features)

# The input width is declared with an explicit Input layer. Passing
# `input_dim` to the first Dense layer is a legacy argument that Keras 3
# only accepts with a warning.
model3 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(12, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training rows are held out by Keras for per-epoch validation.
history3 = model3.fit(X_train, y_train, epochs=150, validation_split=0.2, verbose=1)

# Sigmoid outputs above 0.5 count as class 1.
y_pred3 = (model3.predict(X_test) > 0.5).astype('int32')
print('Confusion Matrix Model 3:')
print(confusion_matrix(y_test, y_pred3))
print('\nClassification Report:')
print(classification_report(y_test, y_pred3))

In [None]:
# Accuracy vs Epochs - Model 3
# Training accuracy is required; the validation series falls back to an
# empty list when it was not recorded.
plt.figure(figsize=(8, 4))
model3_curves = (
    (history3.history['accuracy'], 'train_acc'),
    (history3.history.get('val_accuracy', []), 'val_acc'),
)
for curve_values, curve_label in model3_curves:
    plt.plot(curve_values, label=curve_label)
plt.title('Model 3 - Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# === Save trained models ===
# Creates a 'models' folder (relative to the working directory) and saves
# every model that exists in the notebook session. Each model is handled on
# its own, so one missing or failing model no longer prevents the remaining
# ones from being written.
# NOTE: no scaler is saved here; the scaler fitted inside prepare_xy is not
# returned to the notebook namespace.
os.makedirs('models', exist_ok=True)
for model_name in ('model1', 'model2', 'model3'):
    trained_model = globals().get(model_name)
    if trained_model is None:
        print(f'Skipping {model_name}: not defined in this session.')
        continue
    target_path = os.path.join('models', f'{model_name}.h5')
    try:
        trained_model.save(target_path)
    except Exception as e:
        # Best-effort export: report the problem and carry on with the rest.
        print(f'Could not save {model_name}:', e)
    else:
        print('Model saved to', target_path)

# === Closing Notes ===
# - Change DATA_PATH to point to your dataset before running.
# - If you want a lighter run, reduce epochs (e.g., 20) while experimenting.
# - Consider stratified splitting if class imbalance exists.
# - To export this notebook to PDF, use File -> Download as -> PDF (or nbconvert).