In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df
df['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [3]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
# plt.figure(figsize=(20, 5))

# plt.subplot(1, 4, 1)
# sns.histplot(df['tenure'], kde=True)
# plt.title('tenure')

# plt.subplot(1, 4, 2)
# sns.histplot(df['MonthlyCharges'], kde=True)
# plt.title('MonthlyCharges')

# plt.subplot(1, 4, 3)
# sns.histplot(df['TotalCharges'], kde=True)
# plt.title('TotalCharges')

# plt.tight_layout()
# plt.show()

In [5]:
df = df.drop(['customerID'], axis=1)

In [6]:
# Replace empty strings in 'TotalCharges' with NaN
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)

# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

# Drop rows with any missing values
df = df.dropna()

In [7]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [8]:
num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
cat_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'DeviceProtection', 'TechSupport',	'StreamingTV',	'StreamingMovies',	'Contract',	'PaperlessBilling',	'PaymentMethod']
numeric_transformer = Pipeline(steps=[
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features)
])

In [9]:
X_encoded = preprocessor.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [11]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
class_weight_dict = {0: 1.0, 1: 5.0}

In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), class_weight=class_weight_dict, epochs=100, batch_size=32, callbacks=[early_stop], verbose=1)

Epoch 1/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6021 - loss: 1.0965 - val_accuracy: 0.6425 - val_loss: 0.6045
Epoch 2/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6700 - loss: 0.9713 - val_accuracy: 0.7114 - val_loss: 0.5610
Epoch 3/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6942 - loss: 0.9673 - val_accuracy: 0.6553 - val_loss: 0.6504
Epoch 4/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6738 - loss: 0.9581 - val_accuracy: 0.6496 - val_loss: 0.6773
Epoch 5/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6748 - loss: 0.9314 - val_accuracy: 0.6397 - val_loss: 0.6252
Epoch 6/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6777 - loss: 0.9148 - val_accuracy: 0.6688 - val_loss: 0.5608
Epoch 7/100
[1m176/17

In [None]:
y_pred = (model.predict(X_test) > 0.6).astype(int)
print("Validation Accuracy:", accuracy_score(y_test, y_pred))

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Validation Accuracy: 0.7327647476901208


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.72      0.80      1033
           1       0.50      0.76      0.60       374

    accuracy                           0.73      1407
   macro avg       0.70      0.74      0.70      1407
weighted avg       0.79      0.73      0.75      1407

ROC AUC: 0.7429531347873127


In [None]:
import joblib
from tensorflow.keras.models import save_model

# --- Add this code after your model.fit() line ---

# 1. Save the scikit-learn preprocessor
# We use joblib as it's efficient for objects with large numpy arrays
joblib.dump(preprocessor, 'preprocessor.pkl')

# 2. Save the trained Keras neural network
# The recommended format is .h5 or .keras, which saves the architecture, weights, and optimizer
model.save('churn_model.h5')

print("Preprocessor saved to preprocessor.pkl")
print("Model saved to churn_model.h5")