In [2]:
# =========================
# TELCO CUSTOMER CHURN MODEL
# =========================

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# Step 1: Load dataset
df = pd.read_csv('Telco-Customer-Churn.csv')

# Step 2: Drop columns that are not used as model inputs.
# Reassigning (instead of inplace=True) keeps the step a plain transformation of its input;
# errors='ignore' tolerates files in which some of these columns are absent.
df = df.drop(
    columns=['customerID', 'Customer Churn Counter', 'Total counter', 'Tenure In years'],
    errors='ignore',
)

# Step 3: Clean TotalCharges
# errors='coerce' turns entries that cannot be parsed as numbers (e.g. blank strings) into NaN,
# which are then replaced by the column median.
# NOTE(review): the median is computed on the full dataset before the train/test split, so
# test rows influence the imputed value — consider imputing inside the preprocessor instead.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Step 4: Separate features and target
X = df.drop(columns='Churn')
Y = df['Churn'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for any label missing from the mapping; stop here rather than
# handing NaN targets to the classifier.
if Y.isna().any():
    unexpected = sorted(df.loc[Y.isna(), 'Churn'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Churn' column: {unexpected}")

# Step 5: Define categorical and numeric columns
# The order of these lists determines the column order of the preprocessed matrix.
cat_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
            'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
            'PaperlessBilling', 'PaymentMethod']

num_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Step 6: Create preprocessing pipeline
# NOTE(review): per the scikit-learn docs, with drop='first' and handle_unknown='ignore' an
# unseen category is encoded as all zeros, i.e. the same as the dropped first category —
# confirm that this is acceptable for inference on new customers.
onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
scaler = StandardScaler()

# Columns of X that appear in neither list are not passed to any transformer.
preprocessor = ColumnTransformer([
    ('cat', onehot, cat_cols),
    ('num', scaler, num_cols)
])

# Step 7: Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Step 8: Fit preprocessor on the training rows only, then apply it to both splits
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Step 9: Train the XGBoost classifier
# (an earlier LogisticRegression(max_iter=500) baseline was replaced by this model)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train_prep, y_train)

# Step 10: Evaluate the model
y_train_pred = xgb.predict(X_train_prep)
y_test_pred = xgb.predict(X_test_prep)

# Train & Test Accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print("üéØ Train Accuracy:", round(train_acc, 4))
print("üß† Test Accuracy:", round(test_acc, 4))

# Confusion Matrix & Classification Report (test split only)
print("\nüìä Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nüìà Classification Report:\n", classification_report(y_test, y_test_pred))

# Step 11: Save model and preprocessor
# Both artifacts are needed at inference time: the model expects preprocessed features.
joblib.dump(xgb, "telecom_churn_model.joblib")
joblib.dump(preprocessor, "telecom_preprocessor.joblib")

print("\nüíæ Model and preprocessor saved successfully!")

# Step 12: Test with new data
# Reload the artifacts from disk so the sample prediction uses exactly what was saved.
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load("telecom_churn_model.joblib")
preprocessor_loaded = joblib.load("telecom_preprocessor.joblib")

sample = pd.DataFrame([{
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 20,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 75.35,
    'TotalCharges': 1500.50
}])

sample_encoded = preprocessor_loaded.transform(sample)
prediction = model.predict(sample_encoded)[0]
label = "Yes (Will Churn)" if prediction == 1 else "No (Will Stay)"
print("\nüîç Prediction for Sample Customer:", label)


üéØ Train Accuracy: 0.8694
üß† Test Accuracy: 0.7999

üìä Confusion Matrix:
 [[927 109]
 [173 200]]

üìà Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1036
           1       0.65      0.54      0.59       373

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409


üíæ Model and preprocessor saved successfully!

üîç Prediction for Sample Customer: No (Will Stay)


In [3]:
# Step 1: Read the Telco churn CSV from the notebook's working directory into `df`.
df = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure In years,Customer Churn Counter,Total counter
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0,0,1
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,One year,No,Mailed check,56.95,1889.5,No,3,0,1
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0,1,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,4,0,1
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0,1,1


In [5]:
# Step 2: Drop columns that are not used as model inputs.
# Reassigning (instead of inplace=True) keeps the step a plain transformation of its input;
# errors='ignore' tolerates frames in which some of these columns are already absent.
df = df.drop(
    columns=['customerID', 'Customer Churn Counter', 'Total counter', 'Tenure In years'],
    errors='ignore',
)


In [7]:
# Step 3: Clean TotalCharges (coerce non-numeric entries such as blank strings to NaN, then fill with the median)
# Parse TotalCharges as numbers (unparseable entries become NaN), then replace the
# resulting NaN values with the median of the parsed column.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .pipe(lambda charges: charges.fillna(charges.median()))
)


In [8]:
# Step 4: Separate features and target
X = df.drop(columns='Churn')
Y = df['Churn'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for any label missing from the mapping; stop here rather than
# handing NaN targets to the classifier.
if Y.isna().any():
    unexpected = sorted(df.loc[Y.isna(), 'Churn'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Churn' column: {unexpected}")


In [9]:
# Step 5: Name the feature columns by type.
# List order is significant: it determines the column order of the preprocessed matrix.

# String-valued columns that get one-hot encoded.
cat_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Numeric columns that get standardised.
num_cols = [
    'SeniorCitizen',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]


In [10]:
# Step 6: Build the preprocessing transformer.
# Numeric columns are standardised; categorical columns are one-hot encoded as a dense array
# with the first level of each feature dropped.
# NOTE(review): per the scikit-learn docs, with drop='first' and handle_unknown='ignore' an
# unseen category is encoded as all zeros, i.e. the same as the dropped first category —
# confirm that this is acceptable for inference on new customers.
scaler = StandardScaler()
onehot = OneHotEncoder(
    handle_unknown='ignore',
    drop='first',
    sparse_output=False,
)

# Columns of the input that appear in neither list are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', onehot, cat_cols),
        ('num', scaler, num_cols),
    ]
)

In [11]:
# Step 7: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


In [12]:
# Step 8: Learn the encoding/scaling from the training rows only ...
X_train_prep = preprocessor.fit_transform(X=X_train)
# ... and reuse the already-fitted transformer on the held-out rows.
X_test_prep = preprocessor.transform(X=X_test)


In [13]:
# Step 9: Train the XGBoost classifier.
# XGBClassifier comes from the notebook's top import block, alongside the other dependencies.
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
# The fitted estimator is the cell's last expression, so the notebook displays its parameters.
xgb.fit(X_train_prep, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
# Predict class labels for both splits with the fitted classifier.
y_train_pred, y_test_pred = (
    xgb.predict(features) for features in (X_train_prep, X_test_prep)
)

In [15]:
# Accuracy on the training split and on the held-out split, reported to four decimals.
train_acc, test_acc = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print("üéØ Train Accuracy:", round(train_acc, 4))
print("üß† Test Accuracy:", round(test_acc, 4))


üéØ Train Accuracy: 0.8694
üß† Test Accuracy: 0.7999


In [16]:
# Error breakdown on the held-out split: raw confusion counts first, then the
# per-class precision / recall / F1 table.
test_confusion = confusion_matrix(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("\nüìä Confusion Matrix:\n", test_confusion)
print("\nüìà Classification Report:\n", test_report)



üìä Confusion Matrix:
 [[927 109]
 [173 200]]

üìà Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1036
           1       0.65      0.54      0.59       373

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [17]:
# Step 11: Persist the fitted classifier and the fitted preprocessor to the working directory.
# Both files are needed at inference time: the model expects preprocessed features.
joblib.dump(value=xgb, filename="telecom_churn_model.joblib")
joblib.dump(value=preprocessor, filename="telecom_preprocessor.joblib")

print("\n Model and preprocessor saved successfully!")



üíæ Model and preprocessor saved successfully!


In [18]:
# Reload the saved artifacts from disk.
# joblib files are pickle-based: only load files from a trusted source.
preprocessor_loaded = joblib.load(filename="telecom_preprocessor.joblib")
model = joblib.load(filename="telecom_churn_model.joblib")


In [19]:
# Step 12: Test with new data

# Feature values for a single example customer; the keys are the columns listed in
# cat_cols and num_cols.
sample_customer = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 20,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 75.35,
    'TotalCharges': 1500.50,
}
sample = pd.DataFrame([sample_customer])

# Encode the row with the reloaded preprocessor, then score it with the reloaded model.
sample_encoded = preprocessor_loaded.transform(sample)
prediction = model.predict(sample_encoded)[0]

# Class 1 is the 'Yes' (churn) label in the target mapping.
if prediction == 1:
    label = "Yes (Will Churn)"
else:
    label = "No (Will Stay)"
print("\nüîç Prediction for Sample Customer:", label)


üîç Prediction for Sample Customer: No (Will Stay)
