In [4]:
# 1. Load required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib



In [7]:

# 2. Load dataset from correct local path
file_path = r"WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(file_path)


In [8]:


# 3. Data Cleaning and Preprocessing
data = data.drop('customerID', axis=1)
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})


In [9]:

# 4. Train/Test Split
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [14]:
# 5. Preprocessing Pipelines
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = list(set(X.columns) - set(numeric_features))

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [15]:

# 6. Define reusable pipeline function
def build_pipeline(model):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])


In [16]:


# 7. Create model pipelines
logreg_pipeline = build_pipeline(LogisticRegression(max_iter=1000))
rf_pipeline = build_pipeline(RandomForestClassifier(random_state=42))


In [17]:

# 8. Define hyperparameter grids
logreg_params = {
    'classifier__C': [0.1, 1, 10]
}

rf_params = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}


In [18]:


# 9. GridSearch for Logistic Regression
print("Tuning Logistic Regression...")
grid_logreg = GridSearchCV(logreg_pipeline, logreg_params, cv=5, scoring='accuracy')
grid_logreg.fit(X_train, y_train)


Tuning Logistic Regression...
Tuning Random Forest...


In [19]:

# 10. GridSearch for Random Forest
print("Tuning Random Forest...")
grid_rf = GridSearchCV(rf_pipeline, rf_params, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)



Best Logistic Regression Model:
{'classifier__C': 10}
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409


Best Random Forest Model:
{'classifier__max_depth': 10, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1036
           1       0.67      0.52      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [20]:

# 11. Evaluation
print("\nBest Logistic Regression Model:")
print(grid_logreg.best_params_)
y_pred_logreg = grid_logreg.predict(X_test)
print(classification_report(y_test, y_pred_logreg))

print("\nBest Random Forest Model:")
print(grid_rf.best_params_)
y_pred_rf = grid_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))


✅ Model saved successfully!


In [None]:

# 12. Save Best Model
joblib.dump(grid_rf.best_estimator_, "best_telco_churn_model.joblib")
print(" Model saved successfully as 'best_telco_churn_model.joblib'")






<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>