In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib # For saving our model
import os # To handle file paths

In [None]:
DATA_PATH = os.path.join("..", "data", "my_data.csv")


df = pd.read_csv(DATA_PATH)


print("Dataset loaded successfully. Here are the first 5 rows:")
df.head()

Dataset loaded successfully. Here are the first 5 rows:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Cell 3: Data Cleaning and Preprocessing
# Drop the customerID column as it's a unique identifier and not useful for prediction
df.drop('customerID', axis=1, inplace=True)

# Convert 'TotalCharges' to a numeric type. Some values might be spaces,
# so 'errors='coerce'' will turn them into 'NaN' (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill any 'NaN' values in 'TotalCharges' with the column's median.
# The median is robust to outliers, making it a good choice for imputation.
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Convert the target variable 'Churn' into a binary format (1 for 'Yes', 0 for 'No')
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

print("Data cleaning complete.")
df.info() # Check for missing values and data types

Data cleaning complete.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [5]:
X = df.drop('Churn', axis=1)
# 'y' contains only the 'Churn' column
y = df['Churn']

# Identify which columns are numerical and which are categorical
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=object).columns.tolist()

print(f"\nNumerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")


Numerical features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")


Training set has 5634 samples.
Testing set has 1409 samples.


In [None]:
# Cell 6: Create the Preprocessing Pipeline
# This pipeline defines the transformations for our data before it reaches the model.

# Create a transformer for numerical features: scales data to have a mean of 0 and variance of 1.
numerical_transformer = StandardScaler()

# Create a transformer for categorical features: converts text categories into numerical format.
# handle_unknown='ignore' prevents errors if the test set has categories not seen in the training set.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Use ColumnTransformer to apply the correct transformation to the correct columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
gb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', GradientBoostingClassifier(random_state=42))])

# Train the model on the training data
print("--- Training Gradient Boosting Model ---")
gb_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_pipeline.predict(X_test)

print("\n--- Initial Gradient Boosting Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

--- Training Gradient Boosting Model ---

--- Initial Gradient Boosting Model Evaluation ---
Accuracy: 0.8062
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.52      0.59       374

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [9]:
# Cell 8: Hyperparameter Tuning with GridSearchCV
# ====================================================================
# Define the parameter grid to search through for the best model settings.
# Keys are named '<step_name>__<parameter_name>'
param_grid = {
    'classifier__n_estimators': [100, 200],      # Number of decision trees
    'classifier__learning_rate': [0.05, 0.1],   # Step size shrinkage
    'classifier__max_depth': [3, 4]             # Maximum depth of a tree
}

# Create the GridSearchCV object to automate the tuning process
# cv=3 means we use 3-fold cross-validation.
grid_search = GridSearchCV(gb_pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)

print("\n--- Starting Hyperparameter Tuning ---")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters found: {grid_search.best_params_}")


--- Starting Hyperparameter Tuning ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best parameters found: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}


In [None]:
# Cell 9: Evaluate the Final Tuned Model
# 'grid_search.best_estimator_' is the pipeline with the best found parameters.
final_model = grid_search.best_estimator_
y_pred_tuned = final_model.predict(X_test)

print("\n--- Final Tuned Model Evaluation ---")
print(f"Tuned Accuracy: {accuracy_score(y_test, y_pred_tuned):.4f}")
print(classification_report(y_test, y_pred_tuned))


--- Final Tuned Model Evaluation ---
Tuned Accuracy: 0.8034
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [None]:
# Cell 10: Save the Final Model
# Define the path to save the model file.
MODEL_DIR = os.path.join("..", "models")
MODEL_PATH = os.path.join(MODEL_DIR, "churn_prediction_model.joblib")

# Create the 'models' directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the entire pipeline object
joblib.dump(final_model, MODEL_PATH)

print(f"\nModel saved successfully at: {MODEL_PATH}")


Model saved successfully at: ..\models\churn_prediction_model.joblib
