In [1]:
import pandas as pd
import os

# Update this with your actual file name
file_path = r'C:\Users\towhi\Downloads\customer_churn.csv'

# Check if file exists
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
    print("✅ File loaded successfully.")
    print(data.head())  # Show the first few rows
else:
    print("❌ File not found. Please check the file name and path.")


✅ File loaded successfully.
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Streamin

DATA Preprocessing

In [2]:
# View general info
print(data.info())

# Check for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
data_encoded = pd.get_dummies(data, drop_first=True)


In [8]:
X = data_encoded.drop('Churn_Yes', axis=1)
y = data_encoded['Churn_Yes']


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


# Train Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Train model
lr = LogisticRegression(max_iter=1000)  # Increased max_iter to avoid convergence issues
lr.fit(X_train, y_train)

# Predict
y_pred_lr = lr.predict(X_test)

# Evaluate
print("📊 Logistic Regression:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC Score:", roc_auc_score(y_test, lr.predict_proba(X_test)[:,1]))


📊 Logistic Regression:
[[1026   10]
 [ 336   37]]
              precision    recall  f1-score   support

           0       0.75      0.99      0.86      1036
           1       0.79      0.10      0.18       373

    accuracy                           0.75      1409
   macro avg       0.77      0.54      0.52      1409
weighted avg       0.76      0.75      0.68      1409

ROC-AUC Score: 0.8449969981471321


# Train Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Train model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred_rf = rf.predict(X_test)

# Evaluate
print("\n🌲 Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))



🌲 Random Forest:
[[963  73]
 [204 169]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.87      1036
           1       0.70      0.45      0.55       373

    accuracy                           0.80      1409
   macro avg       0.76      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409

ROC-AUC Score: 0.8562875878559525


# Random Forest Tuning with GridSearchCV

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Minimal grid to avoid memory issues
param_grid = {
    'n_estimators': [100],          # Only 1 value
    'max_depth': [None, 10],        # Try None and a limited depth
    'min_samples_split': [2]        # Keep default splitting
}

# Use fewer folds (3) and only 1 CPU core
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=1
)

# Fit the model
grid_search.fit(X_train, y_train)



# Evaluate the Tuned Random Forest

In [15]:
# Get the best model from search
best_rf = grid_search.best_estimator_

# Predict on test set
y_pred_best_rf = best_rf.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("\n🌲 Tuned Random Forest Results:")
print(confusion_matrix(y_test, y_pred_best_rf))
print(classification_report(y_test, y_pred_best_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, best_rf.predict_proba(X_test)[:,1]))

# Print the best parameters found
print("✅ Best Parameters:", grid_search.best_params_)



🌲 Tuned Random Forest Results:
[[963  73]
 [204 169]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.87      1036
           1       0.70      0.45      0.55       373

    accuracy                           0.80      1409
   macro avg       0.76      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409

ROC-AUC Score: 0.8562875878559525
✅ Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


# Conclusion

This project successfully implemented a machine learning pipeline to predict customer churn based on customer data. Two models — Logistic Regression and Random Forest — were trained and evaluated.

Random Forest outperformed Logistic Regression across all key metrics, including accuracy, recall, F1-score, and ROC-AUC. Further improvement was achieved through hyperparameter tuning, using a simplified grid to manage memory limitations.

The final tuned Random Forest model showed strong performance and is well-suited for deployment in a real-world business setting to identify at-risk customers and reduce churn.