In [14]:
## Step 1: Install and Import Dependencies
!pip install --upgrade pip setuptools wheel
!pip install --no-cache-dir pandas numpy scikit-learn seaborn matplotlib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [15]:

## Step 2: Load Dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/dataset_customer_churn/Churn_Modelling.csv"  # Update with actual file path
df = pd.read_csv(file_path)

# Inspect the first few rows
print(df.head())


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [25]:
## Step 3: Data Preprocessing
# Check for missing values
print(df.isnull().sum())

# Display column names to verify expected columns
print("Columns in dataset:", df.columns)

# Preserve Customer ID and Exited status for later reference
if 'Exited' in df.columns:
    customer_info = df[['CustomerId', 'Exited']]
else:
    print("Warning: 'Exited' column not found! Ensure dataset includes target variable.")
    customer_info = None

# Identify categorical columns and apply one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Ensure the required numerical columns exist before scaling
num_cols = ['CreditScore', 'Balance', 'EstimatedSalary']
for col in num_cols:
    if col in df.columns:
        scaler = StandardScaler()
        df[col] = scaler.fit_transform(df[[col]])
    else:
        print(f"Column '{col}' not found in dataset. Please check the file.")

# Select features and target variable
if 'Exited' in df.columns:
    X = df.drop(columns=['Exited', 'CustomerId'])  # Features
    y = df['Exited']  # Target (0 = Retained, 1 = Churned)
else:
    print("Error: 'Exited' column missing after encoding! Check dataset preprocessing.")
    X, y = None, None

if y is not None:
    # Split dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    customer_test_info = customer_info.iloc[y_test.index] if customer_info is not None else None


RowNumber            0
CustomerId           0
CreditScore          0
Age                  0
Tenure               0
                    ..
Surname_Zuyev        0
Surname_Zuyeva       0
Geography_Germany    0
Geography_Spain      0
Gender_Male          0
Length: 2945, dtype: int64
Columns in dataset: Index(['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       ...
       'Surname_Zotova', 'Surname_Zox', 'Surname_Zubarev', 'Surname_Zubareva',
       'Surname_Zuev', 'Surname_Zuyev', 'Surname_Zuyeva', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object', length=2945)


In [26]:
    ## Step 4: Train Models
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(n_estimators=100)
    }

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"Accuracy ({name}):", accuracy_score(y_test, y_pred))
        print(f"Classification Report ({name}):\n", classification_report(y_test, y_pred))
        print(f"Confusion Matrix ({name}):\n", confusion_matrix(y_test, y_pred))


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy (Logistic Regression): 0.8195
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.83      0.97      0.90      1607
           1       0.63      0.20      0.30       393

    accuracy                           0.82      2000
   macro avg       0.73      0.58      0.60      2000
weighted avg       0.79      0.82      0.78      2000

Confusion Matrix (Logistic Regression):
 [[1561   46]
 [ 315   78]]
Training Decision Tree...
Accuracy (Decision Tree): 0.82
Classification Report (Decision Tree):
               precision    recall  f1-score   support

           0       0.89      0.89      0.89      1607
           1       0.54      0.53      0.54       393

    accuracy                           0.82      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.82      0.82      0.82      2000

Confusion Matrix (Decision Tree):
 [[1431  176]
 [ 184  209]]
Training Random Forest...
Accuracy

In [28]:

 ## Step 5: Display Customer Retention Results
print("\nCustomer Retention Predictions:")
if customer_test_info is not None:
    for index, row in customer_test_info.iterrows():
            customer_id = row['CustomerId']
            actual_status = "Churned" if row['Exited'] == 1 else "Retained"
            print(f"Customer ID: {customer_id}, Actual Status: {actual_status}")
    else:
        print("No customer information available for testing.")
else:
    print("Dataset processing failed. Please check for missing target variable.")



Customer Retention Predictions:
Customer ID: 15687492, Actual Status: Retained
Customer ID: 15736963, Actual Status: Retained
Customer ID: 15721730, Actual Status: Retained
Customer ID: 15762134, Actual Status: Retained
Customer ID: 15648898, Actual Status: Retained
Customer ID: 15659064, Actual Status: Retained
Customer ID: 15761986, Actual Status: Retained
Customer ID: 15713354, Actual Status: Churned
Customer ID: 15593454, Actual Status: Retained
Customer ID: 15690134, Actual Status: Retained
Customer ID: 15767474, Actual Status: Churned
Customer ID: 15785367, Actual Status: Churned
Customer ID: 15587507, Actual Status: Churned
Customer ID: 15569764, Actual Status: Retained
Customer ID: 15665062, Actual Status: Retained
Customer ID: 15759966, Actual Status: Retained
Customer ID: 15576352, Actual Status: Retained
Customer ID: 15713604, Actual Status: Retained
Customer ID: 15596797, Actual Status: Retained
Customer ID: 15744127, Actual Status: Retained
Customer ID: 15589420, Actual S