Base on the data from 
https://www.kaggle.com/datasets/anandshaw2001/customer-churn-dataset/data provided by anandshaw2001, licenced CC0.



In [42]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [43]:
#librairies imporation
import pandas as pd
import numpy as np
import sklearn
from faker import Faker

# Data Exploration

In [44]:
#Opening csv file:

file = pd.read_csv("Churn_Modelling.csv")
data = file.copy()
fake = Faker()
# Generate random names for each row in the 'Surname' column
data['Surname'] = data['Surname'].apply(lambda _: fake.last_name())
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Martinez,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Butler,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Ramirez,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Bean,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Turner,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


RowNumber: A unique identifier for each row in the dataset.

CustomerId: Unique customer identification number.

Surname: The last name of the customer (for privacy reasons, consider anonymizing this data if not already done).

CreditScore: The customer's credit score at the time of data collection.

Geography: The customer's country or region, providing insights into location-based trends in churn.

Gender: The customer's gender.

Age: The customer's age, valuable for demographic analysis.

Tenure: The number of years the customer has been with the bank.

Balance: The customer's account balance.

NumOfProducts: The number of products the customer has purchased or subscribed to.

HasCrCard: Indicates whether the customer has a credit card (1) or not (0).

IsActiveMember: Indicates whether the customer is an active member (1) or not (0).

EstimatedSalary: The customer's estimated salary.

Exited: The target variable, indicating whether the customer has churned (1) or not (0).

In [45]:
mean_credit_score_countries = data.groupby("Geography")["CreditScore"].mean()
mean_credit_score_gender = data.groupby("Gender")["CreditScore"].mean()
mean_credit_score_age = data.groupby("Age")["CreditScore"].mean()
print(mean_credit_score_countries, 
      mean_credit_score_gender,
      mean_credit_score_age,
      sep = "\n\n")

Geography
France     649.668329
Germany    651.453567
Spain      651.333872
Name: CreditScore, dtype: float64

Gender
Female    650.831389
Male      650.276892
Name: CreditScore, dtype: float64

Age
18    686.681818
19    665.629630
20    665.050000
21    647.283019
22    636.547619
         ...    
83    678.000000
84    472.500000
85    787.000000
88    513.000000
92    729.000000
Name: CreditScore, Length: 70, dtype: float64


In [46]:
#Creation of group by age
# Define age group boundaries
age_group_boundaries = [18, 25, 40, 55, 70, 100]
age_group_labels = ['18-25', '26-40', '41-55', '56-70', '70+']

# Create age groups using pd.cut()
data['AgeGroup'] = pd.cut(data['Age'], bins=age_group_boundaries, labels=age_group_labels, right=False)
data['AgeGroup']

0       41-55
1       41-55
2       41-55
3       26-40
4       41-55
        ...  
9995    26-40
9996    26-40
9997    26-40
9998    41-55
9999    26-40
Name: AgeGroup, Length: 10000, dtype: category
Categories (5, object): ['18-25' < '26-40' < '41-55' < '56-70' < '70+']

In [48]:
mean_credit_score_agegroup = data.groupby("AgeGroup")["CreditScore"].mean()
mean_credit_score_agegroup

  mean_credit_score_agegroup = data.groupby("AgeGroup")["CreditScore"].mean()


AgeGroup
18-25    653.973742
26-40    651.306510
41-55    648.206963
56-70    648.967168
70+      667.324503
Name: CreditScore, dtype: float64

In [49]:
mean_salary_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].mean()
mean_salary_agegroup

  mean_salary_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].mean()


AgeGroup
18-25    103206.077527
26-40     98901.410626
41-55    102995.410469
56-70     95367.217182
70+       96823.540927
Name: EstimatedSalary, dtype: float64

In [54]:
qmin_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0)
q1_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.25)
q2_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.5)
q3_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.75)
qmax_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=1)

print(qmin_per_agegroup,
      q2_per_agegroup,
      q3_per_agegroup,
      qmax_per_agegroup,
      sep = "\n\n")

AgeGroup
18-25    961.57
26-40     90.07
41-55     11.58
56-70     91.75
70+      705.18
Name: EstimatedSalary, dtype: float64

AgeGroup
18-25    106327.85
26-40     98314.77
41-55    103713.93
56-70     93146.11
70+       97893.40
Name: EstimatedSalary, dtype: float64

AgeGroup
18-25    155782.8900
26-40    147667.0625
41-55    151810.2950
56-70    140606.1050
70+      146097.4300
Name: EstimatedSalary, dtype: float64

AgeGroup
18-25    198830.98
26-40    199953.33
41-55    199992.48
56-70    199493.38
70+      198446.91
Name: EstimatedSalary, dtype: float64


  qmin_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0)
  q1_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.25)
  q2_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.5)
  q3_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=0.75)
  qmax_per_agegroup = data.groupby("AgeGroup")["EstimatedSalary"].quantile(q=1)


In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   RowNumber        10000 non-null  int64   
 1   CustomerId       10000 non-null  int64   
 2   Surname          10000 non-null  object  
 3   CreditScore      10000 non-null  int64   
 4   Geography        10000 non-null  object  
 5   Gender           10000 non-null  object  
 6   Age              10000 non-null  int64   
 7   Tenure           10000 non-null  int64   
 8   Balance          10000 non-null  float64 
 9   NumOfProducts    10000 non-null  int64   
 10  HasCrCard        10000 non-null  int64   
 11  IsActiveMember   10000 non-null  int64   
 12  EstimatedSalary  10000 non-null  float64 
 13  Exited           10000 non-null  int64   
 14  AgeGroup         10000 non-null  category
dtypes: category(1), float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [63]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder
le = LabelEncoder()

# Perform Label Encoding on the 'Geography' column
data['Geography_Id'] = le.fit_transform(data['Geography'])
data['GenderId'] = le.fit_transform(data['Gender'])
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,Exited,AgeGroup,France,Germany,Spain,Label_Geography,Female,Male,Geography_Id,GenderId
0,1,15634602,Martinez,619,France,Female,42,2,0.0,1,...,1,41-55,1,0,0,0,1,0,0,0
1,2,15647311,Butler,608,Spain,Female,41,1,83807.86,1,...,0,41-55,0,0,1,2,1,0,2,0
2,3,15619304,Ramirez,502,France,Female,42,8,159660.8,3,...,1,41-55,1,0,0,0,1,0,0,0
3,4,15701354,Bean,699,France,Female,39,1,0.0,2,...,0,26-40,1,0,0,0,1,0,0,0
4,5,15737888,Turner,850,Spain,Female,43,2,125510.82,1,...,0,41-55,0,0,1,2,1,0,2,0


In [65]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


# Select only the numerical columns (excluding the target variable 'Exited')
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.drop('Exited')

# Separate the target variable ('Exited') and the feature variables
X = data[numerical_cols]
y = data['Exited']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize a RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training set
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model using a confusion matrix and a classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[2342   74]
 [ 330  254]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      2416
           1       0.77      0.43      0.56       584

    accuracy                           0.87      3000
   macro avg       0.83      0.70      0.74      3000
weighted avg       0.86      0.87      0.85      3000

