# Extract

In [839]:
import pandas as pd

In [840]:
# Data source: https://www.kaggle.com/datasets/radheshyamkollipara/bank-customer-churn
# The CSV location can be overridden with the CHURN_DATA_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
import os
from pathlib import Path

DATA_PATH = Path(
    os.environ.get(
        "CHURN_DATA_PATH",
        r"D:\Projects\Customer Churn Prediction\Customer-Churn-Records.csv",
    )
)
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

# Cleaning 

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.head()

In [846]:
# RowNumber, CustomerId and Surname are not needed for the analysis.
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [None]:
df.columns

In [None]:
df['CreditScore'].max()

In [849]:
# Downcast CreditScore to NumPy int16, the same kind of dtype used for the
# other downcast columns (the nullable 'Int16' extension dtype is not needed).
# NOTE(review): assumes CreditScore has no missing values — astype('int16')
# raises if it does.
df['CreditScore'] = df['CreditScore'].astype('int16')

In [None]:
df['Age'].max()

In [851]:
# Store Age as int16 rather than int64.
df = df.astype({'Age': 'int16'})

In [None]:
df.max()

In [853]:
# Downcast the remaining integer columns: int8 for the small-valued columns,
# int16 for Tenure and Point Earned.
compact_dtypes = dict.fromkeys(
    [
        'NumOfProducts',
        'HasCrCard',
        'IsActiveMember',
        'Exited',
        'Complain',
        'Satisfaction Score',
    ],
    'int8',
)
compact_dtypes.update({'Tenure': 'int16', 'Point Earned': 'int16'})
df = df.astype(compact_dtypes)

In [None]:
df['Gender'].unique()

In [855]:
# Gender is stored as a pandas categorical.
df = df.astype({'Gender': 'category'})

In [None]:
df['Card Type'].unique()

In [857]:
# Card Type is stored as a pandas categorical.
df = df.astype({'Card Type': 'category'})

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Show every row whose (Gender, Age) pair already occurred in an earlier row.
# NOTE(review): Gender + Age is not a customer key, so this presumably flags
# most of the table; a full-row check (df.duplicated()) may have been the
# intent — confirm.
df[df.duplicated(subset=['Gender','Age'])]

In [None]:
print(df['Age'].min())
print(df['NumOfProducts'].min())
print(df['Point Earned'].min())

# Exploratory Data Analysis (EDA)

In [862]:
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
# Draw one count plot per categorical / flag column.
categorical_cols = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
    'Complain',
    'Card Type',
]

for column_name in categorical_cols:
    fig, ax = plt.subplots(figsize=(5, 3))
    sb.countplot(data=df, x=column_name, ax=ax)
    ax.set_title(f'Count of {column_name}')
    plt.show()


In [None]:
df['Age'].describe()

In [None]:
df['Age'].plot(kind='hist')

In [None]:
df['Age'].plot(kind='kde')

In [None]:
df['Age'].skew()

In [None]:
df['Age'].plot(kind='box')

In [None]:
df[df['Age']>65]['Age'].count()

In [None]:
df[df['Age']>65]['Age'].count()/len(df['Age'])

In [None]:
(df[df['Age']>65]['Age'].count()/len(df['Age']))*100

In [None]:
df[df['Age']>65]['Age'].plot(kind='hist')

In [875]:
# First and third quartile of Age, and the inter-quartile range between them.
age_values = df['Age']
Q1 = age_values.quantile(0.25)
Q3 = age_values.quantile(0.75)
IQR = Q3 - Q1

In [None]:
print(Q1)
print(Q3)
print(IQR)

In [877]:
# Outlier fences: 1.5 IQRs below the first quartile and above the third.
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
print(lower_limit)
print(upper_limit)

In [879]:
# Keep only rows whose Age lies inside both IQR fences, matching the way the
# CreditScore filter applies its lower and upper limits.
# .copy() makes the result an independent frame so later column assignments
# do not trigger SettingWithCopyWarning.
df = df[(df['Age'] >= lower_limit) & (df['Age'] <= upper_limit)].copy()

In [None]:
df['Age'].plot(kind='box')

In [None]:
df['Age'].describe()

In [None]:
df.head()

In [None]:
df['CreditScore'].describe()

In [None]:
df['CreditScore'].plot(kind='kde')

In [None]:
df['CreditScore'].plot(kind='box')

In [886]:
# First and third quartile of CreditScore, and the inter-quartile range.
# Note: Q1 / Q3 / IQR are reused names and now describe CreditScore, not Age.
credit_scores = df['CreditScore']
Q1 = credit_scores.quantile(0.25)
Q3 = credit_scores.quantile(0.75)
IQR = Q3 - Q1

In [None]:
print(Q1)
print(Q3)
print(IQR)

In [None]:
# CreditScore outlier fences (1.5 IQRs beyond each quartile), printed one per line.
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_limit, upper_limit, sep='\n')

In [None]:
df.count()

In [890]:
# Drop rows whose CreditScore falls outside the fences (both bounds inclusive).
# .copy() detaches the filtered frame so the later drop and column assignments
# operate on an independent DataFrame instead of a slice
# (avoids SettingWithCopyWarning).
df = df[df['CreditScore'].between(lower_limit, upper_limit)].copy()

In [None]:
df['CreditScore'].plot(kind='box')

In [None]:
df['Tenure'].describe()

In [None]:
df['Tenure'].plot(kind='hist')

In [None]:
df['Tenure'].plot(kind='kde')

In [None]:
df['Tenure'].plot(kind='box')

In [None]:
df.head()

In [None]:
df['Balance'].describe()

In [None]:
df['Balance'].plot(kind='hist')

In [None]:
df['Balance'].plot(kind='kde')

In [None]:
df['Balance'].plot(kind='box')

In [None]:
# Summary statistics for EstimatedSalary. describe must be called: without the
# parentheses the cell only shows the bound-method repr.
df['EstimatedSalary'].describe()

In [None]:
df['EstimatedSalary'].plot(kind='hist')

In [None]:
df['EstimatedSalary'].plot(kind='kde')

In [None]:
df['EstimatedSalary'].plot(kind='box')

In [None]:
df['Point Earned'].describe()

In [None]:
df['Point Earned'].plot(kind='hist')

In [None]:
df['Point Earned'].plot(kind='kde')

In [None]:

df['Point Earned'].plot(kind='box')

In [None]:
df['Geography'].unique()

In [None]:
sb.countplot(x='Geography', hue='Exited', data = df)
plt.title('Churn by Geography')
plt.show()

In [None]:
sb.countplot(x='Gender', hue='Exited', data = df)
plt.title('Churn by Gender')
plt.show()

In [None]:
df.head()

In [None]:
sb.countplot(x='NumOfProducts', hue='Exited', data = df)
plt.title('Churn by Num of Products')
plt.show()

In [None]:
# Exited counts for each HasCrCard value.
sb.countplot(x='HasCrCard', hue='Exited', data=df)
plt.title('Churn by HasCrCard')
plt.show()

In [None]:
# Exited counts for each IsActiveMember value.
sb.countplot(x='IsActiveMember', hue='Exited', data=df)
plt.title('Churn by IsActiveMember')
plt.show()

In [None]:
# Exited counts for each Complain value.
sb.countplot(x='Complain', hue='Exited', data=df)
plt.title('Churn by Complain')
plt.show()

In [None]:
# Exited counts for each Satisfaction Score value.
sb.countplot(x='Satisfaction Score', hue='Exited', data=df)
plt.title('Churn by Satisfaction Score')
plt.show()

In [None]:
# Exited counts for each Card Type value.
sb.countplot(x='Card Type', hue='Exited', data=df)
plt.title('Churn by Card Type')
plt.show()

In [None]:
pd.crosstab(df['Exited'],df['Gender'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['Geography'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['NumOfProducts'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['HasCrCard'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['IsActiveMember'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['Complain'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['Satisfaction Score'],normalize='columns')*100

In [None]:
pd.crosstab(df['Exited'],df['Card Type'],normalize='columns')*100

In [None]:
# Age density for the Exited == 0 rows versus the Exited == 1 rows,
# drawn as two KDE curves on the same axes.
for exited_flag, legend_label in ((0, 'Not Exited'), (1, 'Exited')):
    df.loc[df['Exited'] == exited_flag, 'Age'].plot(kind='kde', label=legend_label)

plt.legend()
plt.show()

In [None]:
df.head()

## Feature Engineering

In [None]:
# Pairwise correlations between the float64 / int16 / int8 columns,
# rendered as an annotated heatmap.
numerical_df = df.select_dtypes(include=['float64', 'int16', 'int8'])
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [930]:
# Remove the features that are not carried forward into modelling.
# Reassigning (instead of inplace=True) avoids mutating a filtered frame in
# place, and errors='ignore' lets the cell be re-run once the columns are gone.
dropped_features = [
    'CreditScore',
    'Tenure',
    'HasCrCard',
    'EstimatedSalary',
    'Satisfaction Score',
    'Card Type',
    'Point Earned',
]
df = df.drop(columns=dropped_features, errors='ignore')

In [None]:
df.columns

In [None]:
# Correlation heatmap recomputed on the numeric columns that remain in df.
numeric_dtypes = ['float64', 'int16', 'int8']
numerical_df = df.select_dtypes(include=numeric_dtypes)
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [933]:
import numpy as np
from scipy import stats

# Rows with any |z-score| at or above this value are treated as outliers.
Z_THRESHOLD = 3

# Screen only the non-binary numeric features (the same columns that are
# standardised later). The 0/1 flag columns and the Exited target are left
# out: a z-score cut-off is not meaningful for binary columns, and rows should
# not be filtered on the label itself.
screened_cols = [c for c in ('Age', 'Balance', 'NumOfProducts') if c in df.columns]
numerical_cols = df[screened_cols]

# Absolute z-score of every value within its own column
z_scores = np.abs(stats.zscore(numerical_cols))

# Keep the rows where every screened column is below the threshold.
# NOTE(review): df_clean is not referenced by any later cell in this notebook;
# the modelling cells continue to use df.
df_clean = df[(z_scores < Z_THRESHOLD).all(axis=1)]


In [934]:
# # Create age groups
# df['AgeGroup'] = pd.cut(df['Age'], bins=[18, 30, 40, 50, 60, 80], labels=['18-30', '30-40', '40-50', '50-60', '60+'])
# sb.countplot(data=df, x='AgeGroup', hue='Exited')
# plt.title('Age Group vs Exited')
# plt.show()


In [None]:
# Pairplot for selected columns
sb.pairplot(df[['Age', 'Balance', 'Exited']], hue='Exited')
plt.show()


In [None]:
df.columns

In [None]:
df.head()

# Preprocessing

In [None]:
# Alternative kept for reference (one-hot encoding instead of label encoding):
# from sklearn.preprocessing import LabelEncoder
# # One-Hot Encoding for Geography and AgeGroup
# df = pd.get_dummies(df, columns=['Geography', 'AgeGroup'], drop_first=True)

# Print the Geography values first and leave df.head() as the final expression:
# a bare expression in the middle of a cell produces no output.
print(df['Geography'].unique())
df.head()


In [939]:
# LabelEncoder is imported in this cell because the only other import of it in
# the notebook is commented out; without it this cell fails with NameError.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each fitted mapping (classes_) stays available
# for decoding predictions later. LabelEncoder assigns codes in sorted order.
gender_encoder = LabelEncoder()
geography_encoder = LabelEncoder()

df['Gender'] = gender_encoder.fit_transform(df['Gender'])  # 0 = Female, 1 = Male
df['Geography'] = geography_encoder.fit_transform(df['Geography'])  # 0 = France, 1 = Germany, 2 = Spain

# Backward-compatible alias: `le` used to end up fitted on Geography.
le = geography_encoder

In [None]:
df.head()

In [941]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

SCALED_COLS = ['Age', 'Balance', 'NumOfProducts']

# Fit the scaler on the training rows only, so test-set statistics do not leak
# into the features. The row positions are drawn with the same test_size and
# random_state as the train/test split in the modelling section, which
# reproduces that partition as long as the number of rows does not change in
# between. Keep the two calls in sync.
row_positions = list(range(len(df)))
train_positions, _ = train_test_split(row_positions, test_size=0.25, random_state=42)

scaler = StandardScaler()
scaler.fit(df.iloc[train_positions][SCALED_COLS])

# Apply the training-set mean / std to every row.
df[SCALED_COLS] = scaler.transform(df[SCALED_COLS])

In [None]:
df[['Age','Balance','NumOfProducts']]

# Model Selection and Development

In [943]:
from sklearn.model_selection import train_test_split

In [944]:
# Separate the Exited target from the remaining feature columns.
y = df['Exited']
X = df.drop(columns='Exited')

In [None]:
X

In [None]:
y

In [947]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition repeatable.
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [948]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Logistic Regression: fit on the training split, score on the held-out split.
lg = LogisticRegression()
lg.fit(X_train, y_train)

y_pred = lg.predict(X_test)
reportl = classification_report(y_test, y_pred)
accuracyl = accuracy_score(y_test, y_pred)

# Show accuracy (as a percentage) followed by the per-class report
print(f"Accuracy Score of Logistic Regression: {accuracyl * 100:.2f}%")
print("Classification Report of Logistic Regression:\n", reportl)

In [None]:
# Random Forest Classifier
# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the reported metrics, are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
accuracyr = accuracy_score(y_test, y_pred)
reportr = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy Score of Random Forest Classifier: {accuracyr * 100:.2f}%")
print("Classification Report of Random Forest Classifier:\n", reportr)

In [None]:
# K-Nearest Neighbours with library defaults: fit on train, evaluate on test.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
reportk = classification_report(y_test, y_pred)
accuracyk = accuracy_score(y_test, y_pred)

# Show accuracy (as a percentage) followed by the per-class report
print(f"Accuracy Score of KNN: {accuracyk * 100:.2f}%")
print("Classification Report of KNN:\n", reportk)

In [None]:
# Support Vector Classifier with library defaults: fit on train, evaluate on test.
svm = SVC()
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
reports = classification_report(y_test, y_pred)
accuracys = accuracy_score(y_test, y_pred)

# Show accuracy (as a percentage) followed by the per-class report
print(f"Accuracy Score of SVM: {accuracys * 100:.2f}%")
print("Classification Report of SVM:\n", reports)

In [None]:
# Gaussian Naive Bayes: fit on train, evaluate on test.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
reportg = classification_report(y_test, y_pred)
accuracyg = accuracy_score(y_test, y_pred)

# Show accuracy (as a percentage) followed by the per-class report
print(f"Accuracy Score of Naive Bayes Classifier: {accuracyg * 100:.2f}%")
print("Classification Report of Naive Bayes Classifier:\n", reportg)

In [None]:
# Decision Tree
# random_state is fixed so tie-breaking between equally good splits, and
# therefore the reported metrics, are reproducible across runs.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
accuracyd = accuracy_score(y_test, y_pred)
reportd = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy Score of Decision Tree Classifier: {accuracyd * 100:.2f}%")
print("Classification Report of Decision Tree Classifier:\n", reportd)

In [None]:
# Gradient Boosting Classifier
# random_state is fixed so the reported metrics are reproducible across runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

# Dedicated names: accuracyg / reportg already hold the Naive Bayes results
# and must not be overwritten.
accuracygb = accuracy_score(y_test, y_pred)
reportgb = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy Score of GradientBoostingClassifier: {accuracygb * 100:.2f}%")
print("Classification Report of GradientBoostingClassifier:\n", reportgb)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate settings
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Search with a fresh, seeded estimator so the selected parameters are
# reproducible and the cell does not depend on the state of a previously
# fitted model. The 27 x 5 fits are independent, so n_jobs=-1 runs them on
# all available cores.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the 5-fold search on the training split only
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model (refitted on the whole training split by GridSearchCV)
best_gb_model = grid_search.best_estimator_

# Make predictions with the tuned model
y_pred = best_gb_model.predict(X_test)

# Evaluate the tuned model on the held-out split
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy Score of Tuned Gradient Boosting Classifier: {accuracy * 100:.2f}%")
print("Classification Report of Tuned Gradient Boosting Classifier:\n", report)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned model over the full X / y.
cv_scores = cross_val_score(best_gb_model, X, y, cv=5, scoring='accuracy')
mean_cv_percent = cv_scores.mean() * 100

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {mean_cv_percent:.2f}%")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature importances of the tuned model, ranked from largest to smallest.
importances = best_gb_model.feature_importances_
indices = np.argsort(importances)[::-1]

bar_positions = range(X.shape[1])
ranked_names = [X.columns[i] for i in indices]

# Bar chart with one bar per feature, labelled with the column name.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance - Gradient Boosting")
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
fig.tight_layout()
plt.show()


In [965]:
import pickle

# Persist the tuned estimator selected by the grid search, instead of
# refitting an untuned, unseeded GradientBoostingClassifier just for saving.
model = best_gb_model

# Save the trained model
# NOTE(review): the fitted scaler and label encoders are not saved here, so
# whatever loads model.pkl must supply features preprocessed the same way.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)
