**CUSTOMER CHURN ANALYSIS**

In [127]:
import pandas as pd#for dataframes
import numpy as np#for numerical computing
#for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder#label encoding
from sklearn.model_selection import train_test_split#splitting the data
#importing machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the raw churn export, named once so the load step is the only place to edit.
DATA_PATH = "Customer-Churn-Records (1).csv"
# Read the CSV into the working DataFrame used by every later cell.
churn_df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded data.
churn_df.head(n=5)

In [None]:
churn_df.columns  # column labels of the loaded frame

In [None]:
churn_df.shape  # (number of rows, number of columns)

In [None]:
churn_df.dtypes  # dtype of each column

In [None]:
# Count missing values per column.
churn_df.isna().sum()

In [None]:
churn_df.duplicated().sum()  # number of rows that repeat an earlier row exactly

In [None]:
# Drop the columns this analysis treats as unnecessary.
# Reassigning (instead of inplace=True) with errors="ignore" keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns
# that were already removed.
churn_df = churn_df.drop(
    columns=["RowNumber", "CustomerId", "Surname"],
    errors="ignore",
)

------------------------------------------------------------------------------
visualization

In [None]:
# Columns treated as categorical in the plots below (the target "Exited" included).
categorical_cols = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "Card Type",
    "Exited",
]
churn_df_cat = churn_df[categorical_cols]

In [None]:
# One pie chart per categorical column, laid out on a 2-row x 3-column grid.
fig, axes = plt.subplots(2, 3, figsize=(12, 10))
for ax, col in zip(axes.flat, churn_df_cat.columns):
    # Use the value_counts Series directly (index = category, values = count).
    # This avoids relying on the column names produced by reset_index(),
    # which differ between pandas versions.
    counts = churn_df[col].value_counts()
    ax.pie(x=counts.to_numpy(), labels=counts.index, autopct="%0.1f%%")
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Number of rows for each value of the target column.
churn_df["Exited"].value_counts()

From the above graph we can see that the majority of customers have not left the bank.<br>
Total number of customers = 10000<br>
Total number of customers who left = 2038<br>
Churn = Total number of customers who left / Total number of customers<br>
Churn = 2038 / 10000 = 0.2038<br>
Churn percentage = 20.38%


From the above pie charts we can see the various categories of customers, including those who left the bank and those who haven't, along with the percentage of each category.

In [None]:
# Count plot per categorical column, split by churn status ("Exited"),
# on a 2-row x 3-column grid.
# The unused value_counts() table that was rebuilt on every iteration has been
# removed; countplot aggregates the raw column itself.
fig, axes = plt.subplots(2, 3, figsize=(12, 10))
for ax, col in zip(axes.flat, churn_df_cat.columns):
    sns.countplot(data=churn_df, x=col, hue="Exited", palette="YlGnBu", ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Numerical columns; churn_df_num is reused later for the correlation heatmap.
numeric_cols = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
]
churn_df_num = churn_df[numeric_cols]

# These two columns are drawn as bar counts; the rest get a 20-bin histogram with a KDE overlay.
discrete_cols = ("Tenure", "NumOfProducts")

fig, axes = plt.subplots(3, 2, figsize=(12, 10))
for ax, col in zip(axes.flat, churn_df_num.columns):
    if col not in discrete_cols:
        sns.histplot(data=churn_df, x=col, bins=20, kde=True, ax=ax)
    else:
        sns.countplot(data=churn_df, x=col, ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Complaint counts, with one bar per satisfaction score inside each Complain value.
complain_ax = sns.countplot(data=churn_df, x="Complain", hue="Satisfaction Score")
complain_ax.set_title("complain count")

From the plot we can see that the customers who left the bank mostly had a moderate satisfaction score.

In [None]:
# Age distribution in 10 bins with KDE overlay, coloured by churn status.
fig, age_ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=churn_df,
    x="Age",
    hue="Exited",
    palette="YlGnBu",
    bins=10,
    kde=True,
    ax=age_ax,
)
age_ax.set_title("Which age group people left the bank more and which age group is the majority of the costumer")

From the above graph we can say that the majority of the customers who left the bank are in the 20-50 age group, and the majority of the customers who haven't left the bank are in the 40-60 age group.

In [None]:
# Points earned per card type, with separate boxes for each churn status.
fig, box_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=churn_df,
    x="Card Type",
    y="Point Earned",
    hue="Exited",
    palette="pastel",
    ax=box_ax,
)
box_ax.grid()
box_ax.set_title("Card type vs point earned")

Box Plot: A box plot shows the distribution of the data, indicating the median, quartiles, and potential outliers for each group.

The box represents the interquartile range (IQR), with the line inside the box showing the median.
The whiskers extend to show the range of the data, excluding outliers.
Outliers may be shown as individual points outside the whiskers.

From the above graph we can see that customers who left the bank have earned more points than customers who haven't left the bank.

In [None]:
# Customer counts per country, split by churn status.
fig, geo_ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=churn_df, x="Geography", hue="Exited", palette="pastel", ax=geo_ax)
geo_ax.set_title("Which country has more churn rate")
geo_ax.grid()

From the above plot we can say that Germany has a higher churn rate than France and Spain.

In [None]:
# Contingency table: rows are Gender values, columns are Exited values,
# cells are the number of customers in each combination.
cross_tab_agg = pd.crosstab(churn_df["Gender"], churn_df["Exited"])
# Display the table itself. Calling value_counts() on it only counted how often
# each distinct row of the table occurs, which hides the Gender x Exited counts.
cross_tab_agg

In [None]:
# Credit score against age, one point per customer, coloured by churn status.
fig, scatter_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=churn_df,
    x="Age",
    y="CreditScore",
    hue="Exited",
    palette="YlGnBu",
    ax=scatter_ax,
)
scatter_ax.set_title("Credit card score vs age")

From the plot we can see that customers who haven't left the bank are mostly in the 20-40 age group, while customers who have left the bank are mostly in the 40-60 age group. We can also see that the majority of older customers have a credit score of about 700, and the majority of younger customers have a credit score of about 500.

In [None]:
# Pairwise correlation of the numerical columns, drawn as an annotated heatmap.
corr_matrix = churn_df_num.corr()
sns.heatmap(data=corr_matrix, annot=True, linewidths=0.7)

From the above heatmap we can see that the numerical features are correlated with each other within a range of -0.7 to 0.7.

------------------------------------------------------------------------------------------------------------------------
data preprocessing

In [None]:
# Convert the two monetary columns to 64-bit integers.
# astype() performs the cast in one vectorised step instead of calling
# np.int64 once per row via apply(); the fractional part is truncated either way.
for money_col in ("Balance", "EstimatedSalary"):
    churn_df[money_col] = churn_df[money_col].astype(np.int64)

In [None]:
# Replace each string-valued categorical column with integer codes.
# A separate fitted encoder is kept per column in `label_encoders`, so the
# code -> category mapping (encoder.classes_ / inverse_transform) stays available.
# Refitting one shared encoder kept only the mapping of the last column.
# `le` is still bound afterwards (to the "Gender" encoder), as before.
# NOTE(review): re-running this cell on already-encoded columns refits the
# encoders on integers and loses the original string classes.
label_encoders = {}
for cat_col in ["Card Type", "Geography", "Gender"]:
    le = LabelEncoder()
    churn_df[cat_col] = le.fit_transform(churn_df[cat_col])
    label_encoders[cat_col] = le
churn_df.dtypes

In [None]:
# Features are every column except the target; the target is "Exited".
y = churn_df["Exited"]
X = churn_df.drop(columns="Exited")

# 70 % training / 30 % testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(f"Training set size: {X_train.shape}, {y_train.shape}")
print(f"Testing set size: {X_test.shape}, {y_test.shape}")

-----------------------------------------------------------------------------------------
model building

In [None]:
# Logistic regression: fit on the training split, evaluate on the test split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# The accuracy is printed explicitly: a bare expression in the middle of a
# cell is evaluated and then thrown away, so it was never shown.
print("Accuracy:", accuracy_score(y_test, y_pred_log))
# sep="\n" puts the label on its own line so the report's column header stays aligned.
print("Classification report", classification_report(y_test, y_pred_log), sep="\n")

# Confusion matrix: rows are the actual classes, columns the predicted classes.
cmf_log = confusion_matrix(y_test, y_pred_log)
ax = sns.heatmap(cmf_log, annot=True, linewidths=0.7, fmt="d")
ax.set(title="Logistic regression - confusion matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

In [None]:
#random forest
random=RandomForestClassifier(n_estimators=100)
random.fit(X_train,y_train)#training
y_pred_random=random.predict(X_test)#predicting
accuracy_score(y_test,y_pred_random)#checking accuracy
print(classification_report(y_test,y_pred_random))#classification report
cmf_random=confusion_matrix(y_test,y_pred_random)#confusion matrix
sns.heatmap(cmf_random,annot=True,linewidths=0.7,fmt="d")#heatmap

In [None]:
#decision tree
dec=DecisionTreeClassifier(criterion="entropy",max_depth=5)
dec.fit(X_train,y_train)#training
y_pred_dec=dec.predict(X_test)#predicting
accuracy_score(y_test,y_pred_dec)#checking accuracy
print(classification_report(y_test,y_pred_dec))#classification report
cmf_decision=confusion_matrix(y_test,y_pred_dec)#confusion matrix
sns.heatmap(cmf_decision,annot=True,linewidths=0.7,fmt="d")

In [None]:
#calculating roc auc score
log=roc_auc_score(y_test,y_pred_log)
random=roc_auc_score(y_test,y_pred_random)
decision=roc_auc_score(y_test,y_pred_dec)
print("ROC AUC SCORES")
print("\nLogisticRegression: ",log)
print("\nRandomForestClassifier: ",random)
print("\nDecisionTreeClassifier: ",decision)