In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the churn modelling CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/Churn_Modelling.csv')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the dtype of every column of the loaded frame.
df.dtypes

In [None]:
# Single flag: is there at least one missing value anywhere in the frame?
df.isna().to_numpy().any()

In [None]:
# Missing-value count per column, largest first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics as computed by DataFrame.describe().
df.describe()

In [None]:
# Frequency of each distinct value in the 'Geography' column.
df['Geography'].value_counts()

In [None]:
# Frequency of each distinct value in the 'Gender' column.
df['Gender'].value_counts()

In [None]:
# Frequency of each distinct value in the 'IsActiveMember' column.
df['IsActiveMember'].value_counts()

In [None]:
# Frequency of each distinct value in the 'Exited' column.
df['Exited'].value_counts()

In [None]:
# Encode the country names as integer codes.
# The dtype guard makes this cell safe to re-run: calling .map() on a column
# that is already encoded finds none of the string keys and would silently
# replace every value with NaN.
geography_codes = {'France': 1, 'Germany': 2, 'Spain': 3}
if not pd.api.types.is_numeric_dtype(df['Geography']):
    df['Geography'] = df['Geography'].map(geography_codes)

In [None]:
# Value counts of 'Geography' once the country names have been mapped to integer codes.
df['Geography'].value_counts()

In [None]:
# Encode gender as 0 (Male) / 1 (Female).
# The dtype guard makes this cell safe to re-run: calling .map() on a column
# that is already encoded finds none of the string keys and would silently
# replace every value with NaN.
gender_codes = {'Male': 0, 'Female': 1}
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map(gender_codes)

In [None]:
# Value counts of 'Gender' once Male/Female have been mapped to 0/1.
df['Gender'].value_counts()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Re-check the column dtypes now that 'Geography' and 'Gender' have been mapped to numbers.
df.dtypes

In [None]:
# Feature matrix: everything except RowNumber, CustomerId, Surname and the
# 'Exited' column (which is used as the target).
non_feature_cols = ['RowNumber', 'CustomerId', 'Surname', 'Exited']
X = df.drop(columns=non_feature_cols)

In [None]:
# Display the feature matrix.
X

In [None]:
# Target vector: the 'Exited' column.
y = df['Exited']

In [None]:
# Display the target vector.
y

In [None]:
# Bar chart of how many customers fall into each 'Exited' class.
churn_ax = sns.countplot(data=df, x='Exited')
churn_ax.set_title('Customer Churn Count')
churn_ax.set_ylabel('Number of Customers')
churn_ax.set_xlabel('Exited (0 = No, 1 = Yes)')

In [None]:
# Churn counts split by gender.
# 'Gender' was mapped to 0/1 earlier in the notebook, so the axis label spells
# out what the tick values mean; the figure can then be read on its own.
gender_ax = sns.countplot(data=df, x='Gender', hue='Exited')
gender_ax.set_title('Customer Churn by Gender')
gender_ax.set_xlabel('Gender (0 = Male, 1 = Female)')
gender_ax.set_ylabel('Number of Customers')
plt.show()


In [None]:
# Churn counts split by country.
# 'Geography' was mapped to 1/2/3 earlier in the notebook, so the axis label
# spells out what the tick values mean; the figure can then be read on its own.
geography_ax = sns.countplot(data=df, x='Geography', hue='Exited')
geography_ax.set_title('Customer Churn by Geography')
geography_ax.set_xlabel('Geography (1 = France, 2 = Germany, 3 = Spain)')
geography_ax.set_ylabel('Number of Customers')
plt.show()


In [None]:
# Pairwise Pearson correlation between the feature columns (target excluded).
corr_matrix = X.corr(method='pearson')
corr_matrix

In [None]:
# Annotated heatmap of the feature correlation matrix.
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
)
heatmap_ax.set_title("Feature Correlation Heatmap (X only)")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# Create the scaler; it is fitted on the training split in a later cell.
scaler=StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply them to it.
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Apply the already-fitted scaler to the test split (transform only; nothing is fitted here).
X_test=scaler.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# SMOTE oversampler with a fixed seed.
smote = SMOTE(random_state=42)

In [None]:
# Resample the training split only; X_test / y_test are not passed to SMOTE.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Print the resampled training features returned by SMOTE.
print(X_resampled)

In [None]:
# Class counts of the training target after SMOTE.
y_resampled.value_counts()

In [None]:
# Class counts of the full target column `y` (all rows, not only the training split).
y.value_counts()

In [None]:
# Class balance of the training target after oversampling.
resampled_ax = sns.countplot(x=y_resampled)
resampled_ax.set(
    title="Class Distribution After SMOTE",
    xlabel="Exited",
    ylabel="Count",
)
plt.show()

In [None]:
# Class balance of the training target before oversampling.
# SMOTE was fitted on the training split (X_train, y_train), so the "before"
# picture must use y_train as well; plotting the full `y` would compare the
# resampled training set against a different population.
sns.countplot(x=y_train)
plt.title("Class Distribution before SMOTE")
plt.xlabel("Exited")
plt.ylabel("Count")
plt.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [None]:
# Candidate classifiers, keyed by the label shown in the ROC legend.
# Seeds are fixed on the estimators that draw random numbers (SVC's probability
# calibration and the random forest) so the reported metrics are reproducible.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVC": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Random Forest": RandomForestClassifier(random_state=42)
}

fig, ax = plt.subplots()

# The loop variable is intentionally still called `model`: after the loop it
# holds the last estimator in `models`, and later cells read it.
for name, model in models.items():
    # Train on the SMOTE-balanced training data. The test split is left at its
    # natural class distribution so the metrics reflect real-world imbalance.
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)

    print(f"{model.__class__.__name__}:")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("AUC:", auc)

    # Add this model's ROC curve to the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.set_title("ROC Curve Comparison")
plt.show()


In [None]:
from collections import Counter

# Class tallies of the training target before and after oversampling.
for caption, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_resampled)):
    print(caption, Counter(labels))

In [None]:
import numpy as np

# Read the importances from the Random Forest explicitly rather than from the
# `model` variable left over from the training loop. That variable only points
# at the Random Forest because it is the last entry in `models`; reordering the
# dict would raise AttributeError or silently report another model's values.
importances = models["Random Forest"].feature_importances_
# Column names come from the unscaled feature frame.
features = X.columns

In [None]:
# Pair each feature with its importance and rank them, most important first.
feat_imp_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feat_imp_df['Feature'], feat_imp_df['Importance'])
# barh draws the first row at the bottom; flip so the top-ranked feature is on top.
ax.invert_yaxis()
ax.set_title("Feature Importance")
plt.show()

**Using the feature importances of the trained Random Forest model, we identified the key drivers of customer churn for a bank. The most influential features were age, salary, credit score, and account balance. These insights allow the bank to design targeted retention strategies for high-risk customers, especially older clients with high balances and fewer products. This enables smarter marketing and can reduce revenue loss by proactively retaining valuable customers.**

In [None]:
# Disabled model-persistence snippet, kept for reference only; nothing in this
# cell executes. If re-enabled it would pickle `model` to 'model.pkl'.
# NOTE(review): `model` is not assigned in this cell -- confirm which estimator
# it refers to before enabling, and only unpickle files from a trusted source.
#import pickle

# Define the filename
#filename = 'model.pkl'

# Save the model using pickle
#with open(filename, 'wb') as file:
    #pickle.dump(model, file)

#print(f"Model saved to {filename}")