# Import Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


# Load dataset


In [None]:

# Path of the CSV file that holds the customer churn records.
file_path = "customer_churn_data.csv"
# Parse the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)



# Display basic info

In [None]:

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df.info()
# Show the first rows of the frame.
# NOTE(review): display() is not imported in this file; presumably it is the
# built-in supplied by the IPython/Jupyter runtime.
display(df.head())



# Check for missing values and handle them

In [None]:

print("Missing Values:")
print(df.isnull().sum())

from sklearn.impute import SimpleImputer

# The target label must never be imputed: filling it with the most frequent
# class would fabricate labels. Rows without a label are dropped instead.
target_col = "Churn"
df.dropna(subset=[target_col], inplace=True)

# NOTE: the imputers below are fit on the full dataset, i.e. before the
# train/test split performed later in this file, so test-set statistics leak
# into the imputed values. Acceptable for exploration; fit on the training
# split only for a rigorous evaluation.

# Numerical columns: fill gaps with the column mean.
numerical_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns.drop([target_col], errors='ignore')
)
imputer_num = SimpleImputer(strategy='mean')
# SimpleImputer rejects an empty column selection, so only run it when there
# is something to impute.
if len(numerical_cols) > 0:
    df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])

# Categorical columns: fill gaps with the most frequent value.
categorical_cols = (
    df.select_dtypes(include=['object'])
    .columns.drop([target_col], errors='ignore')
)
imputer_cat = SimpleImputer(strategy='most_frequent')
if len(categorical_cols) > 0:
    df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

# Check again for missing values
print("Missing Values after handling:")
print(df.isnull().sum())




# Drop irrelevant columns

In [None]:

# Remove the customer identifier column in place; errors='ignore' turns this
# into a no-op when the column is not present.
df.drop(labels=["CustomerID"], axis=1, inplace=True, errors='ignore')


# Convert target variable to binary

In [None]:

# Encode the target as 1 (churned) / 0 (retained).
churn_mapping = {'Yes': 1, 'No': 0}
churn_encoded = df['Churn'].map(churn_mapping)

# Series.map() yields NaN for every value missing from the mapping (e.g.
# different casing, stray whitespace, or an already-encoded column when this
# cell is re-run). Fail loudly here rather than letting NaN labels reach the
# models.
unmapped_rows = churn_encoded.isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, 'Churn'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Churn' values {unexpected_values}; "
        f"expected one of {sorted(churn_mapping)}"
    )

df['Churn'] = churn_encoded.astype("int64")


# Encode categorical features

In [None]:

# One-hot encode the categorical columns. drop_first=True keeps k-1 indicator
# columns for a feature with k levels, dropping the first level.
df = pd.get_dummies(data=df, drop_first=True)



# Exploratory Data Analysis (EDA)

In [None]:

# Count of rows per target class.
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', ax=churn_ax)
churn_ax.set_title("Churn Distribution")
plt.show()

# Pairwise correlation of every column in the encoded frame, annotated with
# two-decimal coefficients.
correlation_matrix = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Feature Correlation Heatmap")
plt.show()



# Define features and target variable

In [None]:

# Target vector: the encoded churn label.
y = df["Churn"]
# Feature matrix: every column except the label.
X = df.drop(labels=["Churn"], axis=1, errors='ignore')

# Train-test split

In [None]:

# Hold out 20% of the rows for evaluation. stratify=y keeps the churn /
# non-churn ratio the same in both splits, so the test metrics are not skewed
# by an unlucky draw on an imbalanced target. random_state fixes the shuffle
# for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



# Feature Scaling

In [None]:

# Learn the per-feature mean and standard deviation from the training split
# only, then apply the same transformation to both splits so no test-set
# statistics influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



# Logistic Regression

In [None]:

# Fit a logistic regression with library defaults on the scaled training data
# and predict labels for the held-out split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Accuracy is kept in its own variable because the model comparison at the end
# of the file reads it.
log_reg_acc = accuracy_score(y_test, y_pred_log)

# Assemble the whole report first, then emit it in one go.
log_reg_report_lines = [
    "Logistic Regression Performance:",
    f"Accuracy: {log_reg_acc:.2f}",
    f"Precision: {precision_score(y_test, y_pred_log):.2f}",
    f"Recall: {recall_score(y_test, y_pred_log):.2f}",
    f"F1 Score: {f1_score(y_test, y_pred_log):.2f}\n",
    classification_report(y_test, y_pred_log),
]
print("\n".join(log_reg_report_lines))



# Confusion Matrix for Logistic Regression

In [None]:

# Rows are actual classes, columns are predicted classes.
cm_log = confusion_matrix(y_test, y_pred_log)
fig_log, ax_log = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_log, annot=True, fmt='d', cmap='Blues', ax=ax_log)
ax_log.set_title("Confusion Matrix - Logistic Regression")
ax_log.set_xlabel("Predicted")
ax_log.set_ylabel("Actual")
plt.show()



# Random Forest

In [None]:

# Fit a 100-tree random forest (seeded for reproducibility) and predict labels
# for the held-out split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Accuracy is kept in its own variable because the model comparison at the end
# of the file reads it.
rf_acc = accuracy_score(y_test, y_pred_rf)

# Assemble the whole report first, then emit it in one go.
rf_report_lines = [
    "Random Forest Performance:",
    f"Accuracy: {rf_acc:.2f}",
    f"Precision: {precision_score(y_test, y_pred_rf):.2f}",
    f"Recall: {recall_score(y_test, y_pred_rf):.2f}",
    f"F1 Score: {f1_score(y_test, y_pred_rf):.2f}\n",
    classification_report(y_test, y_pred_rf),
]
print("\n".join(rf_report_lines))



# Confusion Matrix for Random Forest

In [None]:

# Rows are actual classes, columns are predicted classes.
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig_rf, ax_rf = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', ax=ax_rf)
ax_rf.set_title("Confusion Matrix - Random Forest")
ax_rf.set_xlabel("Predicted")
ax_rf.set_ylabel("Actual")
plt.show()



# XGBoost

In [None]:

# Gradient-boosted trees evaluated with log loss.
# The former use_label_encoder=False argument is intentionally omitted: the
# option is deprecated, current XGBoost releases no longer use it and emit an
# "unused parameter" warning when it is passed. The target is already encoded
# as 0/1 integers, so no label encoding is needed.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Accuracy is kept in its own variable because the model comparison at the end
# of the file reads it.
xgb_acc = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Performance:")
print(f"Accuracy: {xgb_acc:.2f}")
print(f"Precision: {precision_score(y_test, y_pred_xgb):.2f}")
print(f"Recall: {recall_score(y_test, y_pred_xgb):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred_xgb):.2f}\n")
print(classification_report(y_test, y_pred_xgb))


# Confusion Matrix for XGBoost

In [None]:

# Rows are actual classes, columns are predicted classes.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
fig_xgb, ax_xgb = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues', ax=ax_xgb)
ax_xgb.set_title("Confusion Matrix - XGBoost")
ax_xgb.set_xlabel("Predicted")
ax_xgb.set_ylabel("Actual")
plt.show()


# Feature Importance Visualization

In [None]:

# Pair each random-forest importance score with its feature name. The names
# come from X because X_train was replaced by a plain array during scaling.
# Sort so the most important feature is listed first.
feature_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Horizontal bar chart: one bar per feature, length = importance score.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_importances.index, ax=importance_ax)
importance_ax.set_title("Feature Importance - Random Forest")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Features")
plt.show()


# Model Comparison

In [None]:

# Summarize the test-set accuracy of the three fitted models and name the best.
print("\nModel Performance Summary:")

# (accuracy, display name) pairs, ordered from the simplest model to the most
# complex one. The order matters for tie-breaking below.
model_accuracies = [
    (log_reg_acc, "Logistic Regression"),
    (rf_acc, "Random Forest"),
    (xgb_acc, "XGBoost"),
]
for model_acc, model_name in model_accuracies:
    print(f"{model_name} Accuracy: {model_acc:.2f}")

# Compare on accuracy only. Comparing the whole tuple would break accuracy
# ties by the alphabetical order of the model names; with an explicit key,
# max() returns the first maximal entry, so a tie goes to the simpler model.
best_model = max(model_accuracies, key=lambda entry: entry[0])
print(f"\nThe best performing model is {best_model[1]} with an accuracy of {best_model[0]:.2f}")
