In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pd.read_csv('/content/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
print(df.columns)

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [11]:
df.drop(columns=['CustomerId', 'Surname'], inplace=True)

In [12]:
print(df.isnull().sum())

RowNumber          0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [13]:
# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

X = df.drop('Exited', axis=1)
y = df['Exited']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_preds))


Logistic Regression Accuracy: 0.815
Random Forest Accuracy: 0.867
Gradient Boosting Accuracy: 0.864


In [16]:
from sklearn.metrics import classification_report

lr_report = classification_report(y_test, lr_preds, target_names=['No Churn', 'Churn'])
print("Logistic Regression Report:\n", lr_report)

rf_report = classification_report(y_test, rf_preds, target_names=['No Churn', 'Churn'])
print("Random Forest Report:\n", rf_report)

gb_report = classification_report(y_test, gb_preds, target_names=['No Churn', 'Churn'])
print("Gradient Boosting Report:\n", gb_report)


Logistic Regression Report:
               precision    recall  f1-score   support

    No Churn       0.83      0.97      0.89      1607
       Churn       0.60      0.18      0.27       393

    accuracy                           0.81      2000
   macro avg       0.71      0.57      0.58      2000
weighted avg       0.78      0.81      0.77      2000

Random Forest Report:
               precision    recall  f1-score   support

    No Churn       0.88      0.97      0.92      1607
       Churn       0.77      0.46      0.58       393

    accuracy                           0.87      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000

Gradient Boosting Report:
               precision    recall  f1-score   support

    No Churn       0.88      0.96      0.92      1607
       Churn       0.75      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.75      

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)

# Best estimator
best_rf = grid_search.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature importance
feature_importances = best_rf.feature_importances_
indices = np.argsort(feature_importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), feature_importances[indices], align="center")
plt.xticks(range(X.shape[1]), df.columns[indices], rotation=90)
plt.tight_layout()
plt.show()


In [None]:
import joblib

joblib.dump(best_rf, 'best_random_forest_model.pkl')

# To load the model later
best_rf_loaded = joblib.load('best_random_forest_model.pkl')


In [None]:
# Example of model monitoring (you can use real-time data or periodic evaluation)
def monitor_model_performance(model, X_test, y_test):
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)

# Periodic evaluation example
monitor_model_performance(best_rf, X_test, y_test)
