# Customer Churn Prediction

**Import libraries**

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import joblib

**Load dataset**

In [8]:
# Read the raw churn dataset into a DataFrame.
# NOTE(review): the path is a hard-coded absolute location (looks like a Colab
# upload directory) - adjust it when running elsewhere.
data = pd.read_csv(filepath_or_buffer='/content/Churn_Modelling.csv')

**dataset info**

In [9]:
# Print column names, non-null counts, dtypes and memory usage of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


**Drop unnecessary data**

In [10]:
# Remove the identifier columns (row number, customer id, surname) so they are
# not passed to the models as features.
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis="columns")

**Data Encoding for categorical data**

1.   Gender is label-encoded: Female = 0, Male = 1
2.   Geography is one-hot encoded; France is the dropped baseline category



In [11]:
# Learn the Gender classes first, then replace the column with integer codes.
# The fitted encoder is kept because it is saved and reused at prediction time.
label_encoder = LabelEncoder().fit(data["Gender"])
data["Gender"] = label_encoder.transform(data["Gender"])

# One-hot encode Geography; drop_first=True removes the first category so the
# remaining indicator columns are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True, columns=["Geography"])

**Check for null values**

In [12]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

Unnamed: 0,0
CreditScore,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0
HasCrCard,0
IsActiveMember,0
EstimatedSalary,0
Exited,0


**Scale numerical features**

In [13]:
scaler = StandardScaler()
numerical_features = ["CreditScore", "Age", "Balance", "EstimatedSalary"]

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into preprocessing. Without stratification, train_test_split selects rows
# from the row count and random_state alone, so using the same test_size and
# random_state as the train-test split cell below picks exactly the rows that
# end up in X_train. Keep the two calls in sync.
train_rows = train_test_split(data, test_size=0.2, random_state=42)[0]
scaler.fit(train_rows[numerical_features])

# Apply the training-set mean/std to every row (train and test alike).
data[numerical_features] = scaler.transform(data[numerical_features])

**Separate features (X) and target (y)**

In [14]:
# Target: the Exited flag; features: every remaining column.
y = data["Exited"]
X = data.drop("Exited", axis="columns")

**Train-Test Split**

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [18]:
# Only the last expression of a cell is displayed, so return both shapes together.
X_train.shape, X_test.shape

((8000, 11), (2000, 11))

**Training with different models**


1.  Logistic Regression
2.  Random Forest
3.  Gradient Boosting



**Logistic Regression**

In [19]:
# Baseline linear model: fit on the training split, then predict hard labels
# for the held-out rows.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

**Random Forest**

In [20]:
# Random forest with 100 trees; the seed makes the fitted forest reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

**Gradient Boosting**

In [21]:
# Gradient-boosted trees with library defaults apart from the fixed seed.
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

**Model Evaluation**

In [22]:
def evaluate_model(y_test, predictions, scores=None):
    """Print accuracy, precision, recall, F1 and ROC-AUC for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    predictions : array-like
        Predicted hard class labels; used for accuracy, precision, recall and F1.
    scores : array-like, optional
        Positive-class probabilities or decision scores, for example
        ``model.predict_proba(X_test)[:, 1]``. When given, ROC-AUC is computed
        from these values, which is what the metric is designed for. When
        omitted, ROC-AUC falls back to ``predictions``, which only evaluates a
        single decision threshold.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # ROC-AUC ranks samples, so prefer continuous scores when the caller has them.
    ranking_input = predictions if scores is None else scores

    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Precision:", precision_score(y_test, predictions))
    print("Recall:", recall_score(y_test, predictions))
    print("F1 Score:", f1_score(y_test, predictions))
    print("ROC-AUC:", roc_auc_score(y_test, ranking_input))

# Report the same metric set for each fitted model, in training order.
# Each heading is printed verbatim before that model's metrics.
for heading, model_predictions in (
    ("Logistic Regression:", lr_predictions),
    ("\nRandom Forest:", rf_predictions),
    ("\nXGBoost:", xgb_predictions),
):
    print(heading)
    evaluate_model(y_test, model_predictions)

Logistic Regression:
Accuracy: 0.8115
Precision: 0.5563380281690141
Recall: 0.2010178117048346
F1 Score: 0.2953271028037383
ROC-AUC: 0.5809071634753171

Random Forest:
Accuracy: 0.866
Precision: 0.7659574468085106
Recall: 0.4580152671755725
F1 Score: 0.5732484076433121
ROC-AUC: 0.7118950013538099

XGBoost:
Accuracy: 0.864
Precision: 0.7137809187279152
Recall: 0.5139949109414759
F1 Score: 0.5976331360946746
ROC-AUC: 0.7317952152716092


**Hyperparameter Tuning**

In [23]:
# Candidate values explored for the random forest (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
).fit(X_train, y_train)

# Winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Estimator refitted on the full training split with the winning parameters
best_rf_model = grid_search.best_estimator_

# Score the tuned forest on the held-out test split
best_rf_predictions = best_rf_model.predict(X_test)
print("\nBest Random Forest:")
evaluate_model(y_test, best_rf_predictions)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}

Best Random Forest:
Accuracy: 0.8645
Precision: 0.7629310344827587
Recall: 0.45038167938931295
F1 Score: 0.5664
ROC-AUC: 0.7080782074606801


**Save the Model**

In [24]:
# Save the best model (the tuned random forest selected by the grid search)
joblib.dump(best_rf_model, "best_rf_model.pkl")

# Save preprocessing steps (scaler and encoder) so new data can be transformed
# the same way at prediction time. The last call's return value (the list of
# written file names) is what the cell displays.
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

**Use the saved model to predict on new data**

In [27]:
# Load the persisted model and preprocessing artifacts.
# NOTE: joblib files are pickles and can execute code when loaded; only load
# artifacts written by this notebook or another trusted source.
loaded_model = joblib.load("best_rf_model.pkl")
loaded_scaler = joblib.load("scaler.pkl")
loaded_label_encoder = joblib.load("label_encoder.pkl")

# Recover the column layout from the fitted artifacts themselves rather than
# from in-memory training variables, so this cell also works in a fresh session
# where only the .pkl files exist. Both estimators were fitted on DataFrames,
# so scikit-learn recorded the column names in feature_names_in_.
X_train_columns = list(loaded_model.feature_names_in_)
scaled_columns = list(loaded_scaler.feature_names_in_)

# Test with new data for prediction
new_data = pd.DataFrame({
    "CreditScore": [600],
    "Gender": [loaded_label_encoder.transform(["Male"])[0]],
    "Age": [40],
    "Tenure": [5],
    "Balance": [100000],
    "NumOfProducts": [2],
    "HasCrCard": [1],
    "IsActiveMember": [1],
    "EstimatedSalary": [50000],
    "Geography": ['France'],
})

# One-hot encode Geography, then align to the columns the model was trained on:
# indicator columns missing from this single row are added as 0, and the
# baseline category column that was dropped during training is discarded.
new_data = pd.get_dummies(new_data, columns=["Geography"], drop_first=False)
new_data = new_data.reindex(columns=X_train_columns, fill_value=0)

# Scaling: apply the statistics learned at training time (no refitting).
new_data[scaled_columns] = loaded_scaler.transform(new_data[scaled_columns])

# Prediction
new_prediction = loaded_model.predict(new_data)
print("Churn Prediction (1=Churned, 0=Not Churned):", new_prediction[0])


Churn Prediction (1=Churned, 0=Not Churned): 0
