**Importing the prerequisites**

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import RandomizedSearchCV

**Loading encoded dataset**

In [3]:
df = pd.read_csv('/content/Encoded_Customer_Churn.csv')
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50,0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90,0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45,0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60,1


**Splitting data into training and testing sets**

In [6]:
#splitting target and features
x = df.drop('Churn',axis=1)
y = df['Churn']

In [7]:
#train test split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [10]:
#understanding imbalance
print(y_train.value_counts())
print(y_test.value_counts())

Churn
0    4138
1    1496
Name: count, dtype: int64
Churn
0    1036
1     373
Name: count, dtype: int64


**SMOTE** techinque

as we know there is imbalance in target column

In [11]:
smote = SMOTE(random_state=42)
x_train_smote,y_train_smote = smote.fit_resample(x_train,y_train)


In [14]:
# after smote
print(y_train_smote.value_counts())



Churn
0    4138
1    4138
Name: count, dtype: int64


**Model training**

In [17]:
models = {

    "DecisionTree" : DecisionTreeClassifier(random_state=42),
    "RandomForest" : RandomForestClassifier(random_state=42),
    "XGBoost" : XGBClassifier(random_state=42)
}

In [20]:
#dictionary to store cross validation score
cv_scores = {}

#perform 5 fold cross validation for all models
for model_name,model in models.items():
  print(f"Training {model_name} with default parameters")
  scores= cross_val_score(model,x_train_smote,y_train_smote,cv=5,scoring="accuracy")
  cv_scores[model_name] = scores
  print(f"Cross validation scores for {model_name} : {np.mean(scores):.2f}")
  print("-"*70)

Training DecisionTree with default parameters
Cross validation scores for DecisionTree : 0.78
----------------------------------------------------------------------
Training RandomForest with default parameters
Cross validation scores for RandomForest : 0.84
----------------------------------------------------------------------
Training XGBoost with default parameters
Cross validation scores for XGBoost : 0.83
----------------------------------------------------------------------


In [24]:
print('--- Model Evaluation ---')
predictions = {}

for model_name, model in models.items():
    print(f"Training {model_name} model...")
    model.fit(x_train_smote, y_train_smote)
    model_predictions = model.predict(x_test)
    predictions[model_name] = model_predictions
    print(f"{model_name} training complete and predictions made.\n")

# Evaluate model performance
for model_name, model_predictions in predictions.items():
    print(f"--- {model_name} Performance ---")
    accuracy = accuracy_score(y_test, model_predictions)
    conf_matrix = confusion_matrix(y_test, model_predictions)
    class_report = classification_report(y_test, model_predictions)

    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("\n")

--- Model Evaluation ---
Training DecisionTree model...
DecisionTree training complete and predictions made.

Training RandomForest model...
RandomForest training complete and predictions made.

Training XGBoost model...
XGBoost training complete and predictions made.

--- DecisionTree Performance ---
Accuracy: 0.73
Confusion Matrix:
[[824 212]
 [166 207]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1036
           1       0.49      0.55      0.52       373

    accuracy                           0.73      1409
   macro avg       0.66      0.68      0.67      1409
weighted avg       0.74      0.73      0.74      1409



--- RandomForest Performance ---
Accuracy: 0.78
Confusion Matrix:
[[878 158]
 [154 219]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1036
           1       0.58      0.59      0.58       373

    accuracy 

In [26]:
param_dist_dt = {
    'max_depth': list(range(3, 21)),
    'min_samples_split': list(range(2, 11)),
    'min_samples_leaf': list(range(1, 6))
}

print("Hyperparameter distribution for Decision Tree:")
print(param_dist_dt)

Hyperparameter distribution for Decision Tree:
{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5]}


## Define Hyperparameter Distributions for Random Forest

Define the hyperparameter search space for the Random Forest Classifier, including parameters like `n_estimators`, `max_depth`, `min_samples_split`, and `min_samples_leaf`.


In [27]:
param_dist_rf = {
    'n_estimators': list(range(100, 501, 50)),
    'max_depth': list(range(5, 21)),
    'min_samples_split': list(range(2, 11)),
    'min_samples_leaf': list(range(1, 6))
}

print("Hyperparameter distribution for Random Forest:")
print(param_dist_rf)

Hyperparameter distribution for Random Forest:
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5]}


## Define Hyperparameter Distributions for XGBoost

Define the hyperparameter search space for the XGBoost Classifier, including parameters like `n_estimators`, `max_depth`, `learning_rate`, and `subsample`.


In [28]:
param_dist_xgb = {
    'n_estimators': list(range(100, 501, 50)),
    'max_depth': list(range(3, 11)),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

print("Hyperparameter distribution for XGBoost:")
print(param_dist_xgb)

Hyperparameter distribution for XGBoost:
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3], 'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]}


## Perform Random Search for Decision Tree

Execute Random Search Cross-Validation for the Decision Tree Classifier using the defined hyperparameter distributions, `x_train_smote`, `y_train_smote`, and an appropriate scoring metric (e.g., 'accuracy').


In [29]:
# DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=42)

#  RandomizedSearchCV for Decision Tree
random_search_dt = RandomizedSearchCV(
    estimator=dt_classifier,
    param_distributions=param_dist_dt,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit RandomizedSearchCV to the SMOTE-resampled training data
print("Performing Random Search for Decision Tree...")
random_search_dt.fit(x_train_smote, y_train_smote)
print("Random Search for Decision Tree complete.")

# Print the best parameters and best score
print("Best parameters for Decision Tree:", random_search_dt.best_params_)
print("Best cross-validation accuracy for Decision Tree:", random_search_dt.best_score_)
print("-"*70)

Performing Random Search for Decision Tree...
Random Search for Decision Tree complete.
Best parameters for Decision Tree: {'min_samples_split': 3, 'min_samples_leaf': 1, 'max_depth': 11}
Best cross-validation accuracy for Decision Tree: 0.8049879591926091


## Perform Random Search for Random Forest

Execute Random Search Cross-Validation for the Random Forest Classifier using the defined hyperparameter distributions, `x_train_smote`, `y_train_smote`, and an appropriate scoring metric (e.g., 'accuracy').


In [30]:
# RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# RandomizedSearchCV for Random Forest
random_search_rf = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist_rf,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit RandomizedSearchCV to the SMOTE-resampled training data
print("Performing Random Search for Random Forest...")
random_search_rf.fit(x_train_smote, y_train_smote)
print("Random Search for Random Forest complete.")

# Print the best parameters and best score
print("Best parameters for Random Forest:", random_search_rf.best_params_)
print("Best cross-validation accuracy for Random Forest:", random_search_rf.best_score_)

Performing Random Search for Random Forest...
Random Search for Random Forest complete.
Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 18}
Best cross-validation accuracy for Random Forest: 0.8413615599048411


## Perform Random Search for XGBoost

Execute Random Search Cross-Validation for the XGBoost Classifier using the defined hyperparameter distributions, `x_train_smote`, `y_train_smote`, and an appropriate scoring metric (e.g., 'accuracy').



In [31]:
#  XGBClassifier
xgb_classifier = XGBClassifier(random_state=42)

#  RandomizedSearchCV for XGBoost
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_dist_xgb,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit RandomizedSearchCV to the SMOTE-resampled training data
print("Performing Random Search for XGBoost...")
random_search_xgb.fit(x_train_smote, y_train_smote)
print("Random Search for XGBoost complete.")

# Print the best parameters and best score
print("Best parameters for XGBoost:", random_search_xgb.best_params_)
print("Best cross-validation accuracy for XGBoost:", random_search_xgb.best_score_)

Performing Random Search for XGBoost...
Random Search for XGBoost complete.
Best parameters for XGBoost: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.05}
Best cross-validation accuracy for XGBoost: 0.8408803654567478


## Evaluate Best Models from Random Search

Retrieve the best estimator from each Random Search, then evaluate its performance on the `x_test` and `y_test` datasets. This evaluation will include accuracy, confusion matrix, and a classification report for each optimized model.


In [39]:
# Create a dictionary to store the best estimators
best_models = {
    'DecisionTree': random_search_dt.best_estimator_,
    'RandomForest': random_search_rf.best_estimator_,
    'XGBoost': random_search_xgb.best_estimator_
}

print("\n--- Evaluation of Best Models from Random Search ---")

# Iterate through each best model and evaluate its performance
for model_name, best_model in best_models.items():
    print(f"\n--- {model_name} (Optimized) Performance ---")

    # Make predictions on the test set
    y_pred = best_model.predict(x_test)
    y_pred_proba = best_model.predict_proba(x_test)[:, 1] # Probability of the positive class (churn=1)

    # Calculate and print evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("Sample Predicted Probabilities (for Churn=1):")
    print(y_pred_proba[:5]) # Display first 5 probabilities
    print("\n")


--- Evaluation of Best Models from Random Search ---

--- DecisionTree (Optimized) Performance ---
Accuracy: 0.74
Confusion Matrix:
[[829 207]
 [155 218]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      1036
           1       0.51      0.58      0.55       373

    accuracy                           0.74      1409
   macro avg       0.68      0.69      0.68      1409
weighted avg       0.76      0.74      0.75      1409

Sample Predicted Probabilities (for Churn=1):
[1.         0.         0.         0.97679325 0.5       ]



--- RandomForest (Optimized) Performance ---
Accuracy: 0.77
Confusion Matrix:
[[864 172]
 [150 223]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1036
           1       0.56      0.60      0.58       373

    accuracy                           0.77      1409
   macro avg       0.71      0.72      0.71   

**Choosing model**

Save the best performing model (XGBoost) using `pickle` for future use, and then demonstrate how to load this saved model and use it to make predictions on a new sample input.

In [40]:
import pickle

# Access the best performing XGBoost model
best_xgb_model = best_models['XGBoost']

# Define the file path for saving the model
model_filepath = '/content/xgboost_model.pkl'

# Save the model using pickle
with open(model_filepath, 'wb') as file:
    pickle.dump(best_xgb_model, file)

print(f"Best XGBoost model saved to {model_filepath}")

Best XGBoost model saved to /content/xgboost_model.pkl


## Load the Saved Model and Make Predictions

Load the previously saved XGBoost model from the `xgboost_model.pkl` file and use it to make predictions on a new, hypothetical sample input. This demonstrates the model's functionality after persistence.

In [43]:
import pickle

# Define the file path for the saved model
model_filepath = '/content/xgboost_model.pkl'

# Load the model from the file
with open(model_filepath, 'rb') as file:
    loaded_xgb_model = pickle.load(file)

print(f"XGBoost model loaded from {model_filepath}")


# For demonstration, we'll use the first row of the test set as a new sample

new_sample_input = x_test.iloc[0:1]

print("\nNew sample input for prediction:")
print(new_sample_input)

# Make a prediction using the loaded model
prediction = loaded_xgb_model.predict(new_sample_input)
prediction_proba = loaded_xgb_model.predict_proba(new_sample_input)

predicted_class = prediction[0]
probability_of_predicted_class = prediction_proba[0][predicted_class]

print(f"\nPrediction for the new sample input: {predicted_class}")
print(f"Probability of the predicted class ({predicted_class}): {probability_of_predicted_class:.4f}")

print("-"*70)

XGBoost model loaded from /content/xgboost_model.pkl

New sample input for prediction:
     gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
185       0              0        1           0       1             0   

     MultipleLines  InternetService  OnlineSecurity  OnlineBackup  \
185              1                0               0             0   

     DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
185                 0            0            0                0         0   

     PaperlessBilling  PaymentMethod  MonthlyCharges  TotalCharges  
185                 1              2            24.8          24.8  

Prediction for the new sample input: 1
Probability of the predicted class (1): 0.9409
----------------------------------------------------------------------


**performs well and hence project comes to an end with developed prediction system**

### **Final summary**

Among the models evaluated after hyperparameter tuning,

*   XGBoost performed the best on the test data with an accuracy of **0.79**.
*    It was followed by Random Forest with an accuracy of **0.77**.

*   the Decision Tree model with an accuracy of **0.74**.




  