### Import Libraries

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib


### Load the Preprocessed Dataset

In [2]:


# Load the processed dataset
# NOTE(review): this is an absolute path on the original author's machine; it will
# not resolve elsewhere. Point it at your local copy of the CSV (ideally via a
# path relative to the notebook) before running.
df = pd.read_csv("C:/project/Telco_Churn_Predictor/notebooks/telco_customer_churn_processed.csv")

# Display the first few rows of the DataFrame
print(df.head())

# Display the DataFrame info.
# DataFrame.info() writes its summary directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
df.info()


   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  Charges_per_Month  \
0              0 -1.277445       -1.160323     -0.992611          -1.157889   
1              0  0.066327       -0.259629     -0.172165          -0.305658   
2              0 -1.236724       -0.362660     -0.958066          -0.355305   
3              0  0.514251       -0.746535     -0.193672          -0.791614   
4              0 -1.236724        0.197365     -0.938874           0.365282   

   gender_Male  Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0        False         True           False             False   
1         True        False           False              True   
2         True        False           False              True   
3         True        False           False             False   
4        False        False           False              True   

   MultipleLines_No phone service  ...  Contract_One year  Contract_Two year  \
0                            True  ...              Fa

This initial step ensures all necessary libraries are imported and that the dataset is loaded correctly. The info() method provides insights into the structure and types of features available for modeling.



### Convert Categorical Variables to Numerical

In [3]:
# Convert 'Tenure_Category' to numerical using one-hot encoding.
# drop_first=True keeps k-1 indicator columns for the k category levels.
# The result is bound to a new name so the loaded frame `df` stays untouched and
# this cell can be re-run safely.
df_encoded = pd.get_dummies(df, columns=['Tenure_Category'], drop_first=True)

# Display the new DataFrame info.
# info() prints its summary itself and returns None, so it is not wrapped in
# print() (that would append a stray "None" line to the output).
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 34 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   float64
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Charges_per_Month                      7043 non-null   float64
 5   gender_Male                            7043 non-null   bool   
 6   Partner_Yes                            7043 non-null   bool   
 7   Dependents_Yes                         7043 non-null   bool   
 8   PhoneService_Yes                       7043 non-null   bool   
 9   MultipleLines_No phone service         7043 non-null   bool   
 10  MultipleLines_Yes                      7043 non-null   bool   
 11  Inte

Converting categorical variables into numerical format prepares the dataset for modeling. Ensure that all relevant variables are now numerical.



### Define Features and Target Variable

In [4]:
# Separate the label from the predictors: 'Churn_Yes' is the target and every
# other column of the encoded frame is used as a feature.
y = df_encoded['Churn_Yes']  # Target variable
X = df_encoded.drop('Churn_Yes', axis=1)  # All remaining columns

# Report the dimensions of both pieces as a quick sanity check
print(f"Features shape: {X.shape}, Target shape: {y.shape}")


Features shape: (7043, 33), Target shape: (7043,)


This step confirms that the features and target variable are separated correctly, with the expected shapes for training and testing.



### Train-Test Split

In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each split
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (5634, 33), y_train shape: (5634,)
X_test shape: (1409, 33), y_test shape: (1409,)


The dataset is split into training and testing sets, maintaining the same distribution of the target variable, which is critical for evaluating model performance accurately.



### Initialize Models

In [6]:
# Initialize the baseline models, keyed by a display name.
# Random Forest and Decision Tree are stochastic (bootstrap sampling / random
# feature ordering at splits), so they get a fixed random_state; without it the
# baseline metrics change on every Restart & Run All. The seed matches the one
# used for the train/test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier()
}


A set of models is initialized for systematic evaluation. This will allow for an organized approach to training and testing different algorithms.



### Train and Evaluate Each Model Without Hyperparameter Tuning


In [7]:
# Baseline (untuned) results, keyed by model name
results = {}

for model_name, model in models.items():
    # Fit on the training split and predict the held-out split in one chain
    # (fit() returns the fitted estimator itself).
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Score the predictions: dict-form report for storage, matrix for inspection
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = report['accuracy']

    # Keep everything so later cells can compare against the tuned models
    results[model_name] = {
        'Confusion Matrix': cm,
        'Classification Report': report,
        'Accuracy': accuracy,
    }

    # Human-readable summary for this model
    print(f"Model: {model_name}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}\n")


Model: Logistic Regression
Confusion Matrix:
 [[921 114]
 [166 208]]

Classification Report:
               precision    recall  f1-score   support

       False       0.85      0.89      0.87      1035
        True       0.65      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

Accuracy: 0.8013

Model: Random Forest
Confusion Matrix:
 [[929 106]
 [188 186]]

Classification Report:
               precision    recall  f1-score   support

       False       0.83      0.90      0.86      1035
        True       0.64      0.50      0.56       374

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409

Accuracy: 0.7913

Model: Support Vector Machine
Confusion Matrix:
 [[940  95]
 [194 180]]

Classification Report:
               precision    reca

Each model is trained and evaluated, allowing us to see initial performance metrics. This provides a baseline to compare against after hyperparameter tuning.



### Hyperparameter Tuning Using Grid Search

In [8]:
# Define parameter grid for Logistic Regression:
# C is the inverse regularization strength; both candidate solvers are tried.
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Initialize Grid Search for Logistic Regression (5-fold CV, scored on accuracy).
# Both 'liblinear' and 'saga' shuffle the data using random_state, so it is fixed
# here to make the search, and therefore the selected parameters, reproducible.
lr_grid = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    lr_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
lr_grid.fit(X_train, y_train)

# Best parameters and mean cross-validated score
print("Best parameters for Logistic Regression:", lr_grid.best_params_)
print("Best accuracy for Logistic Regression:", lr_grid.best_score_)


Best parameters for Logistic Regression: {'C': 100, 'solver': 'liblinear'}
Best accuracy for Logistic Regression: 0.8047547600397793


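Grid Search identifies the best-scoring parameters for Logistic Regression under 5-fold cross-validation. Note that this cross-validated accuracy is computed on the training data only, so it is not directly comparable to the baseline's test-set accuracy.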

### Random Forest Hyperparameter Tuning

In [9]:
# Define parameter grid for Random Forest:
# number of trees, maximum tree depth (None = unlimited) and the minimum number
# of samples required to split a node.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize Grid Search for Random Forest (5-fold CV, scored on accuracy).
# The forest is seeded: without random_state every run grows different trees, so
# the "best" parameters and the estimator saved later would change from run to
# run, making the reported comparison irreproducible.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Best parameters and mean cross-validated score
print("Best parameters for Random Forest:", rf_grid.best_params_)
print("Best accuracy for Random Forest:", rf_grid.best_score_)


Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Best accuracy for Random Forest: 0.803334746517342


This step tunes the Random Forest hyperparameters, selecting the combination with the highest cross-validated accuracy and yielding a model better tailored to the data.



### Support Vector Machine Hyperparameter Tuning

In [10]:
# Candidate settings for the SVM: regularization strength C and kernel type
svm_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# Exhaustive 5-fold cross-validated search over the grid, scored on accuracy
# and parallelized across all available cores.
svm_grid = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
svm_grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters for Support Vector Machine:", svm_grid.best_params_)
print("Best accuracy for Support Vector Machine:", svm_grid.best_score_)


Best parameters for Support Vector Machine: {'C': 1, 'kernel': 'rbf'}
Best accuracy for Support Vector Machine: 0.8054665004468078


Hyperparameter tuning for SVM allows for greater flexibility and performance enhancement. The model's parameters are fine-tuned to suit the dataset better.



### Decision Tree Hyperparameter Tuning


In [11]:
# Define parameter grid for Decision Tree:
# maximum depth (None = unlimited) and minimum samples required to split a node.
dt_param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Grid Search for Decision Tree (5-fold CV, scored on accuracy).
# The tree is seeded because scikit-learn permutes features randomly at each
# split, so ties between equally good splits can otherwise resolve differently
# on each run and change the search outcome.
dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# Best parameters and mean cross-validated score
print("Best parameters for Decision Tree:", dt_grid.best_params_)
print("Best accuracy for Decision Tree:", dt_grid.best_score_)


Best parameters for Decision Tree: {'max_depth': 5, 'min_samples_split': 2}
Best accuracy for Decision Tree: 0.7841648791727673


Decision Tree hyperparameters are optimized to prevent overfitting while ensuring that the model retains its ability to learn from the data effectively.



### K-Nearest Neighbors Hyperparameter Tuning


In [12]:
# Candidate settings for KNN: neighborhood size and how neighbors are weighted
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
}

# Exhaustive 5-fold cross-validated search over the grid, scored on accuracy
# and parallelized across all available cores.
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
knn_grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters for K-Nearest Neighbors:", knn_grid.best_params_)
print("Best accuracy for K-Nearest Neighbors:", knn_grid.best_score_)


Best parameters for K-Nearest Neighbors: {'n_neighbors': 7, 'weights': 'uniform'}
Best accuracy for K-Nearest Neighbors: 0.7818575542040123


Hyperparameter tuning for KNN optimizes the number of neighbors and the weighting scheme, both of which can significantly impact model performance.

### Compare Best Model Performances

In [13]:
# Map each display name to its finished grid search, then pull out the tuned
# estimator from each one.
tuned_searches = {
    "Logistic Regression": lr_grid,
    "Random Forest": rf_grid,
    "Support Vector Machine": svm_grid,
    "Decision Tree": dt_grid,
    "K-Nearest Neighbors": knn_grid,
}
best_models = {
    name: search.best_estimator_ for name, search in tuned_searches.items()
}

# Test-set metrics for every tuned model, keyed by model name
final_results = {}

for model_name, model in best_models.items():
    # Predict the held-out split with the tuned estimator
    y_pred = model.predict(X_test)

    # Dict-form report so the accuracy can be read back programmatically
    report = classification_report(y_test, y_pred, output_dict=True)

    final_results[model_name] = {
        'Accuracy': report['accuracy'],
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
        'Classification Report': report,
    }

# One row per model: name and test-set accuracy, in insertion order
accuracy_df = pd.DataFrame(
    [(name, entry['Accuracy']) for name, entry in final_results.items()],
    columns=['Model', 'Accuracy'],
)

print(accuracy_df)


                    Model  Accuracy
0     Logistic Regression  0.799148
1           Random Forest  0.799858
2  Support Vector Machine  0.794890
3           Decision Tree  0.787793
4     K-Nearest Neighbors  0.777147


The final comparison of model performances allows us to identify which algorithm performed the best after hyperparameter tuning. This aids in selecting the most suitable model for deployment.



### Save the Best Model

In [14]:
# Persist the tuned Random Forest (the estimator chosen here as the best
# performer) so it can be reloaded later without retraining.
best_model = rf_grid.best_estimator_
joblib.dump(value=best_model, filename='best_random_forest_model.pkl')
print("Best model saved successfully.")


Best model saved successfully.


The best-performing model is saved for future use, ensuring that it can be deployed or used for predictions without needing to retrain.

