In [None]:
# For data manipulation
import pandas as pd
import numpy as np

# For model training and evaluation
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
import pydotplus
%matplotlib inline

# For Google Colab integration
import os
from google.colab import drive



# Mount Google Drive at /content/drive so files stored under MyDrive can be read (Colab only)
from google.colab import drive
drive.mount('/content/drive')


# Read the Data and Check the Stats

In [None]:
# Load the churn CSV from the mounted Drive folder into a DataFrame
file_path = '/content/drive/MyDrive/Infor648/Data/churn.csv'
df = pd.read_csv(file_path)

# Preview the first five rows
df.head(n=5)

In [None]:
# Descriptive statistics of the loaded data, kept in df_summary and displayed
df_summary = df.describe()
df_summary

# Remove Rows with a Missing Value

In [None]:
# Keep only the columns used in this analysis; "Customer Status" is the target
df = df[[
    "Gender",
    "Age",
    "Married",
    "Number of Dependents",
    "Tenure in Months",
    "Monthly Charge",
    "Total Charges",
    "Total Refunds",
    "Total Revenue",
    "Customer Status",
]]

In [None]:
# Count the missing values in every column before any row is dropped
display(df.isnull().sum())

In [None]:
# Drop every row that has a missing value in at least one column
df = df.dropna(axis=0, how="any")

In [None]:
# Re-count the missing values per column after the rows were dropped
display(df.isnull().sum())

# Check how many categorical/numeric variables we have

In [None]:
# Numeric variables: every column whose dtype is not "object", excluding our
# target variable "Customer Status".
# The target is excluded with an exact name comparison (`!=`). A membership
# test against the string (`col not in "Customer Status"`) would be a
# substring check and would also drop any column whose name happens to be a
# fragment of "Customer Status".
numeric_variables = [
    col
    for col in df.columns
    if df[col].dtype != "object" and col != "Customer Status"
]
numeric_variables

In [None]:
# Categorical variables: the object-dtype columns, leaving out our target "Customer Status"
categorical_variables = [
    col
    for col in df.columns
    if col != "Customer Status" and df[col].dtype == "O"
]
categorical_variables

In [None]:
# Look at the two categorical feature columns side by side
df.loc[:, ['Gender', 'Married']]

# Select the variables we are interested in

In [None]:
# Work on an explicit copy of the columns of interest. The following cells
# assign encoded values into df_sub; without .copy() pandas treats df_sub as a
# slice of df and emits SettingWithCopyWarning on those assignments.
df_sub = df[[
    "Gender",
    "Age",
    "Married",
    "Number of Dependents",
    "Tenure in Months",
    "Monthly Charge",
    "Total Charges",
    "Total Refunds",
    "Total Revenue",
    "Customer Status",
]].copy()

# Encode our categorical data

In [None]:
## Encode the categorical "Gender" column as integer codes
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_sub['Gender'] = label_encoder.fit_transform(df_sub['Gender'])

# Pair every original label with the integer code the encoder assigned to it
mapping = {
    label: code
    for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Print out what we encoded for gender
print("Gender Encoding:")
print(mapping)

In [None]:
# Encode "Married" with the same encoder object, re-fitted on this column
df_sub['Married'] = label_encoder.fit_transform(df_sub['Married'])

# Pair every original label with the integer code the encoder assigned to it
mapping = {
    label: code
    for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Print out what we encoded for married
print("Married Encoding:")
print(mapping)

# Now encode our target variable

In [None]:
# Show how many rows fall into each "Customer Status" value before encoding
display(df_sub['Customer Status'].value_counts())
## Our target variable is a categorical variable

In [None]:
#### We are only interested in why people stayed and churned,
#### so we drop all the new customers (status 'Joined').
# .copy() gives df_sub its own data: the column assignment below would
# otherwise write into a filtered slice and trigger pandas'
# SettingWithCopyWarning.
df_sub = df_sub[df_sub['Customer Status'] != 'Joined'].copy()

#### Encode our target variable with a dedicated encoder, so the original
#### class labels can be recovered later
target_label_encoder = LabelEncoder()
df_sub['Customer Status'] = target_label_encoder.fit_transform(df_sub['Customer Status'])


## Display the class counts after encoding, then the label -> code mapping
display(df_sub['Customer Status'].value_counts())
mapping = dict(zip(target_label_encoder.classes_, target_label_encoder.transform(target_label_encoder.classes_)))
print(mapping)

# Define features (X) and target (y)

In [None]:
from sklearn.preprocessing import StandardScaler

# Target vector: the encoded customer status. Feature matrix: every other column.
y = df_sub['Customer Status']
X = df_sub.drop(columns='Customer Status')

feature_names = list(X.columns)

# Original class labels, ordered by their integer code (0, 1, ...)
class_names = target_label_encoder.inverse_transform(np.arange(len(target_label_encoder.classes_)))

## Print out the features we selected for predictions and our classification target
print("features:",feature_names)
print("Classes:", class_names)


In [None]:
# Check our feature scale: display the feature matrix to compare the value ranges of its columns
X

# Standardizing the independent variable features to ensure they have the same scale

In [None]:
# Split the dataset into training and testing sets (70% training, 30% testing)
# BEFORE scaling, so the test rows never influence the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize the independent variables: the mean and standard deviation are
# learned from the training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep the full feature matrix standardized as well (with the training
# statistics), because later cells cross-validate on X directly.
X = scaler.transform(X)

# Train our KNN model

In [None]:
### We choose n = 3 neighbors and the euclidean distance metric
knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=3).fit(X_train, y_train)

# Predicted class for every row of the held-out test set
y_pred = knn_model.predict(X_test)

# Check our trained knn
knn_model

# Evaluate our KNN model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One (metric name, value) pair per row of the evaluation table.
# NOTE(review): recall / precision / F1 use scikit-learn's default pos_label=1,
# so they describe whichever class the target encoder mapped to 1 -- confirm
# against the printed mapping that this is the class of interest.
metric_rows = [
    ("Train Accuracy", knn_model.score(X_train, y_train)),
    ("Test Accuracy", accuracy_score(y_test, y_pred)),
    ("Recall", recall_score(y_test, y_pred)),
    ("Precision", precision_score(y_test, y_pred)),
    ("F1 Score", f1_score(y_test, y_pred)),
]
evaluation_metrics = pd.DataFrame(metric_rows, columns=["Evaluation Metric", "Value"])

# Display the DataFrame
evaluation_metrics


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap, labelled with the original class names
fig, ax = plt.subplots(figsize=(8, 6))  # Adjust figure size if needed
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted Class')
ax.set_title('Confusion Matrix for KNN Model')
plt.show()


In [None]:
from yellowbrick.classifier import ClassificationReport

# Classification report visualizer for the k-NN model, labelled with the
# original class names and with the support column switched off
visualizer = ClassificationReport(
    knn_model,
    title="KNN Classifier Evaluation",
    classes=class_names,
    support=False,
)

visualizer.fit(X_train, y_train)   # fit on the training split
visualizer.score(X_test, y_test)   # score on the test split

# Display the plot
visualizer.show()


# Performing cross-validation on the entire dataset to provide a general evaluation of the model's performance

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Same settings as our original model (n = 3, euclidean distance), but wrapped
# in a pipeline with its own scaler: every CV split re-learns the scaling from
# its training folds only, so the held-out fold never leaks into the
# preprocessing. Re-standardizing columns that are already standardized is
# harmless, so this works whether X is raw or pre-scaled.
CV_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
)

# Cross-validate with 10 folds: each row is predicted by a model fitted on the other folds
y_pred_cross = cross_val_predict(CV_model, X, y, cv=10)

# Compute evaluation metrics for cross-validation
accuracy_cv = accuracy_score(y, y_pred_cross)
recall_cv = recall_score(y, y_pred_cross)
precision_cv = precision_score(y, y_pred_cross)
f1_cv = f1_score(y, y_pred_cross)
matrix_cv = confusion_matrix(y, y_pred_cross)


# Create DataFrame for evaluation metrics with cross-validation
evaluation_metrics_with_cv = pd.DataFrame({
    "Evaluation Metric_CV": ["Accuracy", "Recall", "Precision", "F1 Score"],
    "Value": [
        accuracy_cv,
        recall_cv,
        precision_cv,
        f1_cv
    ]
})



# Display the cross-validated metrics next to the single train/test split results
print("Performance Metrics With Cross-Validation:")
display(evaluation_metrics_with_cv)
print('\nConfusion Matrix with CV:\n', matrix_cv)

print("\nPerformance Metrics Without Cross-Validation:")
display(evaluation_metrics)
print('\nConfusion Matrix Without CV:\n', conf_matrix)


#### Note: k-NN does not provide feature importances because it relies purely on distance calculations between data points.

#### Unlike decision trees, k-NN does not use feature splits or calculate information gain to make predictions.


# Hyperparameter Fine-tuning (choosing the best number of neighbors, n)

In [None]:
n_neighbors_range = range(1, 16)  # Candidate numbers of neighbors to test; adjust as needed
train_accuracies = []
cv_test_accuracies = []

# Evaluate one model per candidate number of neighbors
for n in n_neighbors_range:
    hyperFT_model = KNeighborsClassifier(n_neighbors=n, metric='euclidean')

    # Mean accuracy of a 5-fold cross-validation on the training set
    cv_scores = cross_val_score(hyperFT_model, X_train, y_train, cv=5, scoring='accuracy')
    cv_test_accuracies.append(cv_scores.mean())

    # Accuracy on the same rows the model was fitted on
    hyperFT_model.fit(X_train, y_train)
    train_accuracies.append(hyperFT_model.score(X_train, y_train))

# Plot both accuracy curves against the number of neighbors
fig, ax = plt.subplots(figsize=(9, 5))  # Adjust figure size as needed
ax.plot(n_neighbors_range, train_accuracies, label='Train Accuracy', marker='o', color='blue')
ax.plot(n_neighbors_range, cv_test_accuracies, label='Mean Cross-Validated Test Accuracy', marker='o', color='green')

ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Accuracy')
ax.set_title('Performance of k-NN with Varying Number of Neighbors')
ax.legend(loc='upper right')
ax.grid(True)
plt.show()


# Displaying the best hyperparameter combinations based on mean CV accuracy

In [None]:

from sklearn.model_selection import GridSearchCV

hyperBC_model = KNeighborsClassifier()

# Hyperparameter values to try; every combination is evaluated
parameters = {
    'n_neighbors': [8, 11, 13, 15],      # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # How the neighbors' votes are weighted
    'metric': ['euclidean'],             # Distance metric between data points
}

# Grid search scored by accuracy with 5-fold cross-validation, fitted on the training data
grid_knn = GridSearchCV(hyperBC_model, param_grid=parameters, cv=5, scoring='accuracy')
grid_knn.fit(X_train, y_train)

# One row per hyperparameter combination with its mean CV score, best first
result = (
    pd.DataFrame(grid_knn.cv_results_['params'])
    .assign(mean_CV_test_score=grid_knn.cv_results_['mean_test_score'])
    .sort_values(by='mean_CV_test_score', ascending=False)
)

# Display the sorted DataFrame of hyperparameter combinations and their CV scores
result


# Now let's train the model with the best combination of hyperparameters

In [None]:
# Train the tuned k-NN model with the best hyperparameters.
# They are read from the fitted grid search (grid_knn.best_params_) instead of
# being copied in by hand, so this cell cannot go stale when the data, the
# split or the parameter grid changes.
tuned_knn_model = KNeighborsClassifier(**grid_knn.best_params_)
tuned_knn_model.fit(X_train, y_train)

# Predictions of the tuned model on the held-out test set
y_pred_tuned = tuned_knn_model.predict(X_test)

tuned_knn_model

# Compare Performance: Original vs Tuned KNN

In [None]:
def _test_scores(predictions):
    """Return (accuracy, precision, recall, F1) of `predictions` against y_test."""
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
        f1_score(y_test, predictions),
    )


# Performance metrics for our original model
train_accuracy_original = knn_model.score(X_train, y_train)
test_accuracy_original, precision_original, recall_original, f1_original = _test_scores(y_pred)
confusion_matrix_original = confusion_matrix(y_test, y_pred)

# Performance metrics for the tuned k-NN model
train_accuracy_tuned = tuned_knn_model.score(X_train, y_train)
test_accuracy_tuned, precision_tuned, recall_tuned, f1_tuned = _test_scores(y_pred_tuned)
confusion_matrix_tuned = confusion_matrix(y_test, y_pred_tuned)

# Comparison table: one row per metric, one column per model
comparison_df = pd.DataFrame({
    'Evaluation Metric': ['Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Original k-NN': [
        train_accuracy_original,
        test_accuracy_original,
        precision_original,
        recall_original,
        f1_original,
    ],
    'Tuned k-NN': [
        train_accuracy_tuned,
        test_accuracy_tuned,
        precision_tuned,
        recall_tuned,
        f1_tuned,
    ],
})

# Show floating point numbers with four decimals
pd.set_option("display.precision", 4)

# Display the comparison table
display(comparison_df)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # Adjust figure size as needed

# Left panel: original model. Right panel: tuned model. Both are drawn the same way.
panels = [
    (axes[0], confusion_matrix_original, 'Original k-NN'),
    (axes[1], confusion_matrix_tuned, 'Tuned k-NN'),
]
for ax, matrix, title in panels:
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Predicted Class', fontsize=12)

# Only the left panel carries the y-axis label; the right one is left blank
axes[0].set_ylabel('True Class', fontsize=12)
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()