Jeffery Smith

k-Nearest Neighbors Models

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, roc_auc_score, RocCurveDisplay

In [16]:
# Accuracy test for k, showing how the k = 7 parameter used going forward was chosen.

def accuracy(k, X_train, y_train, X_test, y_test):
    """Return the test-set accuracy of a k-nearest-neighbors classifier.

    A new ``KNeighborsClassifier`` with ``n_neighbors=k`` is fitted on
    ``(X_train, y_train)``, used to predict labels for ``X_test``, and the
    predictions are scored against ``y_test`` with ``accuracy_score``.

    Parameters
    ----------
    k : value passed to the classifier as ``n_neighbors``.
    X_train, y_train : features and labels the classifier is fitted on.
    X_test, y_test : features to predict on and the true labels to score against.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, predictions)``.
    """
    # fit() returns the fitted estimator, so construction and training chain.
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)


# 1. Load the dataset
train_df = pd.read_csv('train.csv')


# Encode the churn column as binary (no -> 0, yes -> 1)
train_df["churn_encoded"] = train_df["churn"].map({"no": 0, "yes": 1})


# Features (predictors). Bound to uppercase `X` because that is the name
# passed to train_test_split() below.
X = train_df[['total_day_minutes', 'total_eve_minutes', 'total_night_minutes',
              'total_intl_minutes', 'total_day_calls', 'total_eve_calls',
              'total_night_calls', 'number_customer_service_calls']]

# Target of what we are trying to predict
y = train_df['churn_encoded']  # Targeting the 1:0 yes:no encoding



# 2. Split the data into training and testing sets
#    (20% of rows held out for testing; fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# 3. Call the accuracy function with a chosen value for k
k_value = 7
test_accuracy = accuracy(k_value, X_train, y_train, X_test, y_test)

# 4. Print the result
print(f"The accuracy for k = {k_value} is: {test_accuracy:.2f}")

The accuracy for k = 7 is: 0.88


In [14]:
# Sanity check (code provided by Google)
# --- Example of finding the best k ---

# Reuse accuracy() to score every candidate k from 1 through 9 on the
# same train/test split, keeping the results keyed by k.
k_range = range(1, 10)
scores = {}
for k in k_range:
    scores[k] = accuracy(k, X_train, y_train, X_test, y_test)

# Pick the (k, score) pair with the highest score and keep its k.
# On a tie, max() returns the first maximal pair, i.e. the smallest such k.
best_k = max(scores.items(), key=lambda entry: entry[1])[0]
print(f"\nThe best k value is {best_k} with an accuracy of {scores[best_k]:.2f}")


The best k value is 7 with an accuracy of 0.88


In [9]:
# K-Nearest Neighbors (Euclidean)
# Using Classifier KNN for churn_encoded

# Function that fits a KNN classifier on the training data and returns its accuracy on the test data for a given k.
def accuracy(k, X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbors classifier and return its accuracy on the test data.

    The classifier is created with ``n_neighbors=k``, trained on
    ``(X_train, y_train)``, and its predictions for ``X_test`` are compared
    with ``y_test`` via ``accuracy_score``.

    Parameters
    ----------
    k : value passed to ``KNeighborsClassifier`` as ``n_neighbors``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and the true labels used for scoring.

    Returns
    -------
    The result of ``accuracy_score(y_test, predicted)``.
    """
    # NOTE(review): an identical `accuracy` helper is defined in an earlier
    # cell; consider keeping a single definition.
    fitted = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    return accuracy_score(y_test, predicted)


# Read the training data and derive a numeric churn label (no -> 0, yes -> 1)
train_df = pd.read_csv('train.csv')
churn_labels = {"no": 0, "yes": 1}
train_df["churn_encoded"] = train_df["churn"].map(churn_labels)

# Predictor columns: usage minutes, call counts, and customer-service calls
feature_columns = [
    'total_day_minutes', 'total_eve_minutes', 'total_night_minutes',
    'total_intl_minutes', 'total_day_calls', 'total_eve_calls',
    'total_night_calls', 'number_customer_service_calls',
]
X = train_df[feature_columns]

# Target: the 1/0 encoding of the yes/no churn column
y = train_df['churn_encoded']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20
)

# Fit the classifier with k = 7 neighbors (fit() returns the fitted estimator)
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict churn labels for the held-out rows
y_pred = knn_model.predict(X_test)

# Report accuracy, the confusion matrix, and per-class precision/recall/F1
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.885
[[707  15]
 [ 83  45]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       722
           1       0.75      0.35      0.48       128

    accuracy                           0.88       850
   macro avg       0.82      0.67      0.71       850
weighted avg       0.87      0.88      0.87       850

