In [None]:
# Jeffery Smith

In [15]:
# --- imports for KNN Model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [26]:
# Note: this cell was heavily refactored with the help of ChatGPT.

# --- helper ---
def accuracy(k, X_train, y_train, X_test, y_test):
    """
    Fit a fresh k-nearest-neighbours classifier and score it on held-out data.

    Parameters
    ----------
    k : int
        Number of neighbours, forwarded as ``n_neighbors``.
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features to predict on and the true labels to compare against.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``, i.e. the fraction
    of test rows whose predicted label matches the true label.
    """
    # A new estimator is built on every call, so nothing is shared between calls.
    fitted = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return accuracy_score(y_test, fitted.predict(X_test))

# --- Load training data ---
train_df = pd.read_csv('train.csv')

# Encode churn column as binary (robust to capitalization/whitespace).
# Any label other than "yes"/"no" is not in the mapping and becomes NaN; this
# includes missing values, which astype(str) turns into the string "nan".
# Such rows are removed by the dropna() below.
train_df["churn_encoded"] = (
    train_df["churn"].astype(str).str.strip().str.lower().map({"no": 0, "yes": 1})
)

# Features (predictors) — coerce to numeric so stray text becomes NaN instead
# of leaving an object-dtype column behind
feature_cols = [
    'total_day_minutes', 'total_eve_minutes', 'total_night_minutes',
    'total_intl_minutes', 'total_day_calls', 'total_eve_calls',
    'total_night_calls', 'number_customer_service_calls'
]
X = train_df[feature_cols].apply(pd.to_numeric, errors='coerce')

# Target (0/1)
y = pd.to_numeric(train_df['churn_encoded'], errors='coerce')

# Drop any rows with missing values in X or y to keep train/test split happy
clean = pd.concat([X, y.rename('churn_encoded')], axis=1).dropna()

# The coercions above turn bad values into NaN and dropna() then discards the
# rows without a trace. Make that data loss visible, and fail with a clear
# message when nothing usable is left rather than with an unrelated error
# from the split further down.
n_total = len(train_df)
n_dropped = n_total - len(clean)
if clean.empty:
    raise ValueError(
        f"No usable rows remain after cleaning: all {n_total} rows had a "
        "missing/non-numeric feature or a churn label other than yes/no."
    )
if n_dropped:
    print(
        f"Warning: dropped {n_dropped} of {n_total} rows with missing or "
        "unrecognized values."
    )

X = clean[feature_cols]
y = clean['churn_encoded'].astype(int)

# --- Split the data into training and testing sets ---
# Use stratify to preserve churn class balance when possible: stratification
# is only requested when there are at least two classes and every class has
# at least two rows; otherwise fall back to a plain random split.
classes, counts = np.unique(y, return_counts=True)
stratify_arg = y if (len(classes) > 1 and counts.min() >= 2) else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_arg
)

# --- Train the KNN model ---
# NOTE(review): KNN is distance-based and the features (minutes vs. call
# counts) appear to be on different scales, so the larger-valued columns
# likely dominate the distance. Consider standardizing the features before
# fitting; left unchanged here because it would alter the reported metrics.
k_value = 5
knn_model = KNeighborsClassifier(n_neighbors=k_value)
knn_model.fit(X_train, y_train)

# --- Make predictions ---
y_pred = knn_model.predict(X_test)

# --- Call the accuracy function (fits a fresh model for the same split) ---
test_accuracy = accuracy(k_value, X_train, y_train, X_test, y_test)

# --- Evaluate the model ---
# Both accuracy lines should agree: same k, same split, same data.
print(f"Accuracy (via accuracy()): {test_accuracy:.3f}")
print(f"Accuracy (via y_pred):     {accuracy_score(y_test, y_pred):.3f}")
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=3))


Accuracy (via accuracy()): 0.872
Accuracy (via y_pred):     0.872

Confusion matrix:
[[706  24]
 [ 85  35]]

Classification report:
              precision    recall  f1-score   support

           0      0.893     0.967     0.928       730
           1      0.593     0.292     0.391       120

    accuracy                          0.872       850
   macro avg      0.743     0.629     0.660       850
weighted avg      0.850     0.872     0.852       850

