
# Simple KNN Classification + KD-Tree (Video Game Sales)

This notebook is **minimal** and meant for **Google Colab**; outside Colab it reads `vgsales.csv` from a local path instead.  
In Colab you will upload `vgsales.csv`, then we will:
1. Clean and prepare the data (simple scaling + one-hot for `Platform`)
2. Train a KNN **classifier** for `Genre`
3. Plot **confusion matrix**, **genre distribution**, and a **validation curve** for different `k`
4. Build a **KD-Tree** on the (scaled+encoded) training features and inspect neighbors
5. Compare KNN backends (`brute`, `kd_tree`, `ball_tree`, `auto`) with a simple timing and **bar plots**

> No regression code is included.


In [None]:

# Basic imports (no seaborn; only matplotlib for plots)
import io, time, math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KDTree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Colab detection: the upload helper is imported here; if that import fails
# for any reason, the notebook treats itself as running outside Colab.
try:
    from google.colab import files
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

# Reproducibility: seed both the stdlib and NumPy global generators
random.seed(7)
np.random.seed(7)

print("Ready. In Colab:", IN_COLAB)


In [None]:

# === Upload the CSV ===
if IN_COLAB:
    print("Please upload vgsales.csv")
    uploaded = files.upload()  # choose the CSV; used below as {filename: bytes}
    if not uploaded:
        # Nothing was selected: fail with an actionable message instead of the
        # bare StopIteration that next(iter(...)) would raise on an empty result.
        raise RuntimeError("No file was uploaded. Re-run this cell and choose vgsales.csv.")
    csv_name = next(iter(uploaded))  # only the first uploaded file is read
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
else:
    # If not in Colab, set your path here
    CSV_PATH = "vgsales.csv"  # change if needed
    df = pd.read_csv(CSV_PATH)

# Quick sanity check of what was loaded
print("Loaded shape:", df.shape)
print(df.head(3))


In [None]:

# === Simple cleaning ===
# Keep only the columns used later and drop every row that has a missing
# value in any of them.
need = ["Year", "Genre", "Platform", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
df = df.loc[:, need].dropna(how="any").copy()

def to_int_ok(x):
    """Convert ``x`` to an int (going through float), or return NaN on failure.

    Parameters
    ----------
    x : any value accepted by ``float()`` (numbers, numeric strings, ...).

    Returns
    -------
    int
        ``int(float(x))`` when the conversion succeeds (fractions are truncated).
    float
        ``np.nan`` when ``x`` cannot be converted.
    """
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-numeric objects such as None.
        # ValueError: unparseable strings, and NaN passed to int().
        # OverflowError: +/- infinity passed to int().
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

# Coerce Year to integers: unparseable entries become NaN, those rows are
# dropped, and the remaining values are cast to int.
parsed_year = df["Year"].apply(to_int_ok)
df["Year"] = parsed_year
df = df.dropna(subset=["Year"])
df["Year"] = df["Year"].astype(int)

# Optional clip: keep years within 1980..2020 (both ends inclusive)
year_in_range = df["Year"].between(1980, 2020)
df = df[year_in_range]
print("After cleaning:", df.shape)


## Genre Distribution

In [None]:

# Number of rows per genre, most frequent first (value_counts default order)
genre_counts = df["Genre"].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(genre_counts.index, genre_counts.values)
# `ax` is the current axes, so the pyplot call below targets its tick labels
plt.xticks(rotation=45, ha="right")
ax.set_title("Genre Distribution")
fig.tight_layout()
plt.show()


## Train/Test Split and Simple Preprocessing

In [None]:

num_cols = ["Year", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
cat_col  = "Platform"
y = df["Genre"].copy()

# Stratified 75/25 split performed on the row index, so the feature frames and
# the label series are selected with the same index values.
idx_train, idx_test = train_test_split(df.index, test_size=0.25, stratify=y, random_state=42)
train_df, test_df = df.loc[idx_train].copy(), df.loc[idx_test].copy()
y_train, y_test   = y.loc[idx_train].copy(), y.loc[idx_test].copy()

# Scale numeric features (fit on train only)
scaler = StandardScaler().fit(train_df[num_cols])
Xtr_num = scaler.transform(train_df[num_cols])
Xte_num = scaler.transform(test_df[num_cols])

# One-hot for Platform (train, then align test).
# dtype=float keeps the encoded block numeric: newer pandas otherwise emits bool
# columns, and reindexing those with an integer fill can turn `.values` into an
# object array when the test split is missing a platform seen in training.
train_cat = pd.get_dummies(train_df[[cat_col]].astype(str), drop_first=False, dtype=float)
test_cat  = pd.get_dummies(test_df[[cat_col]].astype(str),  drop_first=False, dtype=float)
# Platforms seen only in test are dropped; platforms missing from test become all-zero columns
test_cat  = test_cat.reindex(columns=train_cat.columns, fill_value=0.0)

# Concatenate scaled numeric columns with the one-hot columns
X_train = np.hstack([Xtr_num, train_cat.values])
X_test  = np.hstack([Xte_num,  test_cat.values])

# Both matrices must expose the same feature columns for KNN / KD-Tree queries
assert X_train.shape[1] == X_test.shape[1], "train/test feature columns are misaligned"

print("X_train:", X_train.shape, "| X_test:", X_test.shape)


## KNN Classification (k=11, distance weights)

In [None]:

# Distance-weighted 11-NN with Euclidean distance (Minkowski, p=2)
knn = KNeighborsClassifier(n_neighbors=11, weights="distance", metric="minkowski", p=2, algorithm="auto")
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(acc, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))

# Fix the label order so matrix rows/columns and tick labels agree
labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(cm, display_labels=labels)

# Pass the axes explicitly: without `ax`, disp.plot() creates its own figure,
# which would leave a separately created (8, 6) figure blank.
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, xticks_rotation=45)
ax.set_title("Confusion Matrix (Test)")
fig.tight_layout()
plt.show()


## Validation Curve: Accuracy vs k (simple test-set sweep)

In [None]:

# Candidate neighbor counts; each one is scored directly on the test split
k_list = [1,3,5,7,9,11,13,15,17,21,25]

accs = []
for n_neighbors in k_list:
    clf = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights="distance",
        metric="minkowski",
        p=2,
        algorithm="auto",
    )
    # fit() returns the estimator itself, so fit and predict can be chained
    test_pred = clf.fit(X_train, y_train).predict(X_test)
    accs.append(accuracy_score(y_test, test_pred))

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_list, accs, marker="o")
ax.set_xlabel("k (neighbors)")
ax.set_ylabel("Accuracy (test)")
ax.set_title("Accuracy vs k")
ax.grid(True)
fig.tight_layout()
plt.show()

# argmax returns the first maximum, so ties go to the smallest k
best_pos = int(np.argmax(accs))
best_k = k_list[best_pos]
print("Best k (by this simple sweep):", best_k, "with acc =", round(accs[best_pos], 4))


## KD-Tree Neighbor Checks

In [None]:

# KD-Tree over the training features, queried with Euclidean distance
tree = KDTree(X_train, leaf_size=40, metric="euclidean")

# Re-seed so the same test rows are picked every time this cell runs
np.random.seed(7)
n_show = min(3, len(X_test))
sample_rows = np.random.choice(len(X_test), size=n_show, replace=False)

print("--- KD-Tree neighbors for 3 random test samples ---")
for row in sample_rows:
    # Slice (not index) so the query keeps its 2-D shape
    query = X_test[row:row+1]
    nbr_dist, nbr_pos = tree.query(query, k=5, return_distance=True)
    neighbor_labels = y_train.iloc[nbr_pos[0]].to_list()
    print(f"\nTest sample (local idx): {row}")
    print("True:", y_test.iloc[row], "| Pred:", y_pred[row])
    print("Neighbor distances:", np.round(nbr_dist[0], 3))
    print("Neighbor labels:", neighbor_labels)


## Backend Timing: brute vs kd_tree vs ball_tree vs auto

In [None]:

def time_backend(algo):
    """Time fit and predict for a distance-weighted 11-NN classifier on one backend.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    algo : value forwarded to ``KNeighborsClassifier(algorithm=...)``.

    Returns
    -------
    tuple
        ``(fit_s, pred_s, acc)``: seconds spent in ``fit``, seconds spent in
        ``predict``, and accuracy of the predictions against ``y_test``.
    """
    clf = KNeighborsClassifier(n_neighbors=11, weights="distance", algorithm=algo)

    started = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_s = time.perf_counter() - started

    started = time.perf_counter()
    predicted = clf.predict(X_test)
    pred_s = time.perf_counter() - started

    return fit_s, pred_s, accuracy_score(y_test, predicted)

# Time every backend once and keep the measurements keyed by backend name
results = {}
for algo in ("brute", "kd_tree", "ball_tree", "auto"):
    fit_s, pred_s, backend_acc = time_backend(algo)
    results[algo] = {"fit": fit_s, "pred": pred_s, "acc": backend_acc}
    print(f"{algo:8s} | fit {fit_s:6.3f}s | pred {pred_s:6.3f}s | acc {backend_acc:.4f}")

# Bar plots (default matplotlib settings)
algos = list(results)
pred_times = [results[name]["pred"] for name in algos]
acc_vals   = [results[name]["acc"]  for name in algos]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(algos, pred_times)
ax.set_ylabel("Prediction time (s)")
ax.set_title("Prediction Time by Backend")
fig.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(algos, acc_vals)
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy by Backend")
fig.tight_layout()
plt.show()
