In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, entry) for entry in entries):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.datasets import load_breast_cancer

## Imports the loader for the breast cancer dataset, which is built into scikit-learn.
## The dataset contains features of tumors and their diagnosis (benign or malignant).

In [3]:
 from sklearn.model_selection import train_test_split
## Split the data

In [5]:
from sklearn.neighbors import KNeighborsClassifier
## k-Nearest Neighbors model, which makes predictions based on the "closest" examples in the training data.

In [6]:
from sklearn.metrics import accuracy_score
## Measures accuracy, i.e. the fraction of our model's predictions that were correct

In [10]:
# Load the breast cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()
# Fields of `cancer` used in this notebook:
#   cancer.data          - the features (radius, texture, etc.)
#   cancer.target        - labels (0 = malignant, 1 = benign)
#   cancer.feature_names - names of the feature columns
#   cancer.target_names  - names of the labels


In [13]:
# Step 2: Build a DataFrame
# One column per feature (named after cancer.feature_names), plus a 'target'
# column holding the label: 0 = malignant, 1 = benign.
df = pd.DataFrame(
    data=cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)

In [14]:
# Step 3: Separate the model inputs from the value to predict
feature_columns = list(cancer.feature_names)
X = df.loc[:, feature_columns]  # input features (used for prediction)
y = df.loc[:, 'target']         # labels (what we want to predict)


In [16]:
# Step 4: Train/test split
# 80-20 split: 20% of the rows are held out for testing, 80% are used for training.
TEST_FRACTION = 0.2
SPLIT_SEED = 42  # fixed seed so the same rows land in each subset on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [17]:
# Step 5: Create and train model
# k-nearest-neighbours classifier that looks at the 5 closest training samples.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

## .fit() trains the model on the training data only; the test rows are held
## back so they can be used to evaluate it afterwards.

## After this cell the model has seen labelled examples of benign and malignant
## tumors and can be asked to classify new ones.

## NOTE(review): the features are passed in unscaled. k-NN is distance-based, so
## standardising the features first may be worth checking.

In [18]:
# Step 6: Predict
y_pred = knn.predict(X_test)
## y_pred now holds the model's predicted label (0 or 1) for each row of the test data.

In [21]:
# Step 7: Comparison DataFrame
# Side-by-side table: the true label of each test row next to the model's guess.
actual_vs_predicted = {
    'Actual': y_test.to_numpy(),
    'Predicted': y_pred,
}
comparison = pd.DataFrame(actual_vs_predicted)

In [22]:
# Optional: attach human-readable class names to the numeric labels
#   0 → "malignant"
#   1 → "benign"
target_names = cancer.target_names
label_by_code = dict(enumerate(target_names))
comparison['Actual_Label'] = comparison['Actual'].map(label_by_code)
comparison['Predicted_Label'] = comparison['Predicted'].map(label_by_code)

In [23]:
# Print the first 10 rows of the actual-vs-predicted comparison table
print(comparison.head(10))


   Actual  Predicted Actual_Label Predicted_Label
0       1          1       benign          benign
1       0          0    malignant       malignant
2       0          0    malignant       malignant
3       1          1       benign          benign
4       1          1       benign          benign
5       0          0    malignant       malignant
6       0          0    malignant       malignant
7       0          0    malignant       malignant
8       1          1       benign          benign
9       1          1       benign          benign


In [24]:
# Print overall accuracy: the fraction of test rows whose predicted label
# matches the actual label, shown to two decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")



Model Accuracy: 0.96


In [27]:
## Show, next to each prediction, whether it was correct

# Human-readable class names, looked up by numeric label
target_names = cancer.target_names  # ['malignant', 'benign']
label_by_code = dict(enumerate(target_names))
comparison['Actual_Label'] = comparison['Actual'].map(label_by_code)
comparison['Predicted_Label'] = comparison['Predicted'].map(label_by_code)

# Per-row marker: ✅ when the prediction matches the actual label, ❌ otherwise
is_correct = comparison['Actual'] == comparison['Predicted']
comparison['Accuracy'] = is_correct.map({True: '✅', False: '❌'})

# First 10 rows of the annotated table
print(comparison.head(10))

# Overall accuracy across the whole test set
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {overall_accuracy:.2f}")

   Actual  Predicted Actual_Label Predicted_Label Accuracy
0       1          1       benign          benign        ✅
1       0          0    malignant       malignant        ✅
2       0          0    malignant       malignant        ✅
3       1          1       benign          benign        ✅
4       1          1       benign          benign        ✅
5       0          0    malignant       malignant        ✅
6       0          0    malignant       malignant        ✅
7       0          0    malignant       malignant        ✅
8       1          1       benign          benign        ✅
9       1          1       benign          benign        ✅

Model Accuracy: 0.96


In [32]:
### Best practice: use cross-validation to compare candidate values of k
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes: every odd k from 1 to 99.
# With two classes, an odd k cannot produce a tied vote.
K_VALUES = range(1, 100, 2)
CV_FOLDS = 5  # 5-fold cross-validation

# NOTE(review): the search runs on the full X, y, which includes the rows held
# out in X_test. Confirm whether selection should be restricted to X_train
# before reporting test accuracy for the chosen k.

cv_accuracy = {}  # k -> mean cross-validated accuracy
for k in K_VALUES:
    # Use a separate name so the fitted `knn` from the training step is not
    # replaced by an unfitted estimator (which would break re-running predict).
    candidate = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(candidate, X, y, cv=CV_FOLDS)
    cv_accuracy[k] = scores.mean()
    print(f"k={k}, Average Accuracy: {scores.mean():.2f}")

# Select the k with the highest mean accuracy; on ties max() keeps the first
# one encountered, i.e. the smallest k.
best_k = max(cv_accuracy, key=cv_accuracy.get)
print(f"\nBest k: {best_k} (mean CV accuracy {cv_accuracy[best_k]:.4f})")


k=1, Average Accuracy: 0.91
k=3, Average Accuracy: 0.92
k=5, Average Accuracy: 0.93
k=7, Average Accuracy: 0.93
k=9, Average Accuracy: 0.93
k=11, Average Accuracy: 0.93
k=13, Average Accuracy: 0.93
k=15, Average Accuracy: 0.93
k=17, Average Accuracy: 0.93
k=19, Average Accuracy: 0.93
k=21, Average Accuracy: 0.93
k=23, Average Accuracy: 0.92
k=25, Average Accuracy: 0.92
k=27, Average Accuracy: 0.92
k=29, Average Accuracy: 0.92
k=31, Average Accuracy: 0.92
k=33, Average Accuracy: 0.92
k=35, Average Accuracy: 0.92
k=37, Average Accuracy: 0.92
k=39, Average Accuracy: 0.92
k=41, Average Accuracy: 0.92
k=43, Average Accuracy: 0.92
k=45, Average Accuracy: 0.92
k=47, Average Accuracy: 0.92
k=49, Average Accuracy: 0.91
k=51, Average Accuracy: 0.92
k=53, Average Accuracy: 0.91
k=55, Average Accuracy: 0.92
k=57, Average Accuracy: 0.91
k=59, Average Accuracy: 0.91
k=61, Average Accuracy: 0.91
k=63, Average Accuracy: 0.91
k=65, Average Accuracy: 0.91
k=67, Average Accuracy: 0.91
k=69, Average Accur