In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] ='/content'
!kaggle datasets download -d uciml/breast-cancer-wisconsin-data
!unzip \*.zip && rm *.zip

Downloading breast-cancer-wisconsin-data.zip to /content
  0% 0.00/48.6k [00:00<?, ?B/s]
100% 48.6k/48.6k [00:00<00:00, 49.8MB/s]
Archive:  breast-cancer-wisconsin-data.zip
  inflating: data.csv                


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Step 1: Read the CSV extracted by the download cell.
data = pd.read_csv('/content/data.csv')

# Predictors are eight of the "*_mean" measurement columns; the target is the
# 'diagnosis' column.
feature_columns = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
]
X = data[feature_columns]
y = data['diagnosis']

# Step 2: Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Calculate similarity matrix
def calculate_similarity_matrix(X):
    """Return the pairwise cosine-similarity matrix of the rows of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        One sample per row. It is converted with ``np.asarray(X, dtype=float)``,
        so nested lists and NumPy arrays are both accepted.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Entry ``[i, j]`` is ``dot(X[i], X[j]) / (norm(X[i]) * norm(X[j]))``.
        Cosine similarity is undefined when either row has zero norm; those
        entries are reported as 0.0 instead of NaN so that downstream
        aggregates (e.g. a mean) are not poisoned. NaNs already present in
        ``X`` still propagate.
    """
    X = np.asarray(X, dtype=float)
    if X.size == 0:
        # No samples (or no features): nothing to compare.
        n_samples = X.shape[0]
        return np.zeros((n_samples, n_samples))

    # Each row norm is computed once instead of once per (i, j) pair.
    norms = np.linalg.norm(X, axis=1)
    dot_products = X @ X.T
    norm_products = np.outer(norms, norms)

    similarity_matrix = np.zeros_like(dot_products)
    # Divide only where the denominator is non-zero; skipped entries keep the
    # 0.0 they were initialised with.
    np.divide(dot_products, norm_products, out=similarity_matrix, where=norm_products != 0)
    return similarity_matrix

# Step 4: Define adaptive threshold
def calculate_adaptive_threshold(similarity_matrix):
    """Return the arithmetic mean over every entry of ``similarity_matrix``.

    No axis is given to the reduction, so the result is a single scalar.
    """
    threshold = np.mean(similarity_matrix, axis=None)
    return threshold

# Step 5: Compute weighted distances
def compute_weighted_distances(x_test, X_train, similarity_matrix):
    """Return similarity-weighted distances from ``x_test`` to each training row.

    For every row ``x`` of ``X_train`` the weighted distance is
    ``norm(x_test - x) / cos_sim(x_test, x)``.

    Parameters
    ----------
    x_test : array-like of shape (n_features,)
        The query sample.
    X_train : array-like of shape (n_train, n_features)
        Training samples, one per row.
    similarity_matrix : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of float
        One weighted distance per training row, in row order. When the cosine
        similarity is zero, negative or undefined (a zero-norm vector, NaN),
        the distance is ``inf``. Dividing by such a value would otherwise
        produce a negative or NaN "distance", and a negative one would sort
        ahead of every genuine neighbour.
    """
    x_test = np.asarray(x_test, dtype=float)
    X_train = np.asarray(X_train, dtype=float)
    if X_train.size == 0:
        return []
    X_train = np.atleast_2d(X_train)

    euclidean = np.linalg.norm(X_train - x_test, axis=1)
    dot_products = X_train @ x_test
    # The query norm is loop-invariant; compute it once.
    norm_products = np.linalg.norm(X_train, axis=1) * np.linalg.norm(x_test)

    similarity = np.zeros_like(dot_products)
    np.divide(dot_products, norm_products, out=similarity, where=norm_products != 0)

    # Start from inf and fill in only the rows with a strictly positive
    # similarity (NaN compares False, so it stays inf as well).
    weighted = np.full_like(euclidean, np.inf)
    np.divide(euclidean, similarity, out=weighted, where=similarity > 0)
    return weighted.tolist()

# Step 6: Classify test instances
def classify_test_instances(X_train, y_train, X_test, k, similarity_matrix, adaptive_threshold):
    """Predict a label for each test row by majority vote of its k nearest neighbours.

    Neighbours are ranked by the values returned from
    ``compute_weighted_distances`` (smallest first).

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : array-like of shape (n_train,)
        Training labels. Any label values that can be sorted are supported.
    X_test : array-like of shape (n_test, n_features)
        Samples to classify.
    k : int
        Number of neighbours that vote; must be at least 1.
    similarity_matrix : object
        Forwarded unchanged to ``compute_weighted_distances``.
    adaptive_threshold : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list
        One predicted label per test row. On a tied vote the smallest label in
        sort order wins, which keeps the previous behaviour of returning 'B'
        on an 'M'/'B' tie.

    Raises
    ------
    ValueError
        If ``k < 1``, the training set is empty, or ``X_train`` and ``y_train``
        have different lengths.
    """
    # Convert to arrays so labels are looked up by position; a pandas Series
    # with a shuffled index would otherwise be indexed by label.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    if k < 1:
        raise ValueError("k must be at least 1")
    if len(X_train) == 0:
        raise ValueError("X_train is empty")
    if len(X_train) != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    y_pred = []
    for x_test in X_test:
        weighted_distances = compute_weighted_distances(x_test, X_train, similarity_matrix)
        nearest_indices = np.argsort(weighted_distances)[:k]
        # np.unique returns the labels sorted; argmax picks the first maximum,
        # so ties resolve to the smallest label.
        labels, counts = np.unique(y_train[nearest_indices], return_counts=True)
        y_pred.append(labels.tolist()[int(np.argmax(counts))])
    return y_pred

# Step 7: Apply EAKNN algorithm
# The helper functions index rows by position, so give them plain NumPy arrays.
train_features = X_train.to_numpy()
similarity_matrix = calculate_similarity_matrix(train_features)
adaptive_threshold = calculate_adaptive_threshold(similarity_matrix)

k = 5  # how many neighbours take part in each vote
y_pred = classify_test_instances(
    train_features,
    y_train.to_numpy(),
    X_test.to_numpy(),
    k,
    similarity_matrix,
    adaptive_threshold,
)

# Step 8: Evaluate performance
# Accuracy first, then precision / recall / F1 computed with average='weighted'.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

for title, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(title, score)

# Per-class breakdown.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.9385964912280702
Precision: 0.9409048938134812
Recall: 0.9385964912280702
F1 Score: 0.937745598564312
Classification Report:
               precision    recall  f1-score   support

           B       0.92      0.99      0.95        71
           M       0.97      0.86      0.91        43

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114

