In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] ='/content'
!kaggle datasets download -d erdemtaha/cancer-data
!unzip \*.zip && rm *.zip

Downloading cancer-data.zip to /content
  0% 0.00/48.6k [00:00<?, ?B/s]
100% 48.6k/48.6k [00:00<00:00, 10.2MB/s]
Archive:  cancer-data.zip
  inflating: Cancer_Data.csv         


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Step 1: Load Data
# Read the CSV that the download cell extracted into /content.
data = pd.read_csv('/content/Cancer_Data.csv')

# Step 2: Prepare Data
# Eight "*_mean" measurement columns are used as model inputs.
feature_columns = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
]
X = data[feature_columns]
y = data['diagnosis']  # Target variable

# Step 3: Split the dataset into training and testing sets
# 20% of the rows are held out; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Standardize the features
# The scaler is fitted on the training rows only and then re-used for the
# test rows, so no test-set statistics reach the training pipeline.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Cluster the training data using Hierarchical Clustering
n_clusters = 5  # Number of clusters
hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters)
cluster_labels_train = hierarchical_clustering.fit_predict(X_train_scaled)

# Agglomerative clustering gives only per-sample labels here, so a
# representative point per cluster is built by averaging the scaled training
# rows assigned to it. Position j of the list holds the centre of cluster j.
cluster_centers = [
    X_train_scaled[cluster_labels_train == cluster_id].mean(axis=0)
    for cluster_id in range(n_clusters)
]

# Step 6: Apply KNN within each cluster
# One classifier is trained per cluster, using only that cluster's training
# rows; knn_models[j] is the model for cluster j.
k = 5  # Number of neighbors
knn_models = []
for i in range(n_clusters):
    # Find data points and labels within the cluster
    cluster_indices = np.where(cluster_labels_train == i)[0]
    X_cluster = X_train_scaled[cluster_indices]
    y_cluster = y_train.iloc[cluster_indices]

    # Train KNN model for the cluster.
    # A cluster may hold fewer than k samples; asking for more neighbours than
    # there are fitted samples makes predict() raise, so cap the neighbour
    # count at the cluster size. Clusters with >= k members are unaffected.
    n_neighbors_cluster = min(k, len(cluster_indices))
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors_cluster)
    knn_model.fit(X_cluster, y_cluster)
    knn_models.append(knn_model)

# Step 7: Make predictions
# Each test point is routed to the cluster whose centre is closest (Euclidean
# distance in the scaled feature space) and classified by that cluster's KNN
# model. y_pred ends up as a list with one label per row of X_test_scaled.

# Stack the per-cluster centres once instead of re-converting the list to an
# array for every test point.
cluster_center_matrix = np.asarray(cluster_centers)

y_pred = []
for x_test_point in X_test_scaled:
    # Find the nearest cluster center
    distances_to_centers = np.linalg.norm(cluster_center_matrix - x_test_point, axis=1)
    nearest_cluster_index = np.argmin(distances_to_centers)

    # The cluster's training rows are already stored inside its fitted model,
    # so nothing needs to be re-extracted from the training set here.
    predicted_class = knn_models[nearest_cluster_index].predict([x_test_point])[0]
    y_pred.append(predicted_class)

# Step 8: Evaluate the model
# Precision, recall and F1 are averaged over the classes weighted by support.
averaging = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Display evaluation metrics
metric_lines = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
)
for metric_label, metric_value in metric_lines:
    print(metric_label, metric_value)

# Display classification report (per-class breakdown)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)


Accuracy: 0.9035087719298246
Precision: 0.9088250930356195
Recall: 0.9035087719298246
F1 Score: 0.9043988410104431

Classification Report:
               precision    recall  f1-score   support

           B       0.95      0.89      0.92        71
           M       0.83      0.93      0.88        43

    accuracy                           0.90       114
   macro avg       0.89      0.91      0.90       114
weighted avg       0.91      0.90      0.90       114

