In [1]:
import os
# Point the Kaggle tooling at /content via KAGGLE_CONFIG_DIR
# (presumably the directory holding kaggle.json -- TODO confirm the credentials file is uploaded there first).
os.environ['KAGGLE_CONFIG_DIR'] ='/content'
# Shell escape: download the dataset archive with the kaggle CLI into the current working directory.
!kaggle datasets download -d thedevastator/cancer-patients-and-air-pollution-a-new-link
# The backslash stops the shell from expanding the wildcard, so unzip receives the literal pattern `*.zip`;
# the archives are deleted only if unzip exits successfully (`&&`).
# NOTE(review): `rm *.zip` removes every zip in the working directory, not just the one downloaded above.
!unzip \*.zip && rm *.zip

Downloading cancer-patients-and-air-pollution-a-new-link.zip to /content
  0% 0.00/7.36k [00:00<?, ?B/s]
100% 7.36k/7.36k [00:00<00:00, 12.9MB/s]
Archive:  cancer-patients-and-air-pollution-a-new-link.zip
  inflating: cancer patient data sets.csv  


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Read the raw patient table from the extracted CSV
data = pd.read_csv('/content/cancer patient data sets.csv')

# 'Level' is the prediction target; the identifier columns and four
# hand-picked risk-factor columns are excluded from the feature matrix.
excluded_columns = [
    'index',
    'Patient Id',
    'Alcohol use',
    'Dust Allergy',
    'OccuPational Hazards',
    'Genetic Risk',
    'Level',
]
y = data['Level']
X = data.drop(columns=excluded_columns)

# Map the Gender column to integer codes
label_encoder = LabelEncoder()
label_encoder.fit(X['Gender'])
X['Gender'] = label_encoder.transform(X['Gender'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=52,
)

# Standardise features: statistics are learned on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to dynamically adjust k based on local density
def get_k_density(X_train, X_query, k_min=1, k_max=10):
    """Return the neighbourhood size whose distance-based density score is highest.

    For each candidate ``k`` in ``[k_min, k_max]`` the score is the mean, over
    the query rows, of ``1 / (sum of Euclidean distances to the k nearest
    training rows + 1e-6)``.  The first ``k`` reaching the maximum score wins.

    The function depends only on its arguments: distances are computed
    directly with NumPy, so no labels are needed and nothing is refitted per
    candidate ``k``.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Reference points.
    X_query : array-like of shape (n_queries, n_features) or (n_features,)
        Point(s) whose neighbourhood is evaluated; a 1-D input is treated as
        a single query.
    k_min, k_max : int, default 1 and 10
        Inclusive range of candidate neighbourhood sizes.  ``k_max`` is
        capped at ``n_samples``.

    Returns
    -------
    int
        The selected ``k``.

    Raises
    ------
    ValueError
        If ``X_train`` is not a non-empty 2-D array, ``k_min < 1``, or the
        candidate range is empty after capping ``k_max`` at ``n_samples``.

    Notes
    -----
    NOTE(review): the summed distance to the k nearest neighbours can only
    grow (or stay equal) as k increases, so this score is maximised at the
    smallest candidate and the function effectively returns ``k_min``.  The
    criterion is kept as written; revisit it if a genuinely adaptive k is
    intended.
    """
    train = np.asarray(X_train, dtype=float)
    query = np.atleast_2d(np.asarray(X_query, dtype=float))

    if train.ndim != 2 or train.shape[0] == 0:
        raise ValueError("X_train must be a non-empty 2-D array")
    if k_min < 1:
        raise ValueError("k_min must be at least 1")

    # A point cannot have more neighbours than there are training rows.
    k_upper = min(k_max, train.shape[0])
    if k_min > k_upper:
        raise ValueError("no candidate k: k_min exceeds k_max or the number of training samples")

    # Euclidean distance from every query row to every training row.
    diffs = query[:, np.newaxis, :] - train[np.newaxis, :, :]
    distances = np.sqrt((diffs ** 2).sum(axis=2))

    # Sorted once; column k-1 of the running sum is the total distance to the
    # k nearest neighbours, which serves every candidate k without recomputation.
    nearest = np.sort(distances, axis=1)[:, :k_upper]
    cumulative = np.cumsum(nearest, axis=1)

    k_best = k_min
    best_score = float('-inf')
    for k in range(k_min, k_upper + 1):
        density = 1 / (cumulative[:, k - 1] + 1e-6)  # Avoid division by zero
        score = density.mean()

        # Strict comparison: on ties the smaller k is kept.
        if score > best_score:
            best_score = score
            k_best = k

    return k_best

# Predict using LAKNN
def predict_laknn(X_train, y_train, X_test):
    """Choose a neighbourhood size for every test point.

    Despite the name, this returns the selected ``k`` per test row, not class
    predictions; the caller fits the classifier with those values.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features, forwarded to ``get_k_density``.
    y_train : array-like
        Not used by this function; kept so existing calls keep working.
    X_test : array-like of shape (n_queries, n_features) or (n_features,)
        Test features.  Converted to a 2-D array first so that DataFrames
        (whose plain iteration yields column labels) and single 1-D points
        are iterated row by row.

    Returns
    -------
    list of int
        One selected ``k`` per test row, in row order.
    """
    queries = np.atleast_2d(np.asarray(X_test))
    return [get_k_density(X_train, row.reshape(1, -1)) for row in queries]

# Get k values for test set
k_values_test = predict_laknn(X_train_scaled, y_train, X_test_scaled)

# Use the selected k values to predict.
# A classifier is fitted once per distinct k and all test rows sharing that k
# are predicted in one call, instead of refitting for every single test row.
predictions = [None] * len(k_values_test)
for k in sorted(set(k_values_test)):
    positions = [i for i, k_i in enumerate(k_values_test) if k_i == k]
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    for i, label in zip(positions, knn.predict(X_test_scaled[positions])):
        predictions[i] = label

# Flat 1-D array of labels, aligned with the rows of X_test_scaled / y_test
y_pred = np.array(predictions)

# Calculate evaluation metrics (weighted average across the classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# NOTE(review): if every metric comes out as exactly 1.0, check whether
# identical rows appear on both sides of the train/test split before
# trusting the result.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
# Display classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00        83
         Low       1.00      1.00      1.00        53
      Medium       1.00      1.00      1.00        64

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

