In [99]:
# Kelompok 1
# Nama Anggota
# 1. NANDA PUTRI RAHMAWATI (2011016320021)
# 2. HELMA MUKIMAH (2211016220008)
# 3. NORKHADIJAH (2211016220030)
# 4. FAUZAN SAPUTRA (2211016310003)
# Link GDrive data dan output = https://drive.google.com/drive/folders/1-dgVW5mK2UjWzQTQIz2j16YxZ6oczWcy?usp=drive_link

In [100]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [101]:
S = pd.read_csv('lung_cancer_examples.csv')
S = S.values

In [102]:
X = S[:, 2:6]  # Input features (Age, Smokes, AreaQ, Alkhol)
y = S[:, 6]    # Labels (Result)

# Ensure that the labels are integers
y = y.astype(int)

In [103]:
# Initialize K-Fold cross-validation
kf = KFold(n_splits=3, random_state=0, shuffle=True)

print(kf)

KFold(n_splits=3, random_state=0, shuffle=True)


In [104]:
# Loop over different values of k
results = {}  # Dictionary to store results for each k

for k in range(1, 6):
    neigh2 = KNeighborsClassifier(n_neighbors=k, metric='euclidean')

    # Initialize arrays to store accuracy, precision, and recall for each fold
    avg_acc = np.zeros(5)
    avg_pre = np.zeros(5)
    avg_rec = np.zeros(5)

    # Cross-validation loop
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the classifier
        neigh2.fit(X_train, y_train)

        # Make predictions
        y_pred = neigh2.predict(X_test)

        # Calculate and store metrics
        avg_acc[i] = accuracy_score(y_test, y_pred)
        avg_pre[i] = precision_score(y_test, y_pred, average='macro')
        avg_rec[i] = recall_score(y_test, y_pred, average='macro')

        # Print classification report for the current fold
        print(f"k = {k}, Fold {i+1} classification report:\n", classification_report(y_test, y_pred))

    # Store the average metrics for the current value of k
    results[k] = {
        "average_accuracy": np.mean(avg_acc),
        "average_precision": np.mean(avg_pre),
        "average_recall": np.mean(avg_rec)
    }

k = 1, Fold 1 classification report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.89      0.94         9

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20

k = 1, Fold 2 classification report:
               precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.92      1.00      0.96        12

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20

k = 1, Fold 3 classification report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.86      0.86      0.86         7

    accuracy                           0.89        19
   macro avg  

k = 2, Fold 1 classification report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.89      0.94         9

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20

k = 2, Fold 2 classification report:
               precision    recall  f1-score   support

           0       0.70      0.88      0.78         8
           1       0.90      0.75      0.82        12

    accuracy                           0.80        20
   macro avg       0.80      0.81      0.80        20
weighted avg       0.82      0.80      0.80        20

k = 2, Fold 3 classification report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88        12
           1       0.83      0.71      0.77         7

    accuracy                           0.84        19
   macro avg  

In [105]:
# Print results for each k and determine the best k based on average accuracy
best_k = 1
best_accuracy = 0

for k in results:
    print(f"Results for k = {k}:")
    print("Average Accuracy: ", results[k]["average_accuracy"])
    print("Average Precision: ", results[k]["average_precision"])
    print("Average Recall: ", results[k]["average_recall"])
    print()
    # Determine the best k based on the highest average accuracy
    if results[k]["average_accuracy"] > best_accuracy:
        best_accuracy = results[k]["average_accuracy"]
        best_k = k

print(f"The best k is {best_k} with an average accuracy of {best_accuracy}")

Results for k = 1:
Average Accuracy:  0.5589473684210526
Average Precision:  0.5613553113553114
Average Recall:  0.5537698412698413

Results for k = 2:
Average Accuracy:  0.5184210526315789
Average Precision:  0.5196153846153846
Average Recall:  0.5144841269841269

Results for k = 3:
Average Accuracy:  0.5484210526315789
Average Precision:  0.5471153846153846
Average Recall:  0.5422619047619047

Results for k = 4:
Average Accuracy:  0.4978947368421053
Average Precision:  0.5027633477633477
Average Recall:  0.49186507936507934

Results for k = 5:
Average Accuracy:  0.5278947368421052
Average Precision:  0.5272582972582972
Average Recall:  0.5196428571428571

The best k is 1 with an average accuracy of 0.5589473684210526


In [106]:
# Test new data
# Example test data
new_data = np.array([[43 , 17, 6, 0]])  # Example new data based on provided format

# Print prediction for new data with the best k
# neigh2 = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
# neigh2.fit(X, y)  # Fit the model on the entire dataset
# new_prediction = neigh2.predict(new_data)
# print(f"Prediction for new data with k = {best_k}:", new_prediction)

# Loop k = 1 to k = 5 to predict the class for the new data
for k in range(1, 6):
    # Create the KNeighborsClassifier with k neighbors
    neigh2 = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    
    # Fit the model on the entire dataset
    neigh2.fit(X, y)
    
    # Predict the class for the new data
    new_prediction = neigh2.predict(new_data)
    
    # Determine label based on prediction
    if new_prediction == 1:
        label = 'Cancer'
    else:
        label = 'Non Cancer'
    
    # Print the prediction result
    print(f"Prediction for new data with k = {k}:", label)

Prediction for new data with k = 1: Non Cancer
Prediction for new data with k = 2: Non Cancer
Prediction for new data with k = 3: Non Cancer
Prediction for new data with k = 4: Non Cancer
Prediction for new data with k = 5: Non Cancer
