In [115]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score


In [116]:
# Read the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cancer.csv")
# Show the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [117]:
# Cleaning pipeline, written as a single chain so each step yields a new frame:
#   1. remove the two "time since diagnosis" columns,
#   2. turn the "?" placeholder into a proper missing value,
#   3. discard every row that still contains a missing value,
#   4. renumber the remaining rows from 0.
df = (
    df.drop(
        columns=[
            "STDs: Time since first diagnosis",
            "STDs: Time since last diagnosis",
        ]
    )
    .replace("?", pd.NA)
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0,1,0,1,0,0,0,0,0
3,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
4,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0


In [118]:
# Convert every column to a numeric dtype, one column at a time.
# pd.to_numeric raises if a value cannot be parsed, so stray text is not
# silently kept.
df = df.apply(pd.to_numeric)
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0,1,0,1,0,0,0,0,0
3,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
4,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0


In [119]:
def calculate_accuracy(inputted_predictions, actual):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    inputted_predictions : array-like
        Predicted labels (list, tuple, NumPy array or pandas Series).
    actual : array-like
        True labels; must have the same shape as ``inputted_predictions``.

    Returns
    -------
    float
        Share of positions where prediction and label agree, between 0 and 1.

    Raises
    ------
    ValueError
        If the two inputs differ in shape, or if they are empty.
    """
    # Convert up front: comparing two plain lists with ``==`` yields a single
    # bool instead of an element-wise result, which made the old ``sum`` fail.
    predicted = np.asarray(inputted_predictions)
    expected = np.asarray(actual)

    if predicted.shape != expected.shape:
        raise ValueError(
            "predictions and actual labels must have the same shape, got "
            f"{predicted.shape} and {expected.shape}"
        )
    if expected.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # The mean of the boolean match mask is exactly correct / total.
    return np.mean(predicted == expected)


# Target variable: the 'Dx:HPV' column. Copy it so the labels are independent
# of any later change to ``df``.
y = df['Dx:HPV'].copy()


# Features: every column except the target.
X = df.drop('Dx:HPV', axis=1)


# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# Numeric feature columns to standardise ('number' matches every integer and
# float dtype regardless of platform). The target is deliberately not included.
numeric_cols = X_train.select_dtypes(include='number').columns


# Initialize the scaler
scaler = StandardScaler()  # Or MinMaxScaler()


# Work on explicit copies so the assignments below cannot write into ``df``.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean/std from the training rows only, then apply the same
# transformation to the held-out rows.
# NOTE(review): ``df`` itself is no longer scaled in place; confirm that no
# later cell relies on a scaled ``df``.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


# K-Nearest Neighbors


class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Labels must be non-negative integers (or values that convert to such via
    ``astype(int)``), because the majority vote is computed with
    ``np.bincount``.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction. Must be >= 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and labels; no computation happens here.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature matrix.
        y : array-like of shape (n_samples,)
            Training labels, aligned positionally with the rows of ``X``.
        """
        # np.asarray lets callers pass lists or pandas objects as well as arrays.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        list of int
            One predicted label per query row. The same list is also stored
            on ``self.predictions``.
        """
        self.predictions = []
        for x_test in np.asarray(X):
            # Euclidean distance from this query to every training sample.
            distances = np.sqrt(np.sum((self.X_train - x_test) ** 2, axis=1))
            nearest_neighbors = np.argsort(distances)[:self.k]
            knn_labels = self.y_train[nearest_neighbors].astype(int)
            # bincount counts votes per label; argmax picks the most frequent
            # one (the smallest label wins a tie).
            most_common = np.argmax(np.bincount(knn_labels))
            # Return the winning label itself. The previous version mapped
            # every non-zero label to 1, which was only right for 0/1 labels.
            self.predictions.append(int(most_common))
        return self.predictions


# Instantiate the KNN classifier with 25 neighbours.
# (An earlier k=5 model was built and fitted here too, but it was overwritten
# immediately without being used, so only the k=25 model is kept.)
knn = KNN(k=25)


# Fit the model to the training data
knn.fit(X_train.values, y_train.values)


In [120]:
# Predict labels for the held-out rows.
knn_test_predictions = knn.predict(X_test.values)
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
# Accuracy is symmetric, so the number is unchanged, but the documented order
# keeps this call consistent with other sklearn metrics.
knn_test_accuracy = accuracy_score(y_test.values, knn_test_predictions)
print("K-Nearest Neighbors Test Accuracy:", knn_test_accuracy)


K-Nearest Neighbors Test Accuracy: 0.9850746268656716


In [121]:
# Predict on the same rows the model was fitted on. Each training row is at
# distance 0 from itself, so it is always one of its own neighbours.
knn_train_predictions = knn.predict(X_train.values)
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
knn_train_accuracy = accuracy_score(y_train.values, knn_train_predictions)
print("K-Nearest Neighbors Train Accuracy:", knn_train_accuracy)

# Cross-check both scores with the hand-written accuracy helper.
knn_test_accuracy = calculate_accuracy(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy (Alternate Method):", knn_test_accuracy)
knn_train_accuracy = calculate_accuracy(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy (Alternate Method):", knn_train_accuracy)

K-Nearest Neighbors Train Accuracy: 0.9737827715355806
K-Nearest Neighbors Test Accuracy (Alternate Method): 0.9850746268656716
K-Nearest Neighbors Train Accuracy (Alternate Method): 0.9737827715355806
