In [73]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score


In [74]:
df = pd.read_csv("cancer.csv")
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [75]:
df = df.drop("STDs: Time since first diagnosis", axis=1)
df = df.drop("STDs: Time since last diagnosis", axis=1)
df.replace("?", pd.NA, inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0,1,0,1,0,0,0,0,0
3,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
4,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0


In [76]:
df.corr()

Unnamed: 0,Age,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
Age,1.0,-0.009647,0.105179,-0.022579,0.092913,0.039025,-0.030625,0.084757,-0.034654,0.056209
STDs: Number of diagnosis,-0.009647,1.0,-0.017712,-0.020102,-0.015313,-0.015313,0.074947,0.118138,0.069592,0.092889
Dx:Cancer,0.105179,-0.017712,1.0,-0.010854,0.907229,0.78289,0.148549,0.143002,0.121929,0.184112
Dx:CIN,-0.022579,-0.020102,-0.010854,1.0,-0.010522,0.428759,-0.014565,-0.021674,-0.016725,0.07127
Dx:HPV,0.092913,-0.015313,0.907229,-0.010522,1.0,0.679831,0.155126,0.150449,0.128023,0.192264
Dx,0.039025,-0.015313,0.78289,0.428759,0.679831,1.0,0.107852,0.083449,0.128023,0.153203
Hinselmann,-0.030625,0.074947,0.148549,-0.014565,0.155126,0.107852,1.0,0.64725,0.161802,0.518468
Schiller,0.084757,0.118138,0.143002,-0.021674,0.150449,0.083449,0.64725,1.0,0.378412,0.73068
Citology,-0.034654,0.069592,0.121929,-0.016725,0.128023,0.128023,0.161802,0.378412,1.0,0.340635
Biopsy,0.056209,0.092889,0.184112,0.07127,0.192264,0.153203,0.518468,0.73068,0.340635,1.0


In [77]:
def calculate_accuracy(inputted_predictions, actual):
    correct = sum(inputted_predictions == actual)
    total = len(actual)
    accuracy = correct / total
    return accuracy


# Separate features and target variable
X = df.drop('Dx:HPV', axis=1)
y = df['Dx:HPV']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


class KNN:
    def __init__(self, k=3):
        self.k = k


    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predictions = []
        for x_test in X:
            distances = np.sqrt(np.sum((self.X_train - x_test) ** 2, axis=1))
            nearest_neighbors = np.argsort(distances)[:self.k]
            knn_labels = self.y_train[nearest_neighbors]
            most_common = np.argmax(np.bincount(knn_labels))
            predictions.append(most_common)
        return predictions


# Instantiate the KNN  classifier
knn = KNN(k=5)


# Fit the model to the training data
knn.fit(X_train.values, y_train.values)


In [71]:
knn_test_predictions = knn.predict(X_test.values)
knn_test_accuracy = accuracy_score(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy:", knn_test_accuracy)


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [78]:
X_train_filled_aligned = X_train.reindex(columns=X_train.columns, fill_value=0)
knn_train_predictions = knn.predict(X_train_filled_aligned.values)
knn_train_accuracy = accuracy_score(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy:", knn_train_accuracy)


knn_test_accuracy = calculate_accuracy(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy (Alternate Method):", knn_test_accuracy)
knn_train_accuracy = calculate_accuracy(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy (Alternate Method):", knn_train_accuracy)


TypeError: unsupported operand type(s) for -: 'str' and 'str'