In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
breast_cancer=pd.read_csv('breast_cancer.csv')
breast_cancer['diagnosis']=breast_cancer['diagnosis'].map({'M':1,'B':0})
breast_cancer=breast_cancer.drop(['id'],axis=1)

In [3]:
def train_test_split(X, y, test_size=0.2, random_state=42):
    np.random.seed(random_state)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    split_idx = int(X.shape[0] * (1 - test_size))
    X_train, X_test = X[indices[:split_idx]], X[indices[split_idx:]]
    y_train, y_test = y[indices[:split_idx]], y[indices[split_idx:]]
    return X_train, X_test, y_train, y_test



In [4]:
X=breast_cancer.iloc[:, 1:].values
y=breast_cancer['diagnosis'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [5]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [6]:
def initialize_weights(nfeatures): 
    weights=np.zeros(nfeatures)
    bias=0
    return weights, bias

In [7]:
def compute_cost(X, y,weights,bias):
    m=X.shape[0]
    z=X.dot(weights)+bias
    predictions=sigmoid(z)
    cost=-(1/m)*np.sum(y*np.log(predictions)+(1-y)*np.log(1-predictions))
    return cost

In [8]:
def optimize_weights(X,y,weights,bias,learning_rate,num_iterations):
    m=X.shape[0]
    for i in range (num_iterations):
        z=X.dot(weights)+bias
        predictions=sigmoid(z)
        dw=(1/m)*np.dot(X.T,(predictions-y))
        db=(1/m)*np.sum(predictions-y)
        weights-=learning_rate*dw
        bias-=learning_rate*db
    return weights, bias

In [9]:
def train_logistic_regression(X_train,y_train,learning_rate,num_iterations):
    n_features=X_train.shape[1]
    weights, bias=initialize_weights(n_features)
    weights, bias=optimize_weights(X_train,y_train,weights,bias,learning_rate,num_iterations)
    return weights, bias



In [10]:
def predict_logistic_regression(X,weights,bias):
    z=np.dot(X, weights)+bias
    predictions=sigmoid(z)
    return [1 if p > 0.5 else 0 for p in predictions]

In [11]:
weights, bias=train_logistic_regression(X_train, y_train, learning_rate=0.01, num_iterations=1000)
y_pred_train=predict_logistic_regression(X_train, weights, bias)
y_pred_test=predict_logistic_regression(X_test, weights, bias)


In [12]:
train_accuracy=np.mean(y_pred_train==y_train)
test_accuracy=np.mean(y_pred_test==y_test)
print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Train Accuracy: 0.64
Test Accuracy: 0.59


In [13]:
def euclidean_distance(p1, p2):
    return np.sqrt(np.sum((p1-p2)**2))


In [14]:
def get_neighbors(X_train, y_train, test_instance, k):
    distances=[(euclidean_distance(test_instance, X_train[i]), y_train[i]) for i in range(len(X_train))]
    distances.sort(key=lambda x: x[0])
    neighbors=distances[:k]
    return [label for _, label in neighbors]

In [15]:
from collections import Counter

In [16]:
def predict_kNN(X_train, y_train, X_test, k):
    predictions=[]
    for test_instance in X_test:
        neighbors=get_neighbors(X_train, y_train, test_instance, k)
        majority_vote=Counter(neighbors).most_common(1)[0][0]
        predictions.append(majority_vote)
    return predictions

In [24]:
k=1
pm_y_pred_train=predict_kNN(X_train, y_train, X_train, k)
pm_y_pred_test=predict_kNN(X_train, y_train, X_test, k)

pm_train_accuracy=np.mean(pm_y_pred_train==y_train)
pm_test_accuracy=np.mean(pm_y_pred_test==y_test)
print(f"Train Accuracy (kNN): {pm_train_accuracy:.2f}, Test Accuracy (kNN): {pm_test_accuracy:.2f}")

Train Accuracy (kNN): 0.64, Test Accuracy (kNN): 0.59
