# Part 1: Data Preprocessing

Dataset link: [Breast Cancer Wisconsin](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data)

## Importing the libraries and dataset

In [None]:
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset from a CSV file in the current working directory.
dataset = pd.read_csv('./data.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
dataset.head()

## Cleaning the dataset

In [None]:
# Remove the "Unnamed: 32" column (presumably an empty artifact of the CSV
# export -- confirm against the raw file).
dataset = dataset.drop(columns="Unnamed: 32")

In [None]:
# Confirm the column was dropped.
dataset.head()

In [None]:
# Encode the label in column 1 as an integer: 'M' -> 1, anything else -> 0.
# A single vectorized comparison replaces the row-by-row .iloc loop and
# produces a proper integer column instead of an object column of Python ints.
dataset[dataset.columns[1]] = (dataset.iloc[:, 1] == 'M').astype(int)

In [None]:
# Inspect the first 20 rows to check the 0/1 label encoding.
dataset.head(20)

## Splitting the data into training and test sets

In [None]:
# Feature matrix: every column from index 2 onward, as a NumPy array.
x_set = dataset.iloc[:, 2:].values

In [None]:
# Target vector: column 1 (the 0/1 encoded label), as a NumPy array.
y_set = dataset.iloc[:, 1].values

In [None]:
# Sanity check: both arrays should have the same number of rows.
x_set.shape, y_set.shape

In [None]:
# Hold out 30% of the samples for testing. A fixed random_state makes the
# shuffle (and therefore every downstream number) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_set, y_set, test_size=0.3, random_state=42
)

In [None]:
# Check the sizes of the resulting train/test partitions.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

## Feature scaling

In [None]:
# Scaler used to standardize the features before training.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split so no test statistics enter the fit.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Display the scaled training features.
x_train

In [None]:
# Display the scaled test features.
x_test

# Part 2: Building the model

## Logistic regression

In [None]:
def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x)).

    Args:
        x: Scalar or array-like of real numbers. Object-dtype arrays holding
            numbers are accepted and converted to float.

    Returns:
        NumPy array of the same shape as ``x`` with values in [0, 1].
    """
    z = np.asarray(x, dtype=float)
    # exp is only ever evaluated on a non-positive argument, so it cannot
    # overflow no matter how large |z| is.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value,
    # written so that each branch stays finite.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

In [None]:
def model(x, w, b):
    """Logistic-regression forward pass: sigmoid of the affine map x.w + b."""
    logits = np.dot(x, w) + b
    return sigmoid(logits)

In [None]:
def bce_loss(yhat, y):
    """Mean binary cross-entropy between predictions ``yhat`` and labels ``y``.

    A small epsilon is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce ``log(0)``.
    """
    eps = 1e-9
    n_samples = len(y)
    positive_term = y * np.log(yhat + eps)
    negative_term = (1 - y) * np.log(1 - yhat + eps)
    return -np.sum(positive_term + negative_term) / n_samples

In [None]:
def gradient(x, y_hat, y):
    """Gradients of the mean BCE loss with respect to the weights and bias.

    Returns:
        Tuple ``(grad_w, grad_b)``: the residual ``y_hat - y`` projected onto
        the features and summed, each divided by the number of samples.
    """
    n_samples = len(y)
    residual = y_hat - y
    weight_grad = np.dot(x.T, residual) / n_samples
    bias_grad = np.sum(residual) / n_samples
    return weight_grad, bias_grad

In [None]:
def train_model(x, y, n_epoch=100000, lr=1e-3):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Args:
        x: Feature matrix of shape (m, d).
        y: Label vector of length m. Object-dtype arrays holding numbers are
            accepted and converted to float.
        n_epoch: Number of gradient-descent steps to run.
        lr: Learning rate (step size).

    Returns:
        Tuple ``(w, b)``: the weight vector of shape (d,) and the bias as an
        array of shape (1,).
    """
    # Cast once up front so every epoch works on native float arrays rather
    # than (potentially) object arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    _, d = x.shape
    w = np.zeros(d)
    b = np.zeros(1)

    # Integer logging interval giving roughly 100 progress lines; clamped to 1
    # so runs shorter than 100 epochs still work.
    log_every = max(1, n_epoch // 100)

    for epoch in range(n_epoch):
        y_hat = model(x, w, b)

        # The loss is only needed for reporting, so compute it only when it is
        # printed. It reflects the parameters *before* this epoch's update.
        if epoch % log_every == 0 or epoch == n_epoch - 1:
            loss = bce_loss(y_hat, y)
            print(f"epoch {epoch}: train_loss = {loss:.4f}")

        grad_w, grad_b = gradient(x, y_hat, y)
        w = w - lr * grad_w
        b = b - lr * grad_b

    return w, b

In [324]:
# Train on the scaled training split using the default epoch count and
# learning rate.
w_optim, b_optim = train_model(x_train, y_train)

epoch 26000: train_loss = 0.4164
epoch 26500: train_loss = 0.4138
epoch 27000: train_loss = 0.4112
epoch 27500: train_loss = 0.4086
epoch 28000: train_loss = 0.4061
epoch 28500: train_loss = 0.4037


In [None]:
def test_trained_model(x_test, y_test, w, b):
    """Print the binary cross-entropy of parameters ``(w, b)`` on a test set."""
    predictions = model(x_test, w, b)
    test_loss = bce_loss(predictions, y_test)
    print(f"test_loss = {test_loss:.4f}")

In [None]:
# Report the loss of the trained parameters on the held-out test split.
test_trained_model(x_test, y_test, w_optim, b_optim)

In [None]:
def accuracy(y_hat, y, threshold=0.9):
    """Return classification accuracy as a percentage in [0, 100].

    A sample is predicted positive when its probability is at least
    ``threshold``; the prediction is correct when that matches ``y == 1``.

    Args:
        y_hat: Predicted probabilities, one per sample. Not modified.
        y: True labels; the value 1 marks the positive class, any other value
            is treated as negative.
        threshold: Decision cutoff. Defaults to 0.9 to keep the existing
            behaviour; 0.5 is the usual choice for logistic regression.

    Returns:
        Percentage of correctly classified samples, as a float.

    Raises:
        ValueError: If ``y`` is empty or the two inputs differ in length.
    """
    probs = np.asarray(y_hat, dtype=float)
    labels = np.asarray(y, dtype=float)
    if labels.size == 0:
        raise ValueError("accuracy() requires at least one sample")
    if probs.shape[0] != labels.shape[0]:
        raise ValueError("y_hat and y must have the same length")

    predicted_positive = probs >= threshold
    actual_positive = labels == 1
    # Count from zero (no phantom extra hit) and never write into y_hat.
    n_correct = np.count_nonzero(predicted_positive == actual_positive)
    return float(n_correct) / labels.shape[0] * 100

In [None]:
# Predicted probabilities for the test split from the trained parameters.
y_hat = model(x_test, w_optim, b_optim)

In [None]:
# Test-set accuracy, in percent.
accuracy(y_hat, y_test)