In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# read dataset from a Google Drive File
file_link = 'https://drive.google.com/file/d/1r1zucOO8X-vli134FpfjCsCjw3cb7270/view?usp=sharing' # the file access must have to be Public
id = file_link.split("/")[-2]
new_link = f'https://drive.google.com/uc?id={id}'
print(new_link)
df = pd.read_csv(new_link)

https://drive.google.com/uc?id=1r1zucOO8X-vli134FpfjCsCjw3cb7270


In [3]:
# Explore the dataset (optional)
print(df.head())
print(df.info())

        ID  Clump  UnifSize  UnifShape  MargAdh  SingEpiSize BareNuc  \
0  1000025      5         1          1        1            2       1   
1  1002945      5         4          4        5            7      10   
2  1015425      3         1          1        1            2       2   
3  1016277      6         8          8        1            3       4   
4  1017023      4         1          1        3            2       1   

   BlandChrom  NormNucl  Mit  Class  
0           3         1    1      2  
1           3         2    1      2  
2           3         1    1      2  
3           3         7    1      2  
4           3         1    1      2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           699 non-null    int64 
 1   Clump        699 non-null    int64 
 2   UnifSize     699 non-null    int64 
 3   UnifShape    699 non-nul

In [4]:
df.head(15)

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [5]:
df.describe()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [6]:
df.nunique()

Unnamed: 0,0
ID,645
Clump,10
UnifSize,10
UnifShape,10
MargAdh,10
SingEpiSize,10
BareNuc,11
BlandChrom,10
NormNucl,10
Mit,9


In [7]:
#change categorical col to numerical col
df.dtypes
df = df[pd.to_numeric(df['BareNuc'],errors='coerce').notnull()].astype('int')
df.dtypes

Unnamed: 0,0
ID,int64
Clump,int64
UnifSize,int64
UnifShape,int64
MargAdh,int64
SingEpiSize,int64
BareNuc,int64
BlandChrom,int64
NormNucl,int64
Mit,int64


In [8]:
X = df.iloc[:, :-1]  # all columns except the last one
y = df.iloc[:, -1]   # the last column

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#check for accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
#can also use classifiation_report metrics for more details

Accuracy: 0.9635036496350365


# **Implementing Logistic Regression from Scratch**





In [9]:
X = df.drop(columns=['Class']).values
y = df['Class'].values
y = np.where(y == 2, 0, 1)  # Convert labels to 0 and 1 if needed (2=benign, 4=malignant)

# Normalize features
X = (X - X.mean(axis=0)) / X.std(axis=0) #axis=0 goes for a particular column
print(X)
# Add bias term
# is independent feature is 0, still allows to classify.
X = np.c_[np.ones(X.shape[0]), X]  # Add bias column (x0 = 1)

# Initialize weights
weights = np.zeros(X.shape[1])

def sigmoid(z):
    return 1 / (1 + np.exp(-z))
    pass

# Loss function (binary cross-entropy)
def compute_loss(y, y_pred):
    epsilon = 1e-10  # to avoid log(0)
    # https://towardsdatascience.com/understanding-binary-cross-entropy-log-loss-a-visual-explanation-a3ac6025181a/
    loss = -np.mean(y * np.log(y_pred + epsilon) + (1 - y) * np.log(1 - y_pred + epsilon))
    return loss

# Gradient descent
def train(X, y, weights, lr=0.1, epochs=1000):
    for i in range(epochs):
        z = np.dot(X, weights)
        # call the sigmoid function on z
        y_pred = sigmoid(z)

        # find the error by calculating the difference between y_pred and y
        error = y_pred - y

        grad = np.dot(X.T, error) / len(y)
        # update the weights using learning rate(lr) and gradient(grad)
        weights -= lr * grad

        if i % 100 == 0:
            loss = compute_loss(y, y_pred)
            print(f"Epoch {i}: Loss = {loss:.4f}")

    return weights

# Train the model
weights = train(X, y, weights)

# Predictions
y_pred = sigmoid(np.dot(X, weights)) >= 0.5
accuracy = np.mean(y_pred == y)
print(f"Final Accuracy: {accuracy:.4f}")

[[-0.12366418  0.19790469 -0.70221201 ... -0.18182716 -0.61292736
  -0.34839971]
 [-0.11895594  0.19790469  0.27725185 ... -0.18182716 -0.28510482
  -0.34839971]
 [-0.09883306 -0.51164337 -0.70221201 ... -0.18182716 -0.61292736
  -0.34839971]
 ...
 [-0.30297227  0.19790469  2.23617957 ...  1.86073779  2.33747554
   0.22916583]
 [-0.2890233  -0.15686934  1.58320366 ...  2.67776377  1.02618536
  -0.34839971]
 [-0.2890233  -0.15686934  1.58320366 ...  2.67776377  0.37054027
  -0.34839971]]
Epoch 0: Loss = 0.6931
Epoch 100: Loss = 0.0987
Epoch 200: Loss = 0.0868
Epoch 300: Loss = 0.0829
Epoch 400: Loss = 0.0809
Epoch 500: Loss = 0.0796
Epoch 600: Loss = 0.0788
Epoch 700: Loss = 0.0781
Epoch 800: Loss = 0.0777
Epoch 900: Loss = 0.0773
Final Accuracy: 0.9707
