In [551]:
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [552]:
data = pd.read_csv("breast_cancer.csv")
classes = {
    0 : "benign",
    1 : "malignant"
}

In [553]:
target = data["diagnosis"].replace(["M", "B"], [1, 0]) # Separate diagnosis (target labels) from the features, and replace them with their arithmetic equivalents (required for the algorithm itself).
data.drop("diagnosis", axis=1, inplace=True)
data.drop("ID", axis=1, inplace=True)

In [554]:
data.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr21,Attr22,Attr23,Attr24,Attr25,Attr26,Attr27,Attr28,Attr29,Attr30
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [555]:
target.head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis, dtype: int64

In [556]:
# 
print(f"""Before standandization:

{data.iloc[0:5,0:5]}""")

Before standandization:

   Attr1  Attr2   Attr3   Attr4    Attr5
0  17.99  10.38  122.80  1001.0  0.11840
1  20.57  17.77  132.90  1326.0  0.08474
2  19.69  21.25  130.00  1203.0  0.10960
3  11.42  20.38   77.58   386.1  0.14250
4  20.29  14.34  135.10  1297.0  0.10030


In [557]:
# Standardize the dataset. After the standardization, each column in the dataset will have a mean of 0 and a standard deviation of 1. This means that the data is distributed more equally in similar ranges. Some algorithms require this.
scaler = StandardScaler()
data = scaler.fit_transform(data) # Fit data (calculate mean and standard deviation of each column) with data and transform data at the same time

In [558]:
print(f"""After standardization:

{data[0:5,0:5]}""")

After standardization:

[[ 1.09706398 -2.07333501  1.26993369  0.9843749   1.56846633]
 [ 1.82982061 -0.35363241  1.68595471  1.90870825 -0.82696245]
 [ 1.57988811  0.45618695  1.56650313  1.55888363  0.94221044]
 [-0.76890929  0.25373211 -0.59268717 -0.76446379  3.28355348]
 [ 1.75029663 -1.15181643  1.77657315  1.82622928  0.28037183]]


In [559]:
log_reg = LogisticRegression(solver="liblinear")
log_reg.fit(data, target)

LogisticRegression(solver='liblinear')

In [560]:
weights = log_reg.coef_[0] # Model coefficients or weights learned from inputs
bias = log_reg.intercept_[0] # Intercept, also called the bias-term or offset, represents the value of around which the data has a tendency of centering values around. This bias-term is included in the equation to remove any offset, so it data starts at 0.
print(f"""{weights}
{bias}""")

[ 0.35372245  0.3850941   0.34237238  0.44138446  0.15523716 -0.5681635
  0.8685186   0.96811443 -0.07328189 -0.31122062  1.29527365 -0.26995006
  0.6662383   1.02954508  0.2812678  -0.74241788 -0.11352258  0.32006685
 -0.28982672 -0.67152689  1.0304876   1.3131883   0.82563973  1.02915516
  0.67185301 -0.04896119  0.87162239  0.91131563  0.8839543   0.48354624]
-0.17993455017700494


In [561]:
data_test = data[49:50,:]
prediction_test = log_reg.predict(data_test)
print(f"""Actual label of sample: {target[49]}, {classes[target[49]]}
Predicted label of sample: {prediction_test[0]}, {classes[prediction_test[0]]}
""")

data_test = data[99:100,:]
prediction_test = log_reg.predict(data_test)
print(f"""Actual label of sample: {target[99]}, {classes[target[99]]}
Predicted label of sample: {prediction_test[0]}, {classes[prediction_test[0]]}
""")

data_test = data[73:74,:]
prediction_test = log_reg.predict(data_test)
print(f"""Actual label of sample: {target[73]}, {classes[target[73]]}
Predicted label of sample: {prediction_test[0]}, {classes[prediction_test[0]]}
""")

#True negatives: correctly predicted negatives (actual y=0, predicted = 0)
#True positives: correctly predicted positives (actual y=1, predicted = 1)
#False negatives: incorrectly predicted negatives (actual y=1, predicted = 0)
#False positives: incorrectly predicted positives (actual y=0, predicted = 1)

Actual label of sample: 0, benign
Predicted label of sample: 0, benign

Actual label of sample: 1, malignant
Predicted label of sample: 1, malignant

Actual label of sample: 1, malignant
Predicted label of sample: 0, benign



In [562]:
score = log_reg.score(data, target) # Score, or accuracy of the classifier
print(f"{math.ceil(score * 100)}%")

99%
