# Import needed libraries

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
import numpy as np

# Load sklearn breast cancer dataset

In [2]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Or

# Dataset = loader(return_X_y=True, as_frame=True)
# X=Dataset[0]
# y=Dataset[1]

# print(X)
# print(y)

**Note: When the return_X_y parameter of the dataset loader is set to True, a tuple is returned which can be unpacked into**
**the feature variables (X) and target variable (y)**
**Setting as_frame to True returns the feature variables as Pandas DataFrame**

In [3]:
dataset_as_dict = load_breast_cancer(return_X_y=False, as_frame=True)

# print(dataset_as_dict,"\n")
# print(dataset_as_dict.data)

**Note: When the return_X_y parameter of the dataset loader is set to False, a dictionary with the following keys: 'data', 'target',**
**'frame', 'target_names', 'DESCR', 'feature_names', 'filename' and 'data_module' is returned as attributes which can be used**
**to access the dictionary**
**Setting as_frame to True returns the feature variables as Pandas DataFrame**

# Data preprocessing

In [4]:
print(f"X has {X.shape[0]} rows and {X.shape[1]} columns. I.e., There are {X.shape[1]} feature variable of {X.shape[0]} rows.\n")

print(X.isnull().any(),"\n") # or X.isna().any() 
print("The description of the dataset is given below")
X.describe()

X has 569 rows and 30 columns. I.e., There are 30 feature variable of 569 rows.

mean radius                False
mean texture               False
mean perimeter             False
mean area                  False
mean smoothness            False
mean compactness           False
mean concavity             False
mean concave points        False
mean symmetry              False
mean fractal dimension     False
radius error               False
texture error              False
perimeter error            False
area error                 False
smoothness error           False
compactness error          False
concavity error            False
concave points error       False
symmetry error             False
fractal dimension error    False
worst radius               False
worst texture              False
worst perimeter            False
worst area                 False
worst smoothness           False
worst compactness          False
worst concavity            False
worst concave points       F

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


# Dataset splitting and model training

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=123,)

X_train, X_test, y_train, y_test = X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()

lr = LR(max_iter=10000, C=0.1, verbose=2, n_jobs=-1)
model=lr.fit(X_train, y_train)

model.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.3s finished


0.9766081871345029

# Model serialization (pickling)

In [6]:
# model serialization (pickling)
def pickle_model():
    import pickle
    import pathlib
    path=pathlib.Path("pickled_models/sklearn_breast_cancer_classifier.pkl")
    with open(path, mode="wb") as out_file:
        pickle.dump(model, out_file)

pickle_model()
# Note: the .pkl extension is note strictly necessary. However, it is considered good practice

# Model deserialization (unpickling)

In [7]:
# model deserialization (unpickling)
def unpickle_model():
    import pickle
    import pathlib
    path=pathlib.Path("pickled_models/sklearn_breast_cancer_classifier.pkl")
    with open(path, mode="rb") as in_file:
        unpickled_model = pickle.load(in_file)
    return unpickled_model
        
my_model = unpickle_model()
my_model

# Mapping the target (y) variable

In [8]:
def map_y():
    y = []
    for i in y_test:
        if i == 1:
            y.append("Malignant")
        else:
            y.append("Benign")
    return np.array(y)

y_test_label = map_y()

print(y_test_label) 

['Malignant' 'Malignant' 'Benign' 'Malignant' 'Benign' 'Malignant'
 'Malignant' 'Benign' 'Malignant' 'Malignant' 'Malignant' 'Benign'
 'Benign' 'Malignant' 'Benign' 'Malignant' 'Malignant' 'Malignant'
 'Malignant' 'Malignant' 'Benign' 'Benign' 'Malignant' 'Malignant'
 'Malignant' 'Benign' 'Benign' 'Malignant' 'Benign' 'Malignant' 'Benign'
 'Malignant' 'Malignant' 'Malignant' 'Benign' 'Malignant' 'Malignant'
 'Malignant' 'Malignant' 'Benign' 'Benign' 'Malignant' 'Benign'
 'Malignant' 'Benign' 'Malignant' 'Benign' 'Benign' 'Benign' 'Benign'
 'Benign' 'Benign' 'Malignant' 'Malignant' 'Malignant' 'Benign'
 'Malignant' 'Benign' 'Benign' 'Malignant' 'Benign' 'Malignant'
 'Malignant' 'Malignant' 'Malignant' 'Benign' 'Malignant' 'Malignant'
 'Malignant' 'Benign' 'Malignant' 'Malignant' 'Benign' 'Malignant'
 'Benign' 'Malignant' 'Malignant' 'Benign' 'Benign' 'Benign' 'Malignant'
 'Benign' 'Benign' 'Malignant' 'Malignant' 'Malignant' 'Benign'
 'Malignant' 'Benign' 'Malignant' 'Benign' 'Malignant

# Function to predict a class given the class index

In [9]:
print(f"Function to predict a Cancer class given the class index. Only integers between 1 and {len(X_test)} are allowed\n")

while True:
    try:
        num = int(input("Enter the Index of the class you want to predict: "))
        break
    except:
        print("\nPlease the input must be an integer!")

if num > len(X_test):
    print("\nThe index you entered is out of range. Click ctrl + ENTER to try again!")
else:
    num -= num # substract 1 to get the actual index since array index starts from 0
    def predict():
        reshaped_feat = np.array(X_test[num]).reshape(1, -1)
        prediction = my_model.predict(reshaped_feat)
        if prediction == 1:
            return "Malignant"
        else:
            return "Benign"

    def actual():
        actual = y_test_label[num]
        return actual

    predicted_class = predict()
    actual_class = actual()

    print(f"\nPredicted class: {predicted_class}\n")
    print(f"Actual class:    {actual_class}")

Function to predict a Cancer class given the class index. Only integers between 1 and 171 are allowed

Enter the Index of the class you want to predict: 78

Predicted class: Malignant

Actual class:    Malignant


# Function to predict the classes of cancer given the start and end indexes

In [10]:
print(f"Function to predict the classes of cancer given the start and end indexes. Only integers between 1 and {len(X_test)} are allowed\n")

while True:
    try:
        start = int(input("Start Index of the classes you want to predict: "))
        end = int(input("End Index of the classes you want to predict:   "))
        break
    except:
        print("\nPlease indicate the start and end indexes. Only integers are allowed\n")

if start > end:
    print("\nPlease the start index CANNOT be higher than the end index. Click ctrl + ENTER to start again!")   
else:
    start_index, end_index = start-1, end # substract 1 to get the actual index since array index starts from 0
    
    if start_index > len(X_test) or end_index > len(X_test):
        print("\nThe index you entered is out of range. Click ctrl + ENTER to try again!")
    else:
        def predict():
            prediction_list = []
            prediction = list(my_model.predict(X_test[start_index:end_index]))
            for i in prediction:
                if i == 1:
                    prediction_list.append("Malignant")
                else:
                    prediction_list.append("Benign")
            return prediction_list

        def actual():
            actual = list(y_test_label[start_index:end_index])
            return actual

        predicted_class = predict()
        actual_class = actual()

        print(f"\nPredicted classes: {predicted_class}")
        print(f"\nActual classes:    {actual_class}")

Function to predict the classes of cancer given the start and end indexes. Only integers between 1 and 171 are allowed

Start Index of the classes you want to predict: 8
End Index of the classes you want to predict:   8

Predicted classes: ['Malignant']

Actual classes:    ['Benign']
