## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

## Loading for Exploratory Data Analysis

In [2]:
data = pd.read_csv("Dataset/cleaned_breast_cancer_dataset.csv")
## 
df = data.copy()

## Exploratory Data Analysis

In [3]:
## target variable 
df.diagnosis_result.value_counts()

diagnosis_result
0    120
1     93
Name: count, dtype: int64

In [4]:
numeric_cols = df.select_dtypes(exclude=[object])

corr_matrix = numeric_cols.corr()

corr_matrix['diagnosis_result']

Unnamed: 0         -0.009545
year               -0.022224
age                 0.531123
menopause          -0.381529
tumor_size          0.688803
inv_nodes           0.771358
metastasis          0.740473
history             0.192847
diagnosis_result    1.000000
Name: diagnosis_result, dtype: float64

## Build a Validation Framework


In [5]:
## 
df_train_full , df_test = train_test_split(df, test_size=0.2, random_state=11) 
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


print(f'Training dataset: {len(df_train)}')
print(f'Validation dataset: {len(df_valid)}')
print(f'Test dataset: {len(df_test)}')

Training dataset: 127
Validation dataset: 43
Test dataset: 43


In [6]:
y_train = df_train['diagnosis_result'].values
y_valid = df_valid['diagnosis_result'].values
y_test = df_test['diagnosis_result'].values

In [7]:
del df_train['diagnosis_result']
del df_valid['diagnosis_result']
del df_test['diagnosis_result']

## Feature Engineering 
- Dividing our data into numerical and categorical
- perform the one-hot encoding

In [8]:
numerical_features = ['year','age','menopause','tumor_size','inv_nodes','metastasis','history']

categorical_features = ['breast','breast_quadrant']

In [9]:
## convert the dataframe into dict
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')

valid_dict = df_valid[categorical_features + numerical_features].to_dict(orient='records')

In [10]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [11]:
X_train = dv.transform(train_dict)

X_valid = dv.transform(valid_dict)

## Training The Model

In [12]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [13]:
y_valid_pred = model.predict_proba(X_valid)

In [14]:
y_valid_pred

array([[9.51074190e-03, 9.90489258e-01],
       [8.13147008e-01, 1.86852992e-01],
       [6.08220679e-02, 9.39177932e-01],
       [1.46229123e-01, 8.53770877e-01],
       [9.72776088e-01, 2.72239119e-02],
       [8.80035984e-01, 1.19964016e-01],
       [9.11277010e-01, 8.87229904e-02],
       [7.43061454e-01, 2.56938546e-01],
       [1.23412529e-02, 9.87658747e-01],
       [9.94639068e-01, 5.36093181e-03],
       [9.00026701e-03, 9.90999733e-01],
       [9.94050122e-01, 5.94987846e-03],
       [9.97991654e-01, 2.00834630e-03],
       [9.84734047e-01, 1.52659534e-02],
       [9.60066825e-02, 9.03993318e-01],
       [9.41219116e-01, 5.87808841e-02],
       [3.41305070e-04, 9.99658695e-01],
       [2.21240657e-03, 9.97787593e-01],
       [9.83932062e-01, 1.60679380e-02],
       [8.68623984e-01, 1.31376016e-01],
       [2.00355529e-02, 9.79964447e-01],
       [5.59374279e-01, 4.40625721e-01],
       [9.40408139e-01, 5.95918608e-02],
       [9.79458471e-03, 9.90205415e-01],
       [5.169574

In [15]:
y_valid_pred = model.predict_proba(X_valid)[:, 1]

In [16]:
diagnosis_result = y_valid_pred >= 0.5

In [17]:
(y_valid == diagnosis_result).mean()

0.8837209302325582

In [18]:
acc_score = accuracy_score(y_valid, diagnosis_result)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 88.4%


## Saving the model

In [19]:
## specifyging where to save the file
with open('breast-cancer-model.bin', 'wb') as f_out:
    ## save the model
    pickle.dump((dv,model), f_out)