# Breast cancer prediction model

A simple predictive model built with gradient boosting (scikit-learn's `GradientBoostingClassifier`, not the `xgboost` library) on the breast cancer dataset bundled with `scikit-learn`.

In [None]:
# loads the needed libraries 

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Load the required data set

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset bundled with scikit-learn. Later cells read
# `cancer.data`, `cancer.target` and `cancer.DESCR` from this object.
cancer = load_breast_cancer()

In [None]:
# Print the description text that ships with the dataset object.
print(cancer.DESCR)

In [None]:
# Hold out part of the data for evaluation. The split proportions are left at
# train_test_split's defaults; random_state=0 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [None]:
# Build the gradient-boosting classifier with a fixed seed and fit it on the
# training split in one expression (fit() hands back the fitted estimator).
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy on the data the model was trained on ...
print(
    f"Accuracy on training set: {gbrt.score(X_train, y_train)}"
)
# ... and on the held-out data it has not seen.
print(
    f"Accuracy on test set: {gbrt.score(X_test, y_test)}"
)

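It is always a good idea to evaluate a model with some metrics. The ones imported below (R², mean absolute error, explained variance) are regression metrics, applied here to the predicted class labels purely as an example.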

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error

In [None]:
# Imported inside this cell so it runs on its own; both functions come from
# scikit-learn, which the notebook already depends on.
from sklearn.metrics import classification_report, roc_auc_score

# Hard class predictions (0/1 labels) for the held-out split.
predictions = gbrt.predict(X_test)

# Regression-style scores, kept from the original walkthrough as an example.
# They are computed on 0/1 class labels, so the MAE printed here is simply the
# fraction of misclassified test samples.
print(f'R^2 score: {r2_score(y_true=y_test, y_pred=predictions):.2f}')
print(f'MAE score: {mean_absolute_error(y_true=y_test, y_pred=predictions):.2f}')
print(f'EVS score: {explained_variance_score(y_true=y_test, y_pred=predictions):.2f}')

# Metrics designed for classifiers: per-class precision, recall and F1.
print(classification_report(y_test, predictions, target_names=cancer.target_names))

# ROC AUC needs a continuous score rather than hard labels, so use the
# predicted probability of the class encoded as 1.
positive_class_scores = gbrt.predict_proba(X_test)[:, 1]
print(f'ROC AUC score: {roc_auc_score(y_test, positive_class_scores):.2f}')

Now we are going to serialize our trained model so that it can be served through an API.

In [None]:
from pathlib import Path
import pickle

# Destination of the serialized model: webapp/model/ under the parent of the
# current working directory.
target_path = Path.cwd().parent.joinpath('webapp/model/cancer_model_xgboost.pkl')

# Create the destination folder if it is missing (e.g. on a fresh checkout);
# without it, opening the file below raises FileNotFoundError.
target_path.parent.mkdir(parents=True, exist_ok=True)

# Write the fitted estimator to disk in binary mode.
with target_path.open('wb') as pickle_target:
    pickle.dump(gbrt, pickle_target)

✨ Ta da! You now have a simple model that can be used in the rest of the tutorial