<a href="https://colab.research.google.com/github/RicardoLeonCo/myrepo/blob/main/Breast_Cancer_Detection_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Importing Libraries and Dataset.

from sklearn.datasets import load_breast_cancer #This is the dataset from which I am getting the information.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score #I assume this is how I'm going to check my score on the outcomes.

In [2]:
# Load the dataset once and unpack the pieces the later cells rely on.

# The trailing parentheses *call* load_breast_cancer; without them, `data`
# would be bound to the function object itself instead of the dataset it returns.
data = load_breast_cancer()

# The returned object supports attribute access as well as dict-style keys.
features = data.data                # measurements, one row per observation
feature_names = data.feature_names  # column names for `features`
labels = data.target                # integer class label per observation
label_names = data.target_names     # class names the integer labels refer to

In [4]:
# Exploratory data analysis: look at the raw pieces before building a model.

# The class names (the categories we want to predict).
print(label_names)

# Label of the first observation. Indexing starts at 0, not 1.
# The value printed is 0, and position 0 of label_names is 'malignant',
# so the first observation is a malignant case.
first_label = labels[0]
print("Class label :", first_label)

# The column names: one name per measurement.
print(feature_names)

# The measurements of the first observation, one value per column name,
# followed by a blank line to separate the output.
first_observation = features[0]
print(first_observation, "\n")

['malignant' 'benign']
Class label : 0
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01] 



In [5]:
# To build our model (a classifier) we will use the Naive Bayes algorithm
# to perform the breast cancer detection.

# Best practice is to evaluate a classifier on data it has not seen during
# training, so the dataset is divided in two parts: 80% for training and
# 20% (test_size=0.2) held back as a test set.

# random_state=42 fixes the seed of the shuffle performed before splitting,
# so every run of this cell yields the same train/test split. The number 42
# itself is arbitrary; any fixed integer makes the split reproducible.
train, test, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Machine learning offers many kinds of models. Here the classifier only has
# to say whether a tumour is benign or malignant. With just two categories
# this is a binary classification problem, and Naive Bayes is a simple
# algorithm that handles it well.

# GaussianNB stands for Gaussian Naive Bayes.
gnb = GaussianNB()

# Train the model: feed it the training measurements together with their labels.
gnb.fit(X=train, y=train_labels)

GaussianNB()

In [7]:
# Use the trained model to predict a label for every row of the test set.
# What for? The model never saw these rows during training, so comparing its
# predictions with the true labels (test_labels) shows how well it generalises.

# Pass the test subset of the measurements to the predict function.
preds = gnb.predict(X=test)

# The model's predictions. Following the order of label_names,
# 0 = malignant and 1 = benign.
print(preds, "\n")

[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
 1 1 0] 



In [8]:
# Assess the model by comparing two arrays: the true class labels of the
# test subset (test_labels) against the labels the model predicted (preds).

# accuracy_score, provided by scikit-learn, returns the fraction of
# predictions that match the true labels.
print(accuracy_score(y_true=test_labels, y_pred=preds))

# What conclusion can we draw from this accuracy rate?
# On this held-out test set the model predicted the correct class for about
# 97% of the observations. This is an estimate from a single train/test
# split, not a guarantee for every future case.

0.9736842105263158
