In [1]:
# Colab-only: mount Google Drive into the notebook's filesystem at /content/drive.
# Re-running this cell is harmless: when the Drive is already mounted, Colab only
# prints a notice (pass force_remount=True to drive.mount to force a remount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Logistic Regression

## Importing the libraries

In [2]:
import pandas as pd

## Importing the dataset

In [3]:
# Load the breast-cancer CSV into a DataFrame.
# NOTE(review): replace the path below with the location of your own copy of the
# file. As written it points at the filesystem root, not at the Google Drive
# mounted at /content/drive in the first cell.
dataset = pd.read_csv('/breast_cancer.csv')

### Create two separate sets

In [4]:
# Feature matrix (independent variables): all rows, every column except the
# first and the last one, converted to a NumPy array.
X = dataset.iloc[:, 1:-1].to_numpy()

# Report the dimensions, leave one blank line, then preview the first five rows.
print(f'X has {X.shape[0]} rows and {X.shape[1]} columns.', end='\n\n')
print(X[:5])

X has 683 rows and 9 columns.

[[ 5  1  1  1  2  1  3  1  1]
 [ 5  4  4  5  7 10  3  2  1]
 [ 3  1  1  1  2  2  3  1  1]
 [ 6  8  8  1  3  4  3  7  1]
 [ 4  1  1  3  2  1  3  1  1]]


In [5]:
# Target vector (dependent variable): the last column of the dataset,
# converted to a one-dimensional NumPy array.
y = dataset.iloc[:, -1].to_numpy()

# Report the length and preview the first five labels, one item per line.
print(f'y has {y.shape[0]} elements.', y[:5], sep='\n')

y has 683 elements.
[2 2 2 2 2]


## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples as the test set; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the size of every partition (features first, then labels).
print(f'X_train has {X_train.shape[0]} rows and {X_train.shape[1]} columns.')
print(f'X_test has {X_test.shape[0]} rows and {X_test.shape[1]} columns.', end='\n\n')
print(f'y_train has {y_train.shape[0]} elements.')
print(f'y_test has {y_test.shape[0]} elements.')

X_train has 546 rows and 9 columns.
X_test has 137 rows and 9 columns.

y_train has 546 elements.
y_test has 137 elements.


## Training the Logistic Regression model on the Training set

In [7]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Last expression of the cell: displays the fitted estimator.
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Predicting the Test set results

In [8]:
# Predict a class label for every sample of the held-out test set.
y_pred = classifier.predict(X_test)

# Report how many predictions were made and preview the first five.
print(f'y_pred has {y_pred.shape[0]} elements.', y_pred[:5], sep='\n')

y_pred has 137 elements.
[2 2 4 4 2]


## Making the Confusion Matrix

In [9]:
from sklearn.metrics import confusion_matrix

# Compare the ground-truth labels (y_test) with the model's predictions (y_pred).
# Per the scikit-learn documentation, entry [i, j] counts the samples whose true
# class is i and whose predicted class is j (rows = actual, columns = predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[84  3]
 [ 3 47]]


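Rows of the matrix are the actual classes and columns are the predicted classes; malignant is treated as the positive class.

- 84 correct predictions that a tumor is benign (true negatives).
- 3 incorrect predictions that a tumor is malignant when it is actually benign (false positives).

---
- 3 incorrect predictions that a tumor is benign when it is actually malignant (false negatives).
- 47 correct predictions that a tumor is malignant (true positives).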


## Computing the accuracy with k-Fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split only: the result holds one
# score per fold (the classifier's default score, i.e. accuracy).
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Summarise the fold scores as percentages: mean and spread.
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 96.70 %
Standard Deviation: 1.97 %


So we can conclude that:
- Out of 100 observations, about 97 are expected to be classified correctly.
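- The 10 accuracies in our 10-fold Cross Validation have a mean of 96.70 % and a standard deviation of 1.97 %, so they typically lie within about one standard deviation of the mean, i.e. roughly between 94.73 % (96.70 - 1.97) and 98.67 % (96.70 + 1.97); individual folds can still fall outside this range.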