# Logistic Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [20]:
dataset = pd.read_csv('breast-cancer-wisconsin.data')
dataset.replace('?', np.nan, inplace=True)
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [22]:
print(X, type(X))

[[5 4 4 ... 3 2 1]
 [3 1 1 ... 3 1 1]
 [6 8 8 ... 3 7 1]
 ...
 [5 10 10 ... 8 10 2]
 [4 8 6 ... 10 6 1]
 [4 8 8 ... 10 4 1]] <class 'numpy.ndarray'>


In [16]:
y

array([2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4, 2,
       4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4,
       2, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2,
       4, 4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2,
       4, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 4,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2, 2,
       4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2,
       4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2,
       2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2, 2,
       4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2,
       2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4, 4,
       2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4, 4,
       2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 4, 2,

Taking care of missing data


In [23]:
print(type(X), type(y), type(dataset), X.dtype, y.dtype)
#isnan(ndarray) fails on ndarray dtype of "object"
# Check for NaN values
nan_mask = pd.isna(X)

# Check if any NaN values exist
any_nan_values = np.any(nan_mask)

# Display the results
print("Original array:")
print(X)
print("\nNaN mask:")
print(nan_mask)
print("\nAny NaN values exist:", any_nan_values)


<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.frame.DataFrame'> object int64
Original array:
[[5 4 4 ... 3 2 1]
 [3 1 1 ... 3 1 1]
 [6 8 8 ... 3 7 1]
 ...
 [5 10 10 ... 8 10 2]
 [4 8 6 ... 10 6 1]
 [4 8 8 ... 10 4 1]]

NaN mask:
[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]

Any NaN values exist: True


In [24]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)

## Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## Training the Logistic Regression model on the Training set

In [27]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

## Predicting a new result

In [28]:
print(classifier.predict(sc.transform([[8, 8, 8, 8, 8, 8, 8, 8, 8]])))

[4]


## Predicting the Test set results

In [29]:
y_pred = classifier.predict(x_test)

## Making the Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[105   3]
 [  3  64]]
0.9657142857142857


Computing the accurracy with k-fold Cross Validation

In [34]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X = x_train, y = y_train, cv = 10)
print(f"Accuracy: {accuracies.mean(): 0.2f} %")
print(f"Standard deviation: {accuracies.std() * 100: 0.2f} %")

Accuracy:  0.97 %
Standard deviation:  1.16 %
