In [1]:
# Dataset link => https://datahub.io/machine-learning/breast-w#resource-breast-w_zip
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # To split the dataset
from matplotlib.colors import ListedColormap # Colormap object generated from a list of colors
#from sklearn.preprocessing import Imputer # Imputation transformer for completing missing values
from sklearn.impute import SimpleImputer

In [2]:
# Load the breast-w dataset into a DataFrame. The path is relative, so the CSV
# must be present in the notebook's working directory (download link: see the
# comment at the top of the first cell).
data = pd.read_csv('breast-w_csv.csv')

In [3]:
# Separate the dependent variable from the independent variables.
# X:      positional slice -10:-1, i.e. up to nine columns ending just before the
#         last one. This relies on the frame having exactly ten columns with
#         'Class' last (the column listing printed by the next cell shows that layout).
# target: raw 'Class' labels; this name is re-assigned later in the notebook
#         with the label-encoded version.
X = data.iloc[:,-10:-1].values
target = data['Class'].values

In [4]:
# For every column, report whether it contains at least one missing value.
# The resulting boolean Series is the cell's displayed output.
data.isna().any()

Clump_Thickness          False
Cell_Size_Uniformity     False
Cell_Shape_Uniformity    False
Marginal_Adhesion        False
Single_Epi_Cell_Size     False
Bare_Nuclei               True
Bland_Chromatin          False
Normal_Nucleoli          False
Mitoses                  False
Class                    False
dtype: bool

In [5]:
# Mean-impute the only feature that contains NaNs: column 5 of X (Bare_Nuclei).
# The 5:6 slice (rather than plain index 5) keeps the column two-dimensional.
# NOTE(review): the mean is learned from every row of X, i.e. before the
# train/test split, so rows that later end up in the test set contribute to the
# fill value. Consider fitting on the training rows only.
imput = SimpleImputer(missing_values=np.nan, strategy='mean')
imput.fit(X[:, 5:6])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [6]:
# Impute all the missing values: overwrite column 5 of X in place with the
# imputer's output, so any NaN in that column is replaced by the fitted fill value.
X[:, 5:6] = imput.transform(X[:,5:6])

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Class' labels as integers. This replaces the raw labels that were
# stored in `target` earlier; `lb_make` keeps the fitted label <-> integer mapping.
lb_make = LabelEncoder()
target = lb_make.fit_transform(data['Class'].values)

In [8]:
# Hold out 30% of the rows for testing. Passing stratify=target asks for the
# class proportions to be preserved in both splits, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.3,
    stratify=target,
    random_state=42,
)

In [18]:
# we call LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [20]:
# Build the LDA transformer, requesting a single discriminant component.
# Rule noted by the author: number of components = n_classes - 1.
lda = LDA(n_components = 1)

In [21]:
# X_train and X_test start out with 9 feature dimensions.
# Fit LDA (a supervised reduction, hence the labels) on the training split only,
# then project both splits onto the fitted discriminant axis to reduce the
# dimensionality. Note that X_train / X_test are overwritten by their projections.
lda.fit(X_train, y_train)
X_train, X_test = lda.transform(X_train), lda.transform(X_test)

In [22]:
# Proportion of the variance explained by each of the retained LDA components
# (one entry per component), read from the fitted transformer.
explain_variance = lda.explained_variance_ratio_

In [23]:
# Show the ratios; only one component was requested, so a single value is printed.
print(explain_variance)

[1.]


In [24]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that uses the 3 nearest neighbours for each prediction.
knn_classifier = KNeighborsClassifier(n_neighbors=3)

In [25]:
# Fit the classifier to the training data (X_train as it stands at this point in
# the notebook). The returned estimator is the cell's displayed output.
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [27]:
# Mean accuracy on the *training* split, i.e. the same data the classifier was
# fitted on. This is an optimistic figure, not a held-out estimate; the held-out
# split is X_test / y_test.
print(knn_classifier.score(X_train, y_train))

0.9856850715746421


In [28]:
from sklearn.metrics import confusion_matrix

# Predict the held-out split and tabulate the outcome.
# In the printed matrix, rows are the true classes and columns the predicted ones.
y_pred = knn_classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[133   5]
 [  6  66]]
