<div style="font-size: 14pt;">Prof. Krzysztof Rybinski</div><br/><br/>
<div style="font-size: 22pt;"><b>Applied Machine Learning course</b></div><br/><br/>
<div style="font-size: 18pt;">LAB 7</div><br/>
<div style="font-size: 18pt;">- Principal Component Analysis (PCA)</div><br/>


In [None]:
#load necessary packages
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.graphics.api as smg
from bioinfokit.visuz import cluster

In [None]:
#check dataset documentation https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html

'''
The tagged data set is from the "Breast Cancer Wisconsin (Diagnostic) Database" freely available in python's sklearn library, 
for details see:
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

    Number of Samples: 569
    Number of Features: 30 numeric, predictive attributes
    Number of Classes: 2

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. 
They describe characteristics of the cell nuclei present in the image. 
Ten real-valued features are computed for each cell nucleus. 
The mean, standard error and 'worst' or largest (mean of the three largest values) of these features were computed 
for each image, resulting in 30 features. For instance, the radius measurements are for the 'mean radius', 
'standard error of the radius', and 'worst radius'. All feature values are recorded with four significant digits.

The two target classes are encoded as in sklearn: 0 - Malignant (212 samples) and 1 - Benign (357 samples).
'''

In [None]:
# Load the breast cancer dataset that ships with scikit-learn.
breast = load_breast_cancer()
# Feature matrix; the last expression displays its shape.
breast_data = breast.data
breast_data.shape

In [None]:
# Target vector (one class label per sample); display its shape.
breast_labels = breast.target
breast_labels.shape

In [None]:
# Display the raw class labels.
breast_labels

In [None]:
# Names of the feature columns, reused later as DataFrame column names and index labels.
features = breast.feature_names
features

In [None]:
# Wrap the feature matrix in a DataFrame with named columns
breast_dataset = pd.DataFrame(breast_data, columns=features)
# Show every column when displaying frames (no truncation of wide tables)
pd.set_option('display.max_columns', None)
breast_dataset.head()

In [None]:
# Standardize every feature (fit the scaler on the full dataset, then apply it)
x = StandardScaler().fit(breast_dataset).transform(breast_dataset)
x.shape

In [None]:
# Sanity check of the standardization: overall mean and standard deviation of x
x.mean(), x.std()

In [None]:
# PCA: project the standardized features onto the first two principal components
pca_breast = PCA(n_components=2)
principal_components_breast = pca_breast.fit_transform(x)
principal_breast_df = pd.DataFrame(
    principal_components_breast,
    columns=['principal_component_1', 'principal_component_2'],
)

In [None]:
# Peek at the last rows of the principal-component scores.
principal_breast_df.tail()

In [None]:
# Share of total variance captured by each fitted component.
pca_breast.explained_variance_ratio_

In [None]:
#visualize principal components: scatter of PC1 vs PC2, coloured by target class
# A single figure is created here; an additional bare plt.figure() call would
# render an empty extra figure in the notebook output.
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis of Breast Cancer Dataset",fontsize=20)
targets = [0,1]
colors = ['r', 'g']
for target, color in zip(targets,colors):
    # boolean mask selecting the observations that belong to this class
    class_mask = breast_labels == target
    # label each series explicitly so the legend cannot get out of sync with
    # the plotting order and shows the class name next to its numeric code
    plt.scatter(principal_breast_df.loc[class_mask, 'principal_component_1'],
                principal_breast_df.loc[class_mask, 'principal_component_2'],
                c = color, s = 50,
                label = f"{target} - {breast.target_names[target]}")
plt.legend(prop={'size': 15})
plt.show()

In [None]:
# Scree (elbow) plot: variance ratio explained by each component, used to
# choose how many principal components to keep
n_components = 5
pca_breast = PCA(n_components=n_components)
principalComponents_breast = pca_breast.fit_transform(x)

# component numbers 1..k for the x-axis
PC_values = np.arange(1, pca_breast.n_components_ + 1)
plt.plot(PC_values, pca_breast.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.gca().set(title='Scree Plot',
              xlabel='Principal Component',
              ylabel='Variance Explained')
plt.show()

In [None]:
# Cumulative variance explained by the first k components
# (reuses pca_breast and PC_values from the scree-plot cell)
cum_pca_ev = pca_breast.explained_variance_ratio_.cumsum()
plt.plot(PC_values, cum_pca_ev, 'o-', linewidth=2, color='blue')
plt.gca().set(title='Cummulative Scree Plot',
              xlabel='Principal Component',
              ylabel='Cummulative Variance Explained')
plt.show()

In [None]:
# Biplot preparation: relationship between the PCs and the initial features.
# `components_` holds the PCA eigenvectors, one row per component; they are
# unit-length, so the squared weights within each PC sum to 1.
# NOTE: these weights are the raw eigenvector elements. To read them as
# correlations between original variables and components they would have to be
# scaled by the square root of the corresponding explained variance.
pca_breast = PCA()  # no n_components given: keep every component
principalComponents_breast = pca_breast.fit_transform(x)
loadings = pca_breast.components_
# take the component count from the fitted model instead of hard-coding 30,
# so later cells that build PC labels always match the loadings matrix
n_components = pca_breast.n_components_
loadings.shape

In [None]:
# Table of loadings: one column per principal component, one row per feature
pc_list = [f"PC{i}" for i in range(1, n_components + 1)]
loadings_df = pd.DataFrame(dict(zip(pc_list, loadings)), index=features)
loadings_df

In [None]:
# Feature names double as the arrow labels passed to the biplot below.
labels = features
labels

In [None]:
# 2D biplot of PC scores and feature loadings (bioinfokit saves the figure in
# the current folder); var1/var2 are the percentages of variance explained by
# PC1 and PC2.
# documentation: https://reneshbedre.github.io/blog/howtoinstall.html
cluster.biplot(
    cscore=principalComponents_breast,
    loadings=loadings,
    labels=labels,
    var1=round(pca_breast.explained_variance_ratio_[0] * 100, 2),
    var2=round(pca_breast.explained_variance_ratio_[1] * 100, 2),
    axlabelfontsize=10,
)

# Compare model performance with and without PCA processing

In [None]:
# Reminder of the raw (unscaled) feature table used for the model comparison.
breast_dataset.head()

In [None]:
# Reminder of the class labels used as the prediction target.
breast_labels

In [None]:
# Pairwise feature correlations. Removing multicollinearity by hand would mean
# dropping many correlated features and losing information.
corr = breast_dataset.corr()
smg.plot_corr(corr, xnames=corr.columns.to_numpy(), normcolor=True)
plt.show()

In [None]:
# Stratified train/test split: 20% held out, fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    breast_dataset,
    breast_labels,
    test_size=0.2,
    random_state=4,
    stratify=breast_labels,
)

In [None]:
#check the proportion of 1s in the train and test labels
#(the split above is stratified on the labels, so the two should be close)
y_train.mean(), y_test.mean()

In [None]:
# Per-feature means of the unscaled training set.
x_train.mean()

# Logistic regression

In [None]:
# Baseline: logistic regression on all raw features
# (multicollinearity is deliberately ignored here)
model = LogisticRegression(solver='liblinear', random_state=100).fit(x_train, y_train)
model.coef_, model.intercept_

In [None]:
# Confusion matrix on the test set, expressed as shares of all test observations
y_pred = model.predict(x_test)
cfm = pd.crosstab(y_test, y_pred,
                  rownames=['Actual'], colnames=['Predicted'],
                  normalize='all')
cfm

In [None]:
# Accuracy = sum of the diagonal of the normalized confusion matrix.
# Cells are matched by class label rather than by position: if the model never
# predicts one class the matrix is not 2x2, and positional indexing would pick
# an off-diagonal cell or raise IndexError.
accuracy = sum(cfm.loc[label, label] for label in cfm.index.intersection(cfm.columns))
accuracy

# PCA + logistic regression

In [None]:
# Share of 1s in the train and test labels, shown again before the PCA pipeline.
y_train.mean(), y_test.mean()

In [None]:
# Standardize the training features; the scaler statistics come from the
# training set only.
x_train_ss = StandardScaler().fit_transform(x_train)
# display the shape of the array just created (not the full-dataset `x`)
x_train_ss.shape

In [None]:
# Standardize the test features with the mean/std learned from the TRAINING
# set. Fitting a separate scaler on the test set would leak test statistics
# and put train and test on different scales.
x_test_ss = StandardScaler().fit(x_train).transform(x_test)
x_train.shape, x_test.shape

In [None]:
# Learn the principal axes on the training set only, then project the test set
# onto those same axes. Re-fitting PCA on the test set would produce different
# components (possibly flipped or rotated), so a model trained on the training
# scores would be evaluated on incompatible inputs.
pca_breast = PCA(n_components=2)
pca_train = pca_breast.fit_transform(x_train_ss)
pca_test = pca_breast.transform(x_test_ss)
pca_train.shape, pca_test.shape

In [None]:
# Logistic regression on the two principal components, followed by the
# confusion matrix on the test set as shares of all test observations
model = LogisticRegression(solver='liblinear', random_state=100).fit(pca_train, y_train)
y_pred = model.predict(pca_test)
cfm = pd.crosstab(y_test, y_pred,
                  rownames=['Actual'], colnames=['Predicted'],
                  normalize='all')
cfm

In [None]:
# Accuracy = sum of the diagonal of the normalized confusion matrix.
# Cells are matched by class label rather than by position: if the model never
# predicts one class the matrix is not 2x2, and positional indexing would pick
# an off-diagonal cell or raise IndexError.
accuracy = sum(cfm.loc[label, label] for label in cfm.index.intersection(cfm.columns))
accuracy