## **``Dimensionality Reduction Techniques using Breast Cancer Dataset``**

In [1]:
import os
import sys
import logging

# Route this notebook's log records to SA1_PCA.log; filemode='w' truncates the
# file when the handler is created.
logging.basicConfig(filename="SA1_PCA.log",
                    filemode='w',
                    level=logging.INFO,
                    format="%(asctime)s : %(levelname)s : %(message)s")

# Import the third-party packages used by the notebook. Failures are logged
# rather than re-raised, so check SA1_PCA.log if a later cell hits a NameError.
try:
    logging.info("#### Packages import ####")
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import sklearn
    from sklearn import datasets
except ImportError as ie:
    # Expected failure: a package is missing. `ie.name` can be None, so fall
    # back to the exception text instead of concatenating None to a string.
    missing_package = ie.name if ie.name is not None else str(ie)
    logging.error("%s :: Missing Package --> %s", ie.__class__.__name__, missing_package)
except Exception as exception:
    # Unexpected failure raised while a package was being imported.
    logging.info("#### Exceptions other than ModuleImportError ####")
    # logging.log() needs an explicit level and generic exceptions have no
    # `.name` attribute; logging.exception() logs at ERROR level and appends
    # the traceback.
    logging.exception("%s :: %s", exception.__class__.__name__, exception)

%matplotlib inline

In [2]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn.
breast_cancer = datasets.load_breast_cancer()

In [3]:
# Print the description text that ships with the dataset.
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
# Feature matrix with named columns, plus the class target appended as 'Label'.
cancer_df = (
    pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
    .assign(Label=breast_cancer.target)
)

cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### **``1. SVD (Singular Value Decomposition)``**

### **``Using Numpy``**

In [9]:
# Small 4x2 example matrix used to walk through the SVD step by step.
X = pd.DataFrame([[9, 3],
                  [4, 2],
                  [7, 1],
                  [4, 2]],
                 columns=['col1', 'col2'])

In [10]:
# Display the example matrix.
X

Unnamed: 0,col1,col2
0,9,3
1,4,2
2,7,1
3,4,2


In [105]:
# Check the dimensions: 4 rows x 2 columns, i.e. a 2-D matrix.
X.shape, X.ndim

((4, 2), 2)

In [79]:
from numpy.linalg import svd

In [80]:
# Full SVD of X. With full_matrices=True, U is square (4x4) and VT is 2x2;
# S holds only the singular values, not the full Sigma matrix.
U,S,VT = svd(X,full_matrices=True,compute_uv=True,hermitian=False)

In [81]:
# Show U (4x4) as a table.
pd.DataFrame(U)

Unnamed: 0,0,1,2,3
0,-0.711633,-0.113179,-0.642945,-0.259597
1,-0.331229,-0.466058,0.650505,-0.49992
2,-0.523597,0.743485,0.385767,0.155758
3,-0.331229,-0.466058,0.121029,0.811436


In [106]:
# Start Sigma as an all-zero matrix with the same shape as X.
Sigma = np.zeros(X.shape)
Sigma

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [107]:
# Put the singular values on the leading diagonal of Sigma. len(S) equals
# min(rows, cols), so this also works when X has more columns than rows
# (slicing by X.shape[1] would then not match the shape of np.diag(S)).
n_singular = len(S)
Sigma[:n_singular, :n_singular] = np.diag(S)
Sigma

array([[13.32885697,  0.        ],
       [ 0.        ,  1.53021959],
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ]])

In [83]:
# Show VT (2x2) as a table.
pd.DataFrame(VT)

Unnamed: 0,0,1
0,-0.954298,-0.298856
1,0.298856,-0.954298


#### **Way-1 : Using Dot Product**

In [103]:
# Rebuild X as U . (Sigma . VT); the result should match the original matrix.
np.dot(U,np.dot(Sigma,VT))

array([[9., 3.],
       [4., 2.],
       [7., 1.],
       [4., 2.]])

#### **Way-2 : Using Matrix Multiplication**

In [104]:
# Same reconstruction with the @ operator, evaluated left to right: (U @ Sigma) @ VT.
U @ Sigma @ VT

array([[9., 3.],
       [4., 2.],
       [7., 1.],
       [4., 2.]])

### **``Using Sklearn : Truncated SVD``**

In [113]:
from sklearn.decomposition import TruncatedSVD

In [114]:
# Keep the top 2 components. The default solver is randomized, so fix the seed
# to make the fitted values reproducible across Restart & Run All.
tsvd = TruncatedSVD(n_components=2, random_state=42)

In [115]:
# 4x3 example matrix for the truncated SVD (X plus a third column).
X2 = pd.DataFrame([[9, 3, 5],
                   [4, 2, 6],
                   [7, 1, 7],
                   [4, 2, 1]],
                  columns=['col1', 'col2', 'col3'])

In [116]:
# Fit the truncated SVD on X2 and project X2 onto the retained components.
X_transf = tsvd.fit_transform(X2)

In [118]:
# Variance of the projected data along each retained component.
tsvd.explained_variance_

array([6.42468871, 3.2616684 ])

In [119]:
# Share of the total variance captured by each component (about 95% combined here).
tsvd.explained_variance_ratio_

array([0.63064429, 0.32016377])

In [120]:
# Number of components requested when the estimator was created.
tsvd.n_components

2

In [121]:
# Component vectors: one row per component, one column per input feature (2x3 here).
tsvd.components_

array([[ 0.75687368,  0.23209938,  0.61095999],
       [ 0.53103415,  0.32653333, -0.7819071 ]])

In [122]:
# Singular values corresponding to the retained components.
tsvd.singular_values_

array([16.60761964,  3.62068869])

In [124]:
# X2 expressed in the reduced 2-component space (4 rows x 2 columns).
X_transf

array([[10.56296122,  1.84937181],
       [ 7.15745341, -1.91423937],
       [ 9.80693507, -1.42957735],
       [ 4.10265348,  1.99529615]])

In [125]:
# Original 4x3 input, shown for comparison with the reduced representation.
X2

Unnamed: 0,col1,col2,col3
0,9,3,5
1,4,2,6
2,7,1,7
3,4,2,1


### **Reference Links**

https://machinelearningmastery.com/singular-value-decomposition-for-machine-learning/

https://www.analyticsvidhya.com/blog/2019/08/5-applications-singular-value-decomposition-svd-data-science/

https://www.youtube.com/watch?v=46Hpy4FiGls&list=PLMrJAkhIeNNSVjnsviglFoY2nXildDCcv&index=10