In [145]:
## Due to Coursera's plagriasm policy, I have used a different dataset/method for this demo

In [72]:
# Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

In [59]:
# Use the ggplot style
plt.style.use("ggplot")

In [60]:
# load the data
data = load_breast_cancer()

In [None]:
# Create a DataFrame from the data
df = pd.DataFrame(data.data, columns=data.feature_names)

In [96]:
df["target"] = data.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [97]:
# view the first five rows of the data
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [98]:
# view the dims of the data
print(df.shape)
# There are 569 rows and 31 columns in the dataFrame; 30 columns for the data, and one column for the target

(569, 31)


In [99]:
# check the column names
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [100]:
# drop all the missing data
df.dropna(how="all", inplace=True)
# check the data again to see if any rows have been dropped
df.shape
# Apparently there's no empty row in the data

(569, 31)

In [101]:
# Get more information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [102]:
# # Convert the DataFrame to a NumPy array first to avoid errors
scaler = StandardScaler()
x = df.iloc[:, 0:31].values
scaler.fit(x)
x_scaled = scaler.transform(x)

In [104]:
def eig_val_vecs(S):
    # Compute the eigenvalues and eigenvectors
    eigvals, eigvecs = np.linalg.eig(S)
#     sort in descending order
    sort_indices = np.argsort(eigvals)[::-1]
    return eigvals[sort_indices], eigvecs[:, sort_indices]

In [105]:
# compute the covariance matrix
cov_matrix = np.cov(x_scaled.T)
cov_matrix

array([[ 1.00176056e+00,  3.24351929e-01,  9.99612069e-01,
         9.89095475e-01,  1.70881506e-01,  5.07014640e-01,
         6.77955036e-01,  8.23976636e-01,  1.48001350e-01,
        -3.12179472e-01,  6.80285970e-01, -9.74887767e-02,
         6.75358538e-01,  7.37159198e-01, -2.22992026e-01,
         2.06362656e-01,  1.94545531e-01,  3.76831225e-01,
        -1.04504545e-01, -4.27163418e-02,  9.71245907e-01,
         2.97530545e-01,  9.66835698e-01,  9.42739295e-01,
         1.19826732e-01,  4.14190751e-01,  5.27839123e-01,
         7.45524434e-01,  1.64241985e-01,  7.07832563e-03,
        -7.31313773e-01],
       [ 3.24351929e-01,  1.00176056e+00,  3.30113223e-01,
         3.21650988e-01, -2.34296930e-02,  2.37118951e-01,
         3.02950254e-01,  2.93980713e-01,  7.15266864e-02,
        -7.65717560e-02,  2.76354360e-01,  3.87037830e-01,
         2.82169018e-01,  2.60302460e-01,  6.62542133e-03,
         1.92312595e-01,  1.43545353e-01,  1.64139495e-01,
         9.14323671e-03,  5.45

In [106]:
# compute the eigen values and eigen vectors
eig_vals, eig_vecs = eig_val_vecs(cov_matrix)

# confirm they are being sorted by the magnitude of their eigen values
eig_vals

array([1.39422741e+01, 5.73643378e+00, 2.85186154e+00, 2.00188139e+00,
       1.66199949e+00, 1.20963379e+00, 6.85924646e-01, 4.85709057e-01,
       4.17628800e-01, 3.51511408e-01, 3.05392113e-01, 2.91498144e-01,
       2.61469947e-01, 2.12649275e-01, 1.48837762e-01, 8.83177777e-02,
       8.00007715e-02, 5.91392660e-02, 5.15525101e-02, 4.83008359e-02,
       3.07603392e-02, 2.95786345e-02, 2.73928094e-02, 2.43263453e-02,
       1.75831161e-02, 1.55005796e-02, 8.07032705e-03, 6.88182652e-03,
       1.59126388e-03, 7.42984526e-04, 1.32868212e-04])

In [109]:
# Find the explained variance and select the eigen vectors that explain up to 95% of the data
sum_var = sum(eig_vals)
# print(sum_var)
exp_var = [(i/sum_var)*100 for i in eig_vals]
exp_var

cum_var =  np.cumsum(exp_var)

cum_var
# We see that the first 11 principal components accounts for over 95.48% of the variance in the data

array([ 44.89603531,  63.36813909,  72.55152452,  78.99785725,
        84.34972363,  88.2449104 ,  90.45368186,  92.01773166,
        93.3625537 ,  94.49446866,  95.47787316,  96.41653717,
        97.25850628,  97.94326612,  98.42254413,  98.70693949,
        98.96455295,  99.1549895 ,  99.32099566,  99.47653097,
        99.57558349,  99.67083075,  99.75903935,  99.83737352,
        99.89399357,  99.94390756,  99.96989512,  99.99205555,
        99.99717963,  99.99957215, 100.        ])

In [110]:
# This means that instead of working with data that has 30 dims, 
# we can work with data that has 11 dims with 95.48% of the variance in the dataset accounted for

In [147]:
# For ease, choose the dot product as the inner product
top_eig_vecs = eig_vecs[:11]  # list of eigenvectors

# # Construct the projection matrix
proj_matrix = np.column_stack(top_eig_vecs).T
proj_matrix

array([[-2.16399142e-01, -2.26544730e-01,  2.52701218e-03,
        -5.04480406e-02,  4.17514426e-02, -1.92387685e-02,
         1.13375192e-01,  2.64950105e-02, -2.23138417e-01,
         9.94528633e-02,  3.53081074e-02,  6.58312316e-02,
        -4.69301068e-02,  3.17640848e-02, -4.91407421e-02,
        -6.02915616e-02, -1.49679942e-01,  1.97432246e-01,
        -2.37827952e-01,  1.32340264e-01, -1.57824556e-02,
        -7.71942331e-02, -7.12837755e-02, -9.46712145e-02,
        -1.84560235e-01,  2.42793473e-02, -1.30571296e-01,
         1.41750577e-01, -2.12824112e-01,  2.06961991e-01,
         7.02475498e-01],
       [-1.03599359e-01, -5.82619564e-02,  5.47507650e-02,
         5.99536672e-01, -2.04475783e-02,  3.00177272e-02,
        -2.75020212e-02, -1.00374070e-01,  1.12613548e-01,
         2.53797730e-01,  3.12407489e-01, -1.64216449e-01,
        -2.36735325e-01, -1.58721096e-01, -4.39136893e-03,
        -1.07723525e-01, -1.56678466e-01, -3.93985464e-02,
         3.96331930e-02,  6.17

In [152]:
reduced_dim_data = np.dot(proj_matrix,x.T)
reduced_dim_data

array([[-258.06898979, -271.56113471, -241.71391204, ..., -169.97104807,
        -261.14602488,  -47.18572944],
       [ 553.74860359,  766.80823227,  691.26923902, ...,  497.76844143,
         729.15903964,  104.30022612],
       [-146.32294073, -161.53847651, -146.20772912, ..., -106.47821442,
        -158.81929852,  -33.00494   ],
       ...,
       [-182.7270947 , -190.19568156, -171.78268438, ..., -114.01808862,
        -178.82701744,  -24.50011774],
       [-284.19632942, -270.72589119, -247.25903466, ..., -169.30275741,
        -265.92567406,  -47.03505522],
       [-557.97273307, -572.24145225, -499.03318363, ..., -325.50977492,
        -526.46722001,  -67.0606255 ]])

In [149]:
reduced_dim_data.shape

(11, 569)