# **Machine Learning in Python: Performing Principal Component Analysis (PCA)**

Abdullah Al Shamim

Github, https://github.com/PrinceShamim022/Fisheries_DataAnalysis_Projects/tree/main/Clam </i>

In this Jupyter notebook, we will be performing Principal Component Analysis (PCA) using the clam data set as an example.

---

*italicized text*## **1. Clam data set**

### Load library

In [91]:
import pandas as pd
import numpy as np

### Load dataset

In [92]:
!git clone https://github.com/PrinceShamim022/Fisheries_DataAnalysis_Projects.git

fatal: destination path 'Fisheries_DataAnalysis_Projects' already exists and is not an empty directory.


In [93]:
df = pd.read_csv('/content/Fisheries_DataAnalysis_Projects/Clam/clam.csv')

In [94]:
df.head

<bound method NDFrame.head of       SL..NO.     Month Color Deform  Trt     SL     SH     SW     LL     PL  \
0           1   January    A1    NaN    1  41.47  33.00  22.63   8.01  34.06   
1           2   January    A2    NaN    1  39.85  31.55  21.40   9.94  31.82   
2           3   January    A3    NaN    1  39.48  33.25  22.92   9.83  32.86   
3           4   January    A4    NaN    1  41.09  35.11  22.71  10.52  34.16   
4           5   January    A5    NaN    1  35.13  29.42  20.18   9.51  27.69   
...       ...       ...   ...    ...  ...    ...    ...    ...    ...    ...   
2288     2396  December    J1    NaN    6  32.78  25.62  16.03   6.57  23.86   
2289     2397  December    K1    NaN    7  27.92  21.76  14.90   6.15  20.10   
2290     2398  December    K2      D    7  31.44  26.66  16.67   7.43  23.89   
2291     2399  December    K3    NaN    7  31.15  23.43  13.99   5.95  21.66   
2292     2400  December    K4    NaN    7  30.50  24.70  14.01   5.75  20.05   

      ...

In [95]:
df.columns

Index(['SL..NO.', 'Month', 'Color', 'Deform', 'Trt', 'SL', 'SH', 'SW', 'LL',
       'PL', 'AL', 'UL', 'LCT', 'LPAS', 'PW', 'AW', 'PVM', 'PS',
       'CT..RIGHT.TEETH.', 'Total.wt', 'muscle.wt'],
      dtype='object')

In [96]:
features = ['SL', 'SH', 'SW', 'LL',
       'PL', 'AL', 'UL', 'LCT', 'LPAS', 'PW', 'AW', 'PVM', 'PS',
       'CT..RIGHT.TEETH.', 'Total.wt', 'muscle.wt']

In [97]:
df = df.dropna(subset = features)

In [98]:
data = df[features].copy()

In [99]:
data

Unnamed: 0,SL,SH,SW,LL,PL,AL,UL,LCT,LPAS,PW,AW,PVM,PS,CT..RIGHT.TEETH.,Total.wt,muscle.wt
0,41.47,33.00,22.63,8.01,34.06,24.97,32.25,24.30,24.41,7.83,4.47,7.29,7.68,5.18,24.50,5.72
1,39.85,31.55,21.40,9.94,31.82,24.01,29.02,18.62,21.62,6.71,4.77,8.15,7.11,6.45,22.37,4.60
2,39.48,33.25,22.92,9.83,32.86,25.63,29.87,17.27,22.61,6.17,5.01,6.76,8.26,7.38,22.22,4.09
3,41.09,35.11,22.71,10.52,34.16,26.59,32.35,15.56,22.86,7.61,4.47,8.50,6.53,6.52,24.40,4.17
4,35.13,29.42,20.18,9.51,27.69,22.84,28.06,15.21,18.84,7.51,4.42,7.22,7.00,6.00,20.03,2.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2288,32.78,25.62,16.03,6.57,23.86,17.69,23.68,14.46,20.70,4.76,3.20,6.03,4.64,2.21,18.32,1.59
2289,27.92,21.76,14.90,6.15,20.10,14.61,19.35,11.83,18.49,3.62,2.71,4.90,4.79,3.01,14.52,1.26
2290,31.44,26.66,16.67,7.43,23.89,17.60,24.43,13.16,19.43,5.29,4.08,6.32,5.17,4.29,17.54,1.89
2291,31.15,23.43,13.99,5.95,21.66,15.48,21.65,13.09,18.74,4.23,3.40,6.49,5.13,4.01,17.53,1.22


### Input features

In [100]:
print(data)

         SL     SH     SW     LL     PL     AL     UL    LCT   LPAS    PW  \
0     41.47  33.00  22.63   8.01  34.06  24.97  32.25  24.30  24.41  7.83   
1     39.85  31.55  21.40   9.94  31.82  24.01  29.02  18.62  21.62  6.71   
2     39.48  33.25  22.92   9.83  32.86  25.63  29.87  17.27  22.61  6.17   
3     41.09  35.11  22.71  10.52  34.16  26.59  32.35  15.56  22.86  7.61   
4     35.13  29.42  20.18   9.51  27.69  22.84  28.06  15.21  18.84  7.51   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...   ...   
2288  32.78  25.62  16.03   6.57  23.86  17.69  23.68  14.46  20.70  4.76   
2289  27.92  21.76  14.90   6.15  20.10  14.61  19.35  11.83  18.49  3.62   
2290  31.44  26.66  16.67   7.43  23.89  17.60  24.43  13.16  19.43  5.29   
2291  31.15  23.43  13.99   5.95  21.66  15.48  21.65  13.09  18.74  4.23   
2292  30.50  24.70  14.01   5.75  20.05  15.09  20.71  13.21  18.09  4.01   

        AW   PVM    PS  CT..RIGHT.TEETH.  Total.wt  muscle.wt  
0     4.47 

### Output features

In [101]:
print(df.Trt)

0       1
1       1
2       1
3       1
4       1
       ..
2288    6
2289    7
2290    7
2291    7
2292    7
Name: Trt, Length: 2293, dtype: int64


### Assigning Input (X) and Output (Y) variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [102]:
X = data
Y = df.Trt

In [103]:
X

Unnamed: 0,SL,SH,SW,LL,PL,AL,UL,LCT,LPAS,PW,AW,PVM,PS,CT..RIGHT.TEETH.,Total.wt,muscle.wt
0,41.47,33.00,22.63,8.01,34.06,24.97,32.25,24.30,24.41,7.83,4.47,7.29,7.68,5.18,24.50,5.72
1,39.85,31.55,21.40,9.94,31.82,24.01,29.02,18.62,21.62,6.71,4.77,8.15,7.11,6.45,22.37,4.60
2,39.48,33.25,22.92,9.83,32.86,25.63,29.87,17.27,22.61,6.17,5.01,6.76,8.26,7.38,22.22,4.09
3,41.09,35.11,22.71,10.52,34.16,26.59,32.35,15.56,22.86,7.61,4.47,8.50,6.53,6.52,24.40,4.17
4,35.13,29.42,20.18,9.51,27.69,22.84,28.06,15.21,18.84,7.51,4.42,7.22,7.00,6.00,20.03,2.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2288,32.78,25.62,16.03,6.57,23.86,17.69,23.68,14.46,20.70,4.76,3.20,6.03,4.64,2.21,18.32,1.59
2289,27.92,21.76,14.90,6.15,20.10,14.61,19.35,11.83,18.49,3.62,2.71,4.90,4.79,3.01,14.52,1.26
2290,31.44,26.66,16.67,7.43,23.89,17.60,24.43,13.16,19.43,5.29,4.08,6.32,5.17,4.29,17.54,1.89
2291,31.15,23.43,13.99,5.95,21.66,15.48,21.65,13.09,18.74,4.23,3.40,6.49,5.13,4.01,17.53,1.22


In [104]:
Y

0       1
1       1
2       1
3       1
4       1
       ..
2288    6
2289    7
2290    7
2291    7
2292    7
Name: Trt, Length: 2293, dtype: int64

### Let's examine the data dimension

In [105]:
X.shape

(2293, 16)

In [106]:
Y.shape

(2293,)

---

## **2. PCA analysis**

### 2.1. Load library

In [107]:
from sklearn.preprocessing import scale # Data scaling
from sklearn import decomposition #PCA

### 2.2. Data scaling

In [108]:
X = scale(X)

### 2.3. Perform PCA analysis

Here we define the number of PC to use as 3

In [109]:
pca = decomposition.PCA(n_components=7)
pca.fit(X)


#### 2.4. Compute and retrieve the **scores** values

In [110]:
scores = pca.transform(X)

In [111]:
scores_df = pd.DataFrame(scores, columns=['PC1', 'PC2', 'PC3','PC4', 'PC5', 'PC6', 'PC7'])   # make dataframe for scores
scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,-3.608297,0.027792,2.055425,-0.559572,-1.151521,0.449256,0.934942
1,-2.321065,-1.319160,1.602370,-0.170173,-0.281348,-0.164782,-0.261823
2,-2.634265,-2.277664,1.112976,-0.508562,-1.079184,0.819189,0.056572
3,-3.167489,-1.088887,0.956553,-0.345439,0.308728,-0.381656,-0.078490
4,-0.394343,-1.617757,0.717206,0.607583,-0.657228,-0.018757,-0.594896
...,...,...,...,...,...,...,...
2288,3.673399,1.490560,-0.348394,0.007197,-0.537082,-0.389799,0.239142
2289,5.999144,0.452085,-0.088217,-0.399148,-0.845529,0.019423,-0.209234
2290,2.925657,-0.261820,0.167502,0.600797,-0.269199,-0.071712,-0.186178
2291,4.479540,-0.314959,-0.267683,0.151667,-0.510798,-0.651645,0.085011


In [112]:
Y_label = []

for i in Y:
  if i == 1:
    Y_label.append('Green')
  elif i == 2:
    Y_label.append('Blue')
  elif i == 3:
    Y_label.append('Orange')
  elif i == 4:
    Y_label.append('Black')
  elif i == 5:
    Y_label.append('Purple')
  elif i == 6:
    Y_label.append('Red')
  else:
    Y_label.append('Violet')

Trt = pd.DataFrame(Y_label, columns=['Trt'])

In [113]:
df_scores = pd.concat([scores_df, Trt], axis=1)  # add scores column with species

#### 2.5. Retrieve the **loadings** values

In [114]:
loadings = pca.components_.T
df_loadings = pd.DataFrame(loadings, columns=['PC1', 'PC2', 'PC3','PC4', 'PC5', 'PC6', 'PC7'], index=data.columns)
df_loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
SL,-0.275058,0.036524,-0.114732,-0.07623,0.072438,-0.026976,0.199764
SH,-0.275626,0.037361,-0.085574,-0.095969,0.062606,0.004256,0.060191
SW,-0.269675,0.03962,-0.074487,-0.09858,0.136915,-0.021656,0.008532
LL,-0.252634,0.143511,-0.014052,-0.075348,0.059656,0.196969,-0.804718
PL,-0.271299,-0.041332,-0.100876,-0.127307,0.16742,0.040322,0.108288
AL,-0.268009,0.041457,-0.028908,-0.014138,0.017579,0.015102,0.028016
UL,-0.273346,0.083997,-0.092721,-0.097326,0.066027,0.031293,0.071626
LCT,-0.255402,0.17861,-0.065008,-0.042099,0.051907,0.206377,-0.103139
LPAS,-0.249839,0.260535,-0.162631,-0.178339,-0.321992,0.182647,0.05016
PW,-0.256075,-0.021447,0.077369,0.429977,-0.071481,0.088781,-0.024091


#### 2.6. **Explained variance** for each PC

In [115]:
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.7888531 , 0.06206347, 0.03509975, 0.02195   , 0.01506714,
       0.01469385, 0.01285586])

## **3. Scree Plot**

### 3.1. Import library

In [116]:
import numpy as np
import plotly.express as px

### 3.2. Preparing explained variance and cumulative variance

#### 3.2.1. Preparing the explained variance data

In [117]:
explained_variance

array([0.7888531 , 0.06206347, 0.03509975, 0.02195   , 0.01506714,
       0.01469385, 0.01285586])

In [118]:
explained_variance = np.insert(explained_variance, 0, 0)

#### 3.2.2. Preparing the cumulative variance data

In [119]:
cumulative_variance = np.cumsum(np.round(explained_variance, decimals=3))

#### 3.2.3. Combining the dataframe

In [120]:
pc_df = pd.DataFrame(['','PC1', 'PC2', 'PC3','PC4', 'PC5', 'PC6', 'PC7'], columns=['PC'])
explained_variance_df = pd.DataFrame(explained_variance, columns=['Explained Variance'])
cumulative_variance_df = pd.DataFrame(cumulative_variance, columns=['Cumulative Variance'])

In [121]:
df_explained_variance = pd.concat([pc_df, explained_variance_df, cumulative_variance_df], axis=1)  # create dataframe by adding pc_df, explained_variance_df, cumulative_variance_df
df_explained_variance

Unnamed: 0,PC,Explained Variance,Cumulative Variance
0,,0.0,0.0
1,PC1,0.788853,0.789
2,PC2,0.062063,0.851
3,PC3,0.0351,0.886
4,PC4,0.02195,0.908
5,PC5,0.015067,0.923
6,PC6,0.014694,0.938
7,PC7,0.012856,0.951


#### 3.2.4. Making the scree plot

##### 3.2.4.1. Explained Variance

In [122]:
# https://plotly.com/python/bar-charts/

fig = px.bar(df_explained_variance,
             x='PC', y='Explained Variance',
             text='Explained Variance',
             width=800)

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')  # fig customization
fig.show()

##### 3.2.4.2. Explained Variance + Cumulative Variance

In [123]:
# https://plotly.com/python/creating-and-updating-figures/
# create 2 plots in a graph (bar+scatter)

import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_explained_variance['PC'],
        y=df_explained_variance['Cumulative Variance'],
        marker=dict(size=15, color="LightSeaGreen")
    ))

fig.add_trace(
    go.Bar(
        x=df_explained_variance['PC'],
        y=df_explained_variance['Explained Variance'],
        marker=dict(color="RoyalBlue")
    ))

fig.show()

##### 3.2.4.3. Explained Variance + Cumulative Variance (Separate Plot)

In [124]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(
        x=df_explained_variance['PC'],
        y=df_explained_variance['Cumulative Variance'],
        marker=dict(size=15, color="LightSeaGreen")
    ), row=1, col=1
    )

fig.add_trace(
    go.Bar(
        x=df_explained_variance['PC'],
        y=df_explained_variance['Explained Variance'],
        marker=dict(color="RoyalBlue"),
    ), row=1, col=2
    )

fig.show()

## **4. Scores Plot**

Source: https://plotly.com/python/3d-scatter-plots/

### 4.1. Load library
[API Documentation](https://plotly.com/python-api-reference/plotly.express.html) for *plotly.express* package

In [125]:
import plotly.express as px

### 4.2. Basic 3D Scatter Plot

In [126]:
fig = px.scatter_3d(df_scores, x='PC1', y='PC2', z='PC3',
              color='Trt')

fig.show()

### 4.3. Customized 3D Scatter Plot

In [128]:
fig = px.scatter_3d(df_scores, x='PC1', y='PC2', z='PC3',
              color='Trt',
              symbol='Trt',
              opacity=0.5)

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

# https://plotly.com/python/templates/
#fig.update_layout(template='plotly_white') # "plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"

## **5. Loadings Plot**

In [129]:
loadings_label = df_loadings.index
# loadings_label = df_loadings.index.str.strip(' (cm)')

fig = px.scatter_3d(df_loadings, x='PC1', y='PC2', z='PC3',
                    text = loadings_label)

fig.show()

---