In [77]:
import pandas as pd
# Country indicators, written one row per country so each record can be read
# (and checked against its source) on a single line.
country_columns = [
    "Country",
    "Income (GDP per capita)",
    "Freedom (Score)",
    "Life expectancy (years)",
    "Social support (Score)",
    "GDP (Billion USD)",
    "Public trust (Score)",
]
country_rows = [
    ("Uzbekistan", 1768, 4.93, 71.6, 0.832, 81.1, 0.732),
    ("France", 43518, 8.22, 82.5, 0.892, 3060, 0.536),
    ("India", 2085, 6.57, 70.8, 0.589, 3496, 0.49),
    ("Kazakhstan", 9812, 6.21, 73.2, 0.867, 240.5, 0.641),
    ("South Korea", 35000, 7.70, 83.5, 0.924, 1800, 0.304),
    ("Mongolia", 4568, 6.17, 69.4, 0.842, 15.1, 0.414),
    ("Myanmar", 1407, 4.37, 67.3, 0.771, 62.4, 0.24),
    ("Kyrgyzstan", 1281, 5.53, 71.2, 0.792, 8.6, 0.622),
    ("Germany", 52563, 8.73, 81.2, 0.889, 4554, 0.564),
    ("Turkey", 10616, 5.36, 76.5, 0.802, 905.6, 0.41),
]

# Pivot the rows into the column-name -> list-of-values mapping that
# pd.DataFrame consumes; key order follows country_columns.
data = {
    column: list(values)
    for column, values in zip(country_columns, zip(*country_rows))
}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,Country,Income (GDP per capita),Freedom (Score),Life expectancy (years),Social support (Score),GDP (Billion USD),Public trust (Score)
0,Uzbekistan,1768,4.93,71.6,0.832,81.1,0.732
1,France,43518,8.22,82.5,0.892,3060.0,0.536
2,India,2085,6.57,70.8,0.589,3496.0,0.49
3,Kazakhstan,9812,6.21,73.2,0.867,240.5,0.641
4,South Korea,35000,7.7,83.5,0.924,1800.0,0.304
5,Mongolia,4568,6.17,69.4,0.842,15.1,0.414
6,Myanmar,1407,4.37,67.3,0.771,62.4,0.24
7,Kyrgyzstan,1281,5.53,71.2,0.792,8.6,0.622
8,Germany,52563,8.73,81.2,0.889,4554.0,0.564
9,Turkey,10616,5.36,76.5,0.802,905.6,0.41


In [68]:
import numpy as np
import matplotlib.pyplot as plt


### Representing the Data
# data has shape (n, d): one row per country, one column per feature.
# NOTE: this rebinds `data`, which the first cell used for the column dict.
# NOTE(review): the first five columns repeat the numeric columns of the
# country table, but column 6 does not match its "Public trust (Score)"
# values and column 7 has no counterpart there -- confirm the source and
# meaning of these two features.
data = np.array([
    [1768, 4.93, 71.6, 0.832, 81.1, 0.75, -45.2],
    [43518, 8.22, 82.5, 0.892, 3060, 0.65, -52.5],
    [2085, 6.57, 70.8, 0.589, 3496, 0.55, -65.0],
    [9812, 6.21, 73.2, 0.867, 240.5, 0.60, -43.3],
    [35000, 7.70, 83.5, 0.924, 1800, 0.85, -30.2],
    [4568, 6.17, 69.4, 0.842, 15.1, 0.50, -40.0],
    [1407, 4.37, 67.3, 0.771, 62.4, 0.30, -75.4],
    [1281, 5.53, 71.2, 0.792, 8.6, 0.55, -50.0],
    [52563, 8.73, 81.2, 0.889, 4554, 0.90, -25.5],
    [10616, 5.36, 76.5, 0.802, 905.6, 0.45, -48.6]
])


### Step 1: Standardize the Data along the Features
# Each column is centred and scaled to unit (population, ddof = 0) standard deviation.
standardized_data = (data - data.mean(axis = 0)) / data.std(axis = 0)


### Step 2: Calculate the Covariance Matrix
# use `ddof = 1` if using sample data (default assumption) and use `ddof = 0` if using population data
# Step 1 scaled with ddof = 0, so with ddof = 1 here the diagonal is n / (n - 1) rather than 1.
# This rescales every eigenvalue equally and leaves the explained-variance percentages unchanged.
covariance_matrix = np.cov(standardized_data, ddof = 1, rowvar = False)


### Step 3: Eigendecomposition on the Covariance Matrix
# np.linalg.eig returns the eigenvalues in no particular order; eigenvectors are the columns.
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)


### Step 4: Sort the Principal Components
# np.argsort can only provide lowest to highest; use [::-1] to reverse the list
order_of_importance = np.argsort(eigenvalues)[::-1]

# utilize the sort order to sort eigenvalues and eigenvectors
sorted_eigenvalues = eigenvalues[order_of_importance]
sorted_eigenvectors = eigenvectors[:, order_of_importance] # sort the columns


### Step 5: Calculate the Explained Variance
# use sorted_eigenvalues to ensure the explained variances correspond to the eigenvectors
# expressed as a percentage of the total variance
explained_variance = (sorted_eigenvalues / np.sum(sorted_eigenvalues)) * 100


### Step 6: Reduce the Data via the Principal Components
k = 7 # select the number of principal components (1 <= k <= d)
reduced_data = np.matmul(standardized_data, sorted_eigenvectors[:, :k]) # transform the original data


### Step 7: Determine the Explained Variance
total_explained_variance = explained_variance[:k].sum()


### Potential Next Steps: Iterate on the Number of Principal Components
# Component labels are derived from k so the tables stay consistent when k changes.
x = [f'PCA{i}' for i in range(1, k + 1)]
df = pd.DataFrame(reduced_data, columns = x)
# Kept under its own name: rebinding `df1` here would clobber the country table
# that the later "Happiness" cell indexes by `df1['Country']`.
explained_variance_df = pd.DataFrame(explained_variance[:k], index = x)
explained_variance_df.T

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7
0,69.144441,18.689409,7.296507,2.357482,1.612843,0.869944,0.029375


In [53]:
# Show the standardized feature matrix (rows = countries, columns = features).
# Displayed as a plain ndarray: wrapping it in np.matrix adds nothing here and
# NumPy no longer recommends that class.
standardized_data

matrix([[-0.77715483, -1.0606042 , -0.56407818,  0.13377336, -0.82825072,
          0.80295507,  0.16844809],
        [ 1.46147231,  1.34753094,  1.40657957,  0.80264017,  1.01131152,
          0.22941573, -0.35040046],
        [-0.76015735,  0.13980359, -0.70871361, -2.57513722,  1.28055491,
         -0.3441236 , -1.23883976],
        [-0.34583706, -0.12370056, -0.27480732,  0.52394567, -0.72981632,
         -0.05735393,  0.30349087],
        [ 1.00473876,  0.96691383,  1.58737386,  1.15936914,  0.23322282,
          1.3764944 ,  1.23457525],
        [-0.62701936, -0.1529788 , -0.96182562,  0.24525116, -0.86900774,
         -0.63089327,  0.53803884],
        [-0.79651158, -1.47049954, -1.34149363, -0.54624123, -0.83979854,
         -1.77797194, -1.97802126],
        [-0.80326768, -0.62143062, -0.6363959 , -0.31213785, -0.87302169,
         -0.3441236 , -0.1727126 ],
        [ 1.94646351,  1.72082848,  1.17154699,  0.76919683,  1.93390241,
          1.66326407,  1.56862843],
        [-

In [70]:
# Show the feature-by-feature covariance matrix of the standardized data.
# Displayed as a plain ndarray: wrapping it in np.matrix adds nothing here and
# NumPy no longer recommends that class.
covariance_matrix

matrix([[ 1.11111111,  1.00481382,  1.0098298 ,  0.67780508,  0.82343542,
          0.77546329,  0.65405555],
        [ 1.00481382,  1.11111111,  0.91719144,  0.4521009 ,  0.91248148,
          0.80210845,  0.66886693],
        [ 1.0098298 ,  0.91719144,  1.11111111,  0.67855891,  0.69265398,
          0.7976268 ,  0.69183437],
        [ 0.67780508,  0.4521009 ,  0.67855891,  1.11111111, -0.07099821,
          0.57010406,  0.77882526],
        [ 0.82343542,  0.91248148,  0.69265398, -0.07099821,  1.11111111,
          0.59337906,  0.26527231],
        [ 0.77546329,  0.80210845,  0.7976268 ,  0.57010406,  0.59337906,
          1.11111111,  0.89736013],
        [ 0.65405555,  0.66886693,  0.69183437,  0.77882526,  0.26527231,
          0.89736013,  1.11111111]])

In [72]:
# Print the raw eigendecomposition in the order returned by the solver
# (i.e. before sorting by importance); eigenvectors are the columns.
# The 2-D ndarray prints the same text as its np.matrix wrapper, so the
# discouraged np.matrix conversion is dropped.
print("EigenValues: ", eigenvalues, "\n")
print("EigenVectors: ", eigenvectors)

EigenValues:  [5.37790095e+00 1.45362066e+00 5.67506138e-01 2.28468342e-03
 6.76622822e-02 1.25443347e-01 1.83359718e-01] 

EigenVectors:  [[ 0.43045391  0.11190799 -0.36634284  0.67529182  0.36336397 -0.2744257
   0.06794906]
 [ 0.41885135  0.24683987 -0.03039963  0.01170453 -0.72772038 -0.01708644
   0.48239361]
 [ 0.41998189  0.02250117 -0.31623422 -0.06881425 -0.0965886   0.66496217
  -0.51659015]
 [ 0.29159626 -0.60602178 -0.42455375 -0.48212293  0.02445533 -0.36065224
   0.06597512]
 [ 0.31263646  0.62384634  0.07933257 -0.54719648  0.43395672 -0.1136618
   0.07822468]
 [ 0.39257624 -0.09435204  0.58191572  0.06610183 -0.17947731 -0.40889235
  -0.54275331]
 [ 0.35496782 -0.40086334  0.48959091  0.05359664  0.32852961  0.41477934
   0.43676192]]


In [80]:
# Use the first principal component as a one-column "Happiness" score,
# labelled by country. `df1` must be the country table (the frame with a
# 'Country' column), with rows in the same order as the PCA input rows.
df3 = (
    df[['PCA1']]
    .set_axis(df1['Country'], axis=0)   # row labels: country names
    .set_axis(['Happiness'], axis=1)    # single column renamed
)
df3

Unnamed: 0_level_0,Happiness
Country,Unnamed: 1_level_1
Uzbekistan,-0.860586
France,2.300152
India,-1.491698
Kazakhstan,-0.306266
South Korea,2.89375
Mongolia,-0.994784
Myanmar,-3.344142
Kyrgyzstan,-1.43369
Germany,4.089337
Turkey,-0.852072
