1. Import data.

In [None]:
import pandas as pd

# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
DATA_PATH = 'C:/Users/TheAncientOwl/Code/data-analysis-tool/server/test-data/pca-data.csv'

# Raw dataset; the later cells read their columns from this frame.
df = pd.read_csv(DATA_PATH)

df.head()


2. Pick features & target.

In [None]:
# Column holding the observation labels; every other column is used as a feature.
target = 'Country'

features = df.columns.tolist()
features.remove(target)

# Echo the selection so the reader can confirm it before running the analysis.
print(f">> Target: '{target}'", f'>> Features: {features}', sep='\n')


3. Standardize the data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the frame into the feature matrix and the target labels.
x = df[features].values
y = df[target].values

# Rescale the feature matrix with scikit-learn's StandardScaler (default settings).
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Wrap the scaled matrix back into a labelled frame for inspection and correlation.
features_df = pd.DataFrame(x, columns=features)
features_df.head()


4. Correlation matrix.

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations of the standardized features, rounded for readable annotations.
corr_matrix = features_df.corr().round(2)

# Hide the upper triangle (diagonal included): the matrix is symmetric, so it is redundant.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
cmap = sns.diverging_palette(250, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap=cmap,
    annot=True,
    vmin=-1, vmax=1, center=0,
    ax=ax,
)
plt.show()


5. PCA.

In [None]:
def make_pca_labels(components_count):
  """Return the labels 'PC1' ... 'PC<components_count>' as a list of strings."""
  return ['PC' + str(position + 1) for position in range(components_count)]

make_pca_labels(4)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA without limiting the component count so every component can be inspected.
pca = PCA()
principal_components = pca.fit_transform(x)

# Label the score columns PC1..PCn and put the target column back in front of them.
pc_labels = make_pca_labels(pca.n_components_)
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
final_df = pd.concat((df[target], principal_df), axis='columns')

final_df.head()

6. Pick principal components count.

6.1. *via* Scree plot *(look for the "elbow" where the curve flattens)*.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scree plot: proportion of variance explained by each principal component.
pc_values = np.arange(1, pca.n_components_ + 1)

fig, ax = plt.subplots()
ax.plot(pc_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained')
ax.set_xticks(pc_values)  # one tick per component
plt.show()

6.2. *via* The threshold of variance explained approach *(at least 70%)*.

In [None]:
import numpy as np

# Per-component share of the variance, expressed in percent, and its running total.
explained_variance_ratio = 100 * pca.explained_variance_ratio_
out_sum = explained_variance_ratio.cumsum()

variance_ratio_df = pd.DataFrame(
    {
        'Proportion of Variance Explained': explained_variance_ratio,
        'Cumulative Proportion of Variance Explained': out_sum,
    },
    index=make_pca_labels(pca.n_components_),
)
variance_ratio_df

6.3. *via* Kaiser criterion: keep the principal components whose eigenvalues are > 1.

In [None]:
# Explained variance of each component, one row per PC.
# NOTE: this rebinds `variance_ratio_df`, replacing whatever frame the name held before this cell.
explained_variance = pd.Series(
    pca.explained_variance_,
    index=make_pca_labels(pca.n_components_),
    name='Explained Variance',
)
variance_ratio_df = explained_variance.to_frame()
variance_ratio_df

7. Interpret final PCA solution.

7.1. Execute PCA with chosen principal components count

In [None]:
from sklearn.decomposition import PCA

# Number of principal components kept for the final solution.
CHOSEN_COMPONENTS = 4

# Refit with the reduced component count; this replaces the earlier `pca` and `final_df`.
pca = PCA(n_components=CHOSEN_COMPONENTS)
principal_components = pca.fit_transform(x)

pc_labels = make_pca_labels(pca.n_components_)
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
final_df = pd.concat((df[target], principal_df), axis='columns')

final_df.head()


7.2. Loadings.

In [None]:
# Component weights: one row per feature, one column per principal component.
rounded_weights = pca.components_.T.round(3)
loadings = pd.DataFrame(
    rounded_weights,
    index=features,
    columns=make_pca_labels(pca.n_components_),
)
loadings


In [None]:
# PCA general form: print each component as a linear combination of the features,
# W<k> = (weight * feature) + (weight * feature) + ...
for pca_id, pc_label in enumerate(loadings.columns, start=1):
  # Select the column by its label: integer keys on a label-indexed Series
  # (the old `row[pca_id - 1]`) rely on deprecated positional fallback.
  terms = [f'({weight} * {feature})' for feature, weight in loadings[pc_label].items()]
  # Joining the terms avoids the dangling ' + ' after the last one.
  print(f'W{pca_id} = ' + ' + '.join(terms))
  print()

7.3. Loadings Matrix.

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Scale each component's weights by the square root of that component's explained variance.
component_scale = np.sqrt(pca.explained_variance_)
loadings_matrix = pd.DataFrame(
    pca.components_.T * component_scale,
    index=features,
    columns=make_pca_labels(pca.n_components_),
)

fig, ax = plt.subplots(figsize=(8, 8))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    loadings_matrix,
    cmap=cmap,
    annot=True,
    vmin=-1, vmax=1, center=0,
    ax=ax,
)
plt.show()


8. Visualize the principal components in a 2D projection.

In [None]:
from matplotlib import pyplot as plt
import random

def plot_principal_components(pc_x, pc_y, targets, annot=False, legend=False):
  """Scatter the chosen observations on the plane of two principal components.

  Reads the notebook-level `df`, `target` and `final_df` variables.

  @param pc_x: label of x component f'PC{index}'
  @param pc_y: label of y component f'PC{index}'
  @param targets: targets to be scattered
  @param annot: if the scattered plots should have titles
  @param legend: if the plot should have legend
  """
  fig, ax = plt.subplots(figsize=(8, 8))
  ax.set_xlabel(pc_x, fontsize=15)
  ax.set_ylabel(pc_y, fontsize=15)
  ax.set_title(f'{pc_x} & {pc_y}', fontsize=20)

  for obs in targets:
    # Rows of the original frame whose target value is this observation.
    indices_to_keep = df[target] == obs

    x_points = final_df.loc[indices_to_keep, pc_x]
    y_points = final_df.loc[indices_to_keep, pc_y]

    # Attach the label to the artist itself so the legend never depends on
    # the iteration order of `targets` matching the order of the artists.
    ax.scatter(x_points, y_points, s=50, label=str(obs))
    if annot:
      for point_x, point_y in zip(x_points, y_points):
        ax.annotate(obs, (point_x, point_y))

  if legend:
    ax.legend()

  ax.grid()
  plt.show()

# To plot every observation instead of a sample, use: targets = set(df[target])
SAMPLE_SIZE = 10

random.seed(893927)  # fixed seed so the same observations are drawn on every run
population = sorted(df[target].values)

# Cap the sample size so a dataset with fewer rows does not raise ValueError.
sampled = random.sample(population, min(SAMPLE_SIZE, len(population)))

# De-duplicate while keeping the draw order. A set of strings iterates in a
# different order in each interpreter session, which would reshuffle the colours.
targets = list(dict.fromkeys(sampled))
plot_principal_components('PC1', 'PC2', targets, annot=True)