# <font color="#49699E" size=40>Latent Factors & Components</font>
# LEARNING OBJECTIVES
# LEARNING MATERIALS


# INTRODUCTION


## Imports and Data Preparation


In [None]:
# Core data handling and numerics
import pandas as pd
# Use the plain-text DataFrame repr instead of HTML tables in notebook output
pd.set_option("display.notebook_repr_html", False)
import numpy as np
from scipy.stats import zscore
import random

# scikit-learn: scaling, PCA, clustering, and cluster-quality metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local plotting helpers
from dcss.plotting import format_axes_commas, custom_seaborn
# NOTE(review): presumably applies the project's seaborn/matplotlib styling — confirm in dcss.plotting
custom_seaborn()

In [None]:
# Load the V-Dem CSV and keep only the rows for 2019.
# reset_index() is called without drop=True, so the previous row labels are
# retained as an extra 'index' column.
vdem_csv = '../data/vdem/V-Dem-CY-Full+Others-v10.csv'
df = (
    pd.read_csv(vdem_csv, low_memory=False)
    .query('year == 2019')
    .reset_index()
)
df.shape

In [None]:
# Column names of the indicators used in the analysis, grouped here by their
# shared name prefix. Order is significant: it fixes the column order of the
# feature matrix built from this list.
# NOTE(review): the prefixes look like V-Dem thematic groups — see the V-Dem
# codebook for the definition of each variable.
indicators = (
    # v2dl*
    ['v2dlreason', 'v2dlcommon', 'v2dlcountr', 'v2dlconslt', 'v2dlengage',
     'v2dlencmps', 'v2dlunivl']
    # v2cs*
    + ['v2cseeorgs', 'v2csreprss', 'v2cscnsult', 'v2csprtcpt', 'v2csgender',
       'v2csantimv', 'v2csrlgrep', 'v2csrlgcon']
    # v2me*
    + ['v2mecenefm', 'v2mecenefi', 'v2mecenefibin', 'v2mecrit', 'v2merange',
       'v2mefemjrn', 'v2meharjrn', 'v2meslfcen', 'v2mebias', 'v2mecorrpt']
    # v2ex*
    + ['v2exrescon', 'v2exbribe', 'v2exembez', 'v2excrptps', 'v2exthftps']
    # v2cl*
    + ['v2cldiscm', 'v2cldiscw', 'v2clacfree', 'v2clrelig', 'v2clfmove']
)

In [None]:
# Keep the country names as a plain list (row order matches df), then index
# the frame by country and narrow it to the selected indicator columns.
# Note: this cell consumes the 'country_name' column, so it cannot be re-run
# without first re-running the cell that loads df.
countries = df['country_name'].to_list()
df = df.set_index('country_name').loc[:, indicators]
df.shape

In [None]:
# Count missing values per indicator, then tally how many indicators share
# each missing-value count.
missing_per_indicator = df.isna().sum()
missing_per_indicator.value_counts()

In [None]:
# Convert the indicator DataFrame to a NumPy array (rows follow df's row order,
# columns follow df's column order).
X = df.to_numpy() 

# LATENT VARIABLES AND THE CURSE OF DIMENSIONALITY


## Theory First: Measuring Latent Variables with Exploratory Factor Analysis


# Conducting a Principal Component Analysis in Sklearn
## Standardization


In [None]:
# Standardize each column of X (StandardScaler defaults: centre on the column
# mean and scale to unit variance). X is overwritten with the scaled values.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### A Brief Refresher on Variance, Standard Deviation, and Z-score Normalization


In [None]:
# Four small samples, listed from no spread to the most spread.
ABCD = dict(
    A=[1, 1, 1, 1, 1],      # no variance...
    B=[1, 2, 3, 4, 5],      # some variance...
    C=[-1, 1, 3, 5, 7],     # a bit more variance...
    D=[-10, -9, 3, 4, 4],   # still more variance...
)

# np.var uses ddof=0 by default, i.e. the population variance.
for name in ABCD:
    variance = np.round(np.var(ABCD[name]), 3)
    print(f'{name} has a variance of {variance}.')

In [None]:
# Standard deviation of each sample (np.std also defaults to ddof=0).
for name, values in ABCD.items():
    std_dev = np.round(np.std(values), 3)
    print(f'{name} has a standard deviation of {std_dev}.')

In [None]:
# Z-scores of each sample. A is constant (standard deviation 0), so expect
# its z-scores to be undefined (nan) rather than meaningful numbers.
for name, values in ABCD.items():
    z_scores = np.round(zscore(values), 3)
    print(f'The values in {name} have the following Z-scores: {z_scores}.')

## Back to PCA!


In [None]:
# PCA() with no n_components argument keeps every component
# (scikit-learn default: min(n_samples, n_features)).
pca = PCA()
# Fit on X and return X projected onto the components
# (one row per observation, one column per component).
pca_results = pca.fit_transform(X)

In [None]:
# Wrap the component scores in a DataFrame indexed by country, with columns
# labelled 'PC 0', 'PC 1', ... (zero-based, one per component).
component_labels = [f'PC {i}' for i in range(pca_results.shape[1])]
res = pd.DataFrame(pca_results, index=countries, columns=component_labels)

# Scores on the first component for the first few countries
res['PC 0'].head()

In [None]:
# Share of the total variance attributed to each component, in component order
evr = pca.explained_variance_ratio_
evr

In [None]:
# With every component retained, the explained-variance ratios should total 1.
evr_total = np.round(evr.sum(), 2)
print(f'The sum of the array is: {evr_total}')

In [None]:
# Combined share of variance explained by the first three components
evr[:3].sum()

In [None]:
# Running total of explained variance; the index is the zero-based component ID.
cve = pd.Series(evr.cumsum())
cve.head(12)

In [None]:
# Cumulative explained variance by component: a line with a marker at each
# component, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.lineplot(x=cve.index, y=cve, ax=ax)
ax.scatter(cve.index, cve)
ax.set(
    xlabel='Principal component ID',
    ylabel='Proportion of explained variance (cumulative)',
    ylim=(0, 1.1),  # a little headroom above the maximum of 1.0
)
sns.despine()
plt.show()

## Matrix Decomposition: Eigenvalues, Eigenvectors, and Extracting Components


In [None]:
# Eigenvalue (explained variance) of each component; the index is the
# zero-based component ID.
eigenvalues = pd.Series(pca.explained_variance_)

# Scree plot: eigenvalue by component. x and y are passed as vectors, so no
# `data=` argument is needed; passing the Series as `data` was redundant and
# a Series is not a valid data source in newer seaborn releases.
fig, ax = plt.subplots()
sns.lineplot(x=eigenvalues.index, y=eigenvalues, ax=ax)
ax.scatter(eigenvalues.index, eigenvalues)
ax.set(xlabel='Principal component ID', ylabel='Eigenvalue')
sns.despine()
plt.show()

In [None]:
# Eigenvalues of the first ten components
eigenvalues.iloc[:10]

In [None]:
# Scores on the first two principal components (columns 0 and 1 of the
# PCA output), collected into a two-column frame with a default integer index.
component_1, component_2 = pca_results[:, 0], pca_results[:, 1]

PC12 = pd.DataFrame({'PC1': component_1, 'PC2': component_2})

In [None]:
# Attach country names; relies on `countries` being in the same row order as
# the PCA scores.
PC12 = PC12.assign(Country=countries)

In [None]:
# Bivariate density of countries in the PC1/PC2 plane, with each country's
# name drawn at its coordinates.
ax = sns.kdeplot(data=PC12, x='PC1', y='PC2', alpha=.8, fill=True)

# Iterate positionally over the three columns so the labels do not depend on
# PC12 having a default 0..n-1 index.
for pc1, pc2, country in zip(PC12['PC1'], PC12['PC2'], PC12['Country']):
    ax.text(pc1,
            pc2,
            country,
            horizontalalignment='left',
            size=3,
            color='black',
            weight='normal')

# Hide tick labels; the axis titles carry the eigenvalue of each component.
ax.set(xticklabels=[], yticklabels=[])

# Raw f-strings keep the LaTeX backslashes literal ('\l' is not a valid
# escape sequence in a normal string and triggers a warning).
ax.set(
    xlabel=
    rf'$\longleftarrow$ PC1 (eigenvalue: {np.round(eigenvalues.loc[0], 2)}) $\longrightarrow$',
    ylabel=
    rf'$\longleftarrow$ PC2 (eigenvalue: {np.round(eigenvalues.loc[1], 2)}) $\longrightarrow$'
)
plt.show()

# CONCLUSION
## Key Points 
