## PCA

### Loading transformed GHSI 2021 data, and 2019 data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sys
sys.path.append('./functions')
from pcs_threshold import pcs_threshold
from scipy.stats import pearsonr

# Read the two input tables from CSV files under data/ into DataFrames.
# Later cells pick columns from both frames by position (iloc), so the column
# order inside these files matters.
ghsi_2019_df = pd.read_csv("data/ghsi_2019_rearanged.csv")
ghsi_2021_transformed = pd.read_csv("data/ghsi_2021_transformed.csv")


Before conducting PCA, variables were grouped based on their conceptual meaning and intercorrelations, enabling easier interpretation of the obtained components.

In [3]:
# Split the predictors into groups by positional column index (iloc); the
# selection therefore depends on the exact column order of the CSV files.

# Every column of the transformed 2021 table except the first one.
GHSI_2021 = ghsi_2021_transformed.iloc[:, 1:]
# Columns 1 through 7 of the 2019 table.
# NOTE(review): a later cell takes the response as GHSI_2019.iloc[:, 0], and its
# printed output labels that column 'mr'. The response variable therefore appears
# to be part of this PCA input, which would make the later correlation of the
# GHSI 2019 PCs with m/r circular — confirm whether column 1 belongs here.
GHSI_2019 = ghsi_2019_df.iloc[:, 1:8]
prosperity = ghsi_2019_df.iloc[:, [10, 11, 13, 14, 15]]
# The printed correlation table for this group labels these columns MA, CH and AL.
age = ghsi_2019_df.iloc[:, [12, 18, 19]]
# -1 selects the last column of the 2019 table.
disease = ghsi_2019_df.iloc[:, [22, 23, -1]]


Putting all the groups into a list so that a single for loop can run PCA on every group in turn.

In [4]:
# Keep each display label next to its DataFrame so the two parallel lists
# below cannot drift out of step.
labelled_groups = [
    ('GHSI 2021', GHSI_2021),
    ('GHSI 2019', GHSI_2019),
    ('Prosperity', prosperity),
    ('Age', age),
    ('Disease', disease),
]
all_groups = [frame for _, frame in labelled_groups]
names = [label for label, _ in labelled_groups]


PCA

In [5]:
resultsPCA = {}
for label, frame in zip(names, all_groups):

    # Standardise the group's columns before the decomposition
    standardized = StandardScaler().fit_transform(frame)

    # Fit a PCA with all components and project the standardised data onto them
    pca = PCA()
    score = pca.fit_transform(standardized)
    latent = pca.explained_variance_
    explained = 100 * pca.explained_variance_ratio_   # share of variance, in percent

    # Number of leading components to keep, as decided by the project helper
    # (presumably the count needed to reach 85 % cumulative variance — see pcs_threshold)
    nPCs = pcs_threshold(explained, 85)

    # Store only the first nPCs components of every result, keyed by group label
    resultsPCA[label] = dict(
        coeff=pca.components_[:nPCs, :].T,   # loadings, one column per kept component
        score=score[:, :nPCs],
        latent=latent[:nPCs],
        explained=explained[:nPCs],
        nPCs=nPCs,
    )

Exporting PCA results into separate CSVs in case we need them further down the line

In [6]:
# Build one DataFrame of component scores per group, keep it in PCA_table and
# write it to data/<group name with underscores>_PCS.csv.
PCA_table = {}
for name in names:
    group_result = resultsPCA[name]
    pc_labels = [f"{name} PC{i+1}" for i in range(group_result['nPCs'])]

    df = pd.DataFrame(group_result['score'], columns=pc_labels)
    PCA_table[name] = df

    # to_csv is called with its defaults, so the row index is written as the first column
    csv_name = f"{name.replace(' ', '_')}_PCS.csv"
    df.to_csv(f"data/{csv_name}")

print(PCA_table['GHSI 2021'].head())


   GHSI 2021 PC1  GHSI 2021 PC2  GHSI 2021 PC3
0      -2.970420       1.542626      -0.915482
1      -0.897239      -0.620626       0.384835
2       1.986107       1.343400       0.971290
3       4.426760       0.538022       0.042024
4       2.269115      -0.858056      -0.904512


## Correlation analysis between different predictors, groups of predictors and principal components

### Correlation between the predictors that make up each group that went into PCA
We did this to check whether our somewhat arbitrary assumption of similarity between the variables in each group makes sense statistically.

In [7]:
def _pearson_tables(frame):
    """Return (coeff, p_values) for every ordered pair of columns in frame.

    Both results are square DataFrames indexed and labelled by frame's columns;
    cell [r, c] holds the pearsonr statistic / p-value of column r against column c.
    """
    labels = frame.columns
    size = len(labels)
    coeff = pd.DataFrame(np.zeros((size, size)), index=labels, columns=labels)
    p_values = pd.DataFrame(np.zeros((size, size)), index=labels, columns=labels)

    for r in range(size):
        for c in range(size):
            r_val, p_val = pearsonr(frame.iloc[:, r], frame.iloc[:, c])
            coeff.iloc[r, c] = r_val
            p_values.iloc[r, c] = p_val

    return coeff, p_values


groups_vs_PCs_names = [
    "GHSI 2021 vs PC",
    "GHSI 2019 vs PC",
    "Prosperity vs PC",
    "Age vs PC",
    "Disease vs PC",
]
corr_results_groups_vs_PCs = {}

# Score tables in the same order as all_groups / names
PCA_data = [PCA_table[name] for name in names]

for key, predictors, components in zip(groups_vs_PCs_names, all_groups, PCA_data):
    # Put the original predictors and their PC scores side by side, row-aligned
    combined_df = pd.concat(
        [predictors.reset_index(drop=True), components.reset_index(drop=True)],
        axis=1,
    )

    coeff, p_values = _pearson_tables(combined_df)
    corr_results_groups_vs_PCs[key] = {'coeff': coeff, 'p_values': p_values}

#test
corr_results_groups_vs_PCs['Age vs PC']['p_values']




Unnamed: 0,MA,CH,AL,Age PC1,Age PC2
MA,0.0,1.283204e-16,2.182352e-15,1.610366e-36,0.716502
CH,1.283204e-16,0.0,4.213032e-11,9.896753e-30,0.000149
AL,2.182352e-15,4.213032e-11,0.0,2.20663e-28,1.9e-05
Age PC1,1.610366e-36,9.896753e-30,2.20663e-28,0.0,1.0
Age PC2,0.7165017,0.0001486571,1.85884e-05,1.0,0.0


### Correlation between different principal components and our response variable m/r.

We did this in order to see which principal components will be kept for further analysis. Only the ones that are statistically significant (p<0.05) will be kept.

In [8]:
# Response variable: the first column of the GHSI 2019 group (labelled 'mr' in the output).
# NOTE(review): this same column is also part of the GHSI 2019 PCA input — confirm that is intended.
m_r = GHSI_2019.iloc[:, 0]
corr_results_m_r_vs_pcs = {}
mr_vs_PCs_names = [
    'GHSI 2021 PC vs mr',
    'GHSI 2019 PC vs mr',
    'Prosperity vs mr',
    'Age vs mr',
    'Disease vs mr',
]

for key, components in zip(mr_vs_PCs_names, PCA_data):
    # Response first, then this group's PC scores, aligned on a fresh 0..n-1 index
    X = pd.concat(
        [m_r.reset_index(drop=True), components.reset_index(drop=True)],
        axis=1,
    )

    cols = X.columns
    n = len(cols)
    coeff = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)
    p_values = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)

    # Visit every (row, column) pair of the n x n result tables
    for r, c in np.ndindex(n, n):
        r_val, p_val = pearsonr(X.iloc[:, r], X.iloc[:, c])
        coeff.iloc[r, c] = r_val
        p_values.iloc[r, c] = p_val

    corr_results_m_r_vs_pcs[key] = {'coeff': coeff, 'p_values': p_values}

# test
corr_results_m_r_vs_pcs['GHSI 2019 PC vs mr']['p_values']

Unnamed: 0,mr,GHSI 2019 PC1,GHSI 2019 PC2,GHSI 2019 PC3
mr,0.0,3.835291e-09,2.233898e-20,0.715479
GHSI 2019 PC1,3.835291e-09,0.0,1.0,1.0
GHSI 2019 PC2,2.233898e-20,1.0,0.0,1.0
GHSI 2019 PC3,0.7154787,1.0,1.0,0.0


### Correlation between predictors that are not a part of any group, and the response variable

In [10]:
variables = ['IE','RE','OB','SM','IN','BCG','ON']
corr_results_m_r_vs_variables = {}
mr_vs_variables_names = ['IE vs mr', 'RE vs mr', 'OB vs mr', 'SM vs mr', 'IN vs mr', 'BCG vs mr', 'ON vs mr']

# One 2 x 2 correlation / p-value table per ungrouped predictor, each paired with m/r
for key, variable in zip(mr_vs_variables_names, variables):
    X = pd.concat(
        [m_r.reset_index(drop=True), ghsi_2019_df[variable].reset_index(drop=True)],
        axis=1,
    )

    cols = X.columns
    n = len(cols)
    coeff = pd.DataFrame(np.zeros((n,n)), index=cols, columns=cols)
    p_values = pd.DataFrame(np.zeros((n,n)), index=cols, columns=cols)

    # Pull the columns out once, then correlate every ordered pair
    series = [X.iloc[:, k] for k in range(n)]
    for r, left in enumerate(series):
        for c, right in enumerate(series):
            r_val, p_val = pearsonr(left, right)
            coeff.iloc[r, c] = r_val
            p_values.iloc[r, c] = p_val

    corr_results_m_r_vs_variables[key] = {'coeff': coeff, 'p_values': p_values}

# test
corr_results_m_r_vs_variables['IE vs mr']['p_values']

Unnamed: 0,mr,IE
mr,0.0,0.602276
IE,0.602276,0.0


### Finding statistically significant variables and keeping them

In [None]:
# Significance level for the m/r correlations
alpha = 0.05
significant_components = {}
for name in mr_vs_PCs_names:
    tables = corr_results_m_r_vs_pcs[name]

    # p-values of m/r against each PC: the 'mr' row without its own diagonal cell
    without_mr = tables['p_values'].loc['mr'].drop('mr')
    significant_pcs = without_mr[without_mr < alpha]

    # Groups with no significant component get no entry at all
    if significant_pcs.empty:
        continue

    significant_components[name] = {
        'significant_PCs': significant_pcs.index.to_list(),
        'coeff': tables['coeff'].loc[significant_pcs.index, 'mr'],
        'p_values': significant_pcs,
    }

#test
significant_components

{'GHSI 2021 PC vs mr': {'significant_PCs': ['GHSI 2021 PC1'],
  'coeff': GHSI 2021 PC1    0.468803
  Name: mr, dtype: float64,
  'p_values': GHSI 2021 PC1    0.000006
  Name: mr, dtype: float64},
 'GHSI 2019 PC vs mr': {'significant_PCs': ['GHSI 2019 PC1', 'GHSI 2019 PC2'],
  'coeff': GHSI 2019 PC1    0.585945
  GHSI 2019 PC2    0.803350
  Name: mr, dtype: float64,
  'p_values': GHSI 2019 PC1    3.835291e-09
  GHSI 2019 PC2    2.233898e-20
  Name: mr, dtype: float64},
 'Prosperity vs mr': {'significant_PCs': ['Prosperity PC1'],
  'coeff': Prosperity PC1    0.369726
  Name: mr, dtype: float64,
  'p_values': Prosperity PC1    0.000497
  Name: mr, dtype: float64},
 'Age vs mr': {'significant_PCs': ['Age PC1'],
  'coeff': Age PC1    0.519266
  Name: mr, dtype: float64,
  'p_values': Age PC1    3.545416e-07
  Name: mr, dtype: float64},
 'Disease vs mr': {'significant_PCs': ['Disease PC1'],
  'coeff': Disease PC1   -0.310566
  Name: mr, dtype: float64,
  'p_values': Disease PC1    0.003819
 

In [None]:
# Collect the ungrouped predictors whose correlation with m/r is significant
# (p < alpha, with alpha set in the cell above).
significant_variables = {}
for name in mr_vs_variables_names:
    tables = corr_results_m_r_vs_variables[name]

    # p-value of m/r against the predictor: the 'mr' row without its own diagonal cell
    p_vs_mr = tables['p_values'].loc['mr'].drop('mr')
    significant = p_vs_mr[p_vs_mr < alpha]

    if not significant.empty:
        # Fix: store the entry in significant_variables under this comparison's key.
        # The previous code rebound significant_components instead, which left this
        # dictionary permanently empty and discarded the PC results of the cell above.
        # The inner keys mirror the layout used for significant_components.
        significant_variables[name] = {
            'significant_PCs': significant.index.to_list(),
            'coeff': tables['coeff'].loc[significant.index, 'mr'],
            'p_values': significant,
        }

significant_variables

{}