### Import

In [None]:
# Import third-party libraries
import pandas as pd

# Import custom libraries
from util import UtilityFunctions as uf
from analysis import Analysis as an


### Load processed data

In [None]:
# Relative location of the folder that holds the processed CSV files.
path = './../data/processed/'

# Collect whatever uf.get_csv_files_from_folder finds under `path`, then echo
# each entry so the set of datasets about to be analysed is visible.
processed_files = uf.get_csv_files_from_folder(path)
for csv_path in processed_files:
    print(csv_path)

### Set hyperparameters for PCA

In [None]:
# Column whose correlations drive the feature selection below.
feature_of_interest = 'E033'  # Political self assessment scale

# Number of top-correlated features kept per dataset, and the component count
# handed to an.PCA.
NUMBER_OF_FEATURES = 200
N_COMPONENTS = 3

### Run PCA on processed data and plot

In [None]:
# For every processed dataset: keep the features most correlated with the
# political scale, run PCA on them, then print and plot the result.
for csv_file in processed_files:
    country_dataframe = pd.read_csv(csv_file)
    target = country_dataframe[feature_of_interest]

    # Rank every other column by absolute correlation with the target column
    # and keep the NUMBER_OF_FEATURES strongest ones.
    corr = (
        country_dataframe.drop(feature_of_interest, axis=1)
        .corrwith(target)
        .abs()
        .sort_values(ascending=False)
    )
    n_features = corr.head(NUMBER_OF_FEATURES).index

    # pandas DataFrames have no `filter_features` method (the original call
    # raised AttributeError); select the chosen columns by label instead.
    filter_df = country_dataframe[n_features]

    # Perform PCA
    explained_variance, pca = an.PCA(filter_df, N_COMPONENTS)

    # Visualize PCA
    plotting_data = (target, 'Political self assessment scale', feature_of_interest)
    # Dataset name = file name without directory and without extension(s).
    dataset_name = csv_file.split('/')[-1].split('.')[0]
    print(f"Explained variance for {dataset_name} with {len(n_features)} features")
    print([f"{r * 100:.2f}" for r in explained_variance[:N_COMPONENTS]])
    an.plot_PCA(pca, plotting_data, dataset_name)