### Import

In [None]:
# Import
from importlib import reload

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import custom libraries
import analysis
from util import UtilityFunctions as uf

# Reload the local module BEFORE binding `an`: importlib.reload() re-executes
# the module but does not rebind names that were previously pulled in with
# `from analysis import ...`, so importing the class first would leave `an`
# pointing at the stale, pre-reload class after editing analysis.py.
reload(analysis)
from analysis import Analysis as an

# Attach the utility helpers to DataFrame so they can be invoked with method
# syntax further down (e.g. `country_dataframe.filter_features(...)`).
pd.DataFrame.filter_features = uf.filter_features
pd.DataFrame.filter_numerical_values = uf.filter_numerical_values
pd.DataFrame.filter_negative_values = uf.filter_negative_values
pd.DataFrame.filter_columns_with_less_unique_values_than_threshold = uf.filter_columns_with_less_unique_values_than_threshold
pd.DataFrame.drop_columns = uf.drop_columns

### Load processed data

In [None]:
# Relative location of the processed CSV files.
path = './../data/processed/'

# Collect the CSV files found in that folder; the entries are later handed to
# pd.read_csv, so they are presumably path strings (see util.UtilityFunctions).
processed_files = uf.get_csv_files_from_folder(path)

# Echo every discovered file, one per line.
for csv_path in processed_files:
    print(csv_path)

### Set hyperparameters for PCA

In [None]:
# How many of the features most correlated with `feature_of_interest` are kept
# before PCA (used as the argument to `.head(...)` in the analysis loop).
NUMBER_OF_FEATURES = 200
# Number of components requested from an.PCA; also the number of explained
# variance entries printed per dataset.
N_COMPONENTS = 3
# Column used as the correlation target when selecting features, and passed
# along to the PCA plot for the 2018 datasets.
feature_of_interest = 'E033' # Political self assessment scale V94, V114

### Set hyperparameters for KMeans

In [None]:
# KMeans is aliased to `km`; a fresh estimator is built for every dataset in
# the analysis loop.
from sklearn.cluster import KMeans as km
# Number of clusters: passed to KMeans as `n_clusters` and to an.plot_kmeans.
N_Clusters = 2

### Run PCA and KMeans on processed data and plot

In [None]:
# Toggles controlling which plots are produced for each dataset.
plot_pca = False
plot_kmeans = True

# For every processed CSV: optionally select the features most correlated with
# the political scale, run PCA, report the explained variance, then cluster
# the PCA output with KMeans.
for csv_file in processed_files:
    country_dataframe = pd.read_csv(csv_file)

    # Feature selection is only applied to files whose path contains '2018';
    # every other file is fed to PCA unfiltered.
    is_2018 = '2018' in csv_file

    if is_2018:
        # Find the NUMBER_OF_FEATURES most correlated with the political scale feature
        corr = (
            country_dataframe.drop(feature_of_interest, axis=1)
            .corrwith(country_dataframe[feature_of_interest])
            .abs()
            .sort_values(ascending=False)
        )
        n_features = corr.head(NUMBER_OF_FEATURES).index
        pca_input = country_dataframe.filter_features(n_features)
        plotting_data = (country_dataframe[feature_of_interest], 'Political self assessment scale', feature_of_interest)
    else:
        # Define these explicitly so the report below neither raises a
        # NameError (first file) nor reuses values left over from a
        # previously processed 2018 file.
        n_features = country_dataframe.columns
        pca_input = country_dataframe
        plotting_data = None

    # Perform PCA
    # NOTE(review): `pca` is handed straight to KMeans below, so it presumably
    # holds the projected samples rather than a fitted estimator - confirm
    # against analysis.Analysis.PCA.
    (explained_variance, pca) = an.PCA(pca_input, N_COMPONENTS)

    # Report explained variance, labelled with the file name up to its first dot
    dataset_name = csv_file.split('/')[-1].split('.')[0]
    print(f"Explained variance for {dataset_name} with {n_features.shape[0]} features")
    print([f"{r * 100:.2f}" for r in explained_variance[0:N_COMPONENTS]])

    # Visualize PCA (plotting_data is None for non-2018 files)
    if plot_pca:
        an.plot_PCA(pca, dataset_name, plotting_data)

    # Perform KMeans on the PCA output
    kmeans = km(n_clusters=N_Clusters)
    labels = kmeans.fit_predict(pca)
    centroids = kmeans.cluster_centers_
    print(f"KMeans for {dataset_name}")

    # Plot KMeans
    if plot_kmeans:
        an.plot_kmeans(N_Clusters, pca, labels, dataset_name)

