# GHSI puzzle
## Variable transformation


Importing libraries and loading the 2019 and 2021 GHSI data tables:


In [35]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
import sys
sys.path.append('./functions')
from DnRoutlier import DnRoutlier

# Read each table and index it by its label column in a single step.
# The 2019 file uses read_csv's default separator; the 2021 file is tab-separated
# despite its .csv extension.
# NOTE(review): "Row" presumably holds country names, since it is later
# intersected with the 2021 "Country" index - confirm against the data file.
ghsi_2019_df = pd.read_csv('data/mr_GHSI_demo_data_transf_out.csv').set_index("Row")
ghsi_2021_df = pd.read_csv('data/GHSI_2021_large_table.csv', sep="\t").set_index("Country")



Keeping only the countries that appear in both the 2019 and 2021 tables:

In [28]:
# Index labels shared by both tables.
common = ghsi_2019_df.index.intersection(ghsi_2021_df.index)

# Restrict both tables to the shared labels, in the same order.
ghsi_2019_df, ghsi_2021_df = (
    frame.loc[common] for frame in (ghsi_2019_df, ghsi_2021_df)
)

# Sanity check: after filtering, the two tables should have identical indexes.
ghsi_2019_df.index.equals(ghsi_2021_df.index)



True

Extracting the country names and predictor names into separate lists for later use

In [30]:
# Column (predictor) and row (country) labels of the 2021 table as plain lists,
# so they can be re-attached after the table is converted to a bare array.
col_names = ghsi_2021_df.columns.tolist()
country_names = ghsi_2021_df.index.tolist()

Converting the 2021 table into a NumPy matrix and applying a Box-Cox transformation to each predictor (column)

In [32]:
# Work on a bare array; row and column labels are dropped by to_numpy().
ghsi_2021_matrix = ghsi_2021_df.to_numpy()

# Box-Cox is only defined for strictly positive data. If any entry is <= 0,
# shift the whole matrix so that its smallest entry becomes machine epsilon.
# NOTE(review): np.min is taken over the entire matrix, so every column is
# shifted by the same amount, including columns that were already positive.
# Confirm this is intended rather than a per-column shift.
min_val = np.min(ghsi_2021_matrix)
if min_val <= 0:
    ghsi_2021_matrix = ghsi_2021_matrix - min_val + np.finfo(float).eps

# Output containers: the transformed matrix and one fitted lambda per column.
n_predictors = ghsi_2021_matrix.shape[1]
ghsi_2021_matrix_transformed = np.zeros(ghsi_2021_matrix.shape)
lambdas = np.zeros(n_predictors)

# Transform each predictor independently. Called without a lambda, boxcox
# returns the transformed values together with the lambda it fitted.
for col_idx, column in enumerate(ghsi_2021_matrix.T):
    ghsi_2021_matrix_transformed[:, col_idx], lambdas[col_idx] = boxcox(column)

Using DnRoutlier (Detect and Replace outlier) to replace the outliers of each predictor with that predictor's median

In [36]:
# DnRoutlier is a project helper imported from ./functions; per the notebook
# text it replaces outliers with the predictor's median (not verified here).
# NOTE: the result is bound back to the name of its own input, so re-running
# this cell alone applies DnRoutlier to already-processed data. Re-run the
# Box-Cox cell first to start again from the plain transformed matrix.
ghsi_2021_matrix_transformed = DnRoutlier(ghsi_2021_matrix_transformed)

Storing the transformed matrix in a labelled table (DataFrame)

In [38]:
# Re-attach the country (row) and predictor (column) labels to the processed matrix.
ghsi_2021_table_transformed = pd.DataFrame(
    data=ghsi_2021_matrix_transformed, index=country_names, columns=col_names
)
