In [16]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Promethee for the HDI dataset
First approach to applying PROMETHEE method to the HDI dataset (multivariate)

1. Read the dataset and transform it in the good format

In [None]:
def transform_cols_in_sequence(data, col_id, range_of_years):
    cols = [col_id + year for year in range_of_years]

    # Transform col_id to remove "_"
    col_id = col_id.replace("_", "")

    # Add a new column named col_id (will be the sequence)
    data[col_id] = None

    for i, row in data.iterrows(): # For each row
        sequence = []
        for col in cols: # For each column
            sequence.append(row[col])
        sequence = np.array(sequence) # Transform the sequence into a numpy array
        data.at[i, col_id] = sequence # Replace the value of the new column with the sequence

    # Drop the columns that were transformed
    data = data.drop(columns=cols)

    return data

def read_data():
    PATH = "../data/HDI/HDR23-24_Composite_indices_complete_time_series.csv"
    data = pd.read_csv(PATH, encoding='latin1')
    data = data.dropna()

    years = [str(i) for i in range(1990, 2022)]

    fixed_col_names_to_keep = ['iso3', "hdi_rank_2022"]
    var_col_names_to_keep = ["co2_prod_", "pop_total_", "hdi_", "le_", "gdi_"]

    col_names_to_keep = fixed_col_names_to_keep + [var + year for var in var_col_names_to_keep for year in years]

    data = data[col_names_to_keep]

     # Transform the variable columns into sequences
    for col in var_col_names_to_keep: 
        data = transform_cols_in_sequence(data, col, years)

    return data


data = read_data()

In [None]:
def plot_data(data):
    """ 
    For each column, plot a graph with one color per row (country in this case)
    On the x-axis, the years (1990-2022)
    Using subplots
    """
    Nb_cols = len(data.columns) - 2
    fig, axs = plt.subplots(Nb_cols, 1, figsize=(10, 20))

    for i, col in enumerate(data.columns[2:]):
        for j, row in data.iterrows():
            axs[i].plot(row[col], label=row["iso3"])
        axs[i].set_title(col)
    plt.show()

# plot_data(data)
    

In [28]:
def scale_column(data, col):
    """
    Scale a column between 0 and 1 knowing that each row is a sequence np.array
    Scaling using the min and max of the column and not of the sequence each time!
    """
    min_val = data[col].apply(lambda x: x.min()).min()
    max_val = data[col].apply(lambda x: x.max()).max()
    data[col] = data[col].apply(lambda x: (x - min_val) / (max_val - min_val))
    return data

def scale_data(data):
    """
    Scale all the columns that are sequences
    """
    for col in data.columns[2:]:
        data = scale_column(data, col)
    return data

data = scale_data(data)

## 2. Applying Promethee
Now that the Data is scaled and in the correct format, let's apply PROMETHEE. Let's define the functions that will be needed:
- Difference between two time series `d_ab`
- The Information within criterion $P_j(a,b)$ (here let's do a simple linear one)
- The Aggregated preference $\Pi(a,b)$



In [32]:
K = len(data.columns) - 2 # 5
N = len(data)
P = [0.9, 0.9, 0.9, 0.9, 0.9] # Preference parameters
Q = [-0.1, -0.1, -0.1, -0.1, -0.1] # Indifference parameters
W = [0.2, 0.2, 0.2,0.2,0.2] # Weights

def d_ab(t1, t2):
    """ 
        Difference between two time series
        Parameters:
            t1: np.array
            t2: np.array
        Returns:
            np.array
    """
    return t1 - t2

def Pj_ab_linear(t1, t2, c):
    """ 
        Pj(a,b) for criteria k=c
        t1, t2: time series (one criteria)
        c: index of the criteria for the pref/indif threshold
        Returns the value of the preference function for this criteria
    """
    d = d_ab(t1, t2)
    for i in range(len(d)):
        if d[i] <= Q[c]:
            d[i] = 0
        elif d[i] > P[c]:
            d[i] = 1
        else:
            d[i] = (d[i] - Q[c]) / (P[c] - Q[c])
    return d

def PI_ab(t1, t2):
    """ 
        PI(a,b) for two time series
        t1, t2: rows of the data (two countries in this case)
        Returns the value of the preference function for the two time series
    """
    cols = t1.index[2:] # Get the columns that are sequences (criteria)
    res = 0
    for c in range(K):
        res += W[c] * Pj_ab_linear(t1[cols[c]], t2[cols[c]], c)
    return res

In [35]:
row1 = data.iloc[0]
row2 = data.iloc[1]

print(PI_ab(row1, row2))
print(PI_ab(row2, row1))


[0.39504682 0.40965987 0.38229018 0.37869231 0.37228879 0.36293946
 0.3660299  0.34972988 0.33268015 0.32126042 0.31953806 0.31829336
 0.31615153 0.3251108  0.32906611 0.32599219 0.33347954 0.32879092
 0.33318472 0.32482064 0.31220747 0.31149859 0.30536173 0.29469219
 0.29433059 0.28598672 0.27008202 0.26327575 0.25666627 0.24765926
 0.23385179 0.2304661 ]
[0.03060987 0.03084155 0.03106465 0.03130497 0.03155198 0.03179705
 0.03203062 0.03227778 0.03254748 0.03283006 0.03311781 0.0334122
 0.03370622 0.03398678 0.03425674 0.03450183 0.03468835 0.0348461
 0.03497251 0.03509709 0.03527773 0.03547697 0.03568316 0.03589727
 0.0361102  0.03631464 0.03653441 0.03676552 0.03698353 0.03719717
 0.03742703 0.04227662]


In [None]:
def PI_matrix(data):
    """ 
        Compute a matrix of size NxN where N is the number of rows in the data
        The matrix will contain the PI_ab sequence of values for each pair of rows
            - PI_matrix[i][j] will contain the PI_ab value for the rows i and j
            - PI_matrix[i][i] will contain a series of 0
    """
    size_of_sequence = data.iloc[1][3].shape[0]
    # PI matrix is a matrix of sequences of size
    PI_matrix = np.zeros((N, N, size_of_sequence))
    for i in range(N):
        for j in range(N):
            if i != j: # If the two rows are different as if i=j, PI_ab = 0
                PI_matrix[i][j] = PI_ab(data.iloc[i], data.iloc[j])
    return PI_matrix

PI_mat= PI_matrix(data)

  size_of_sequence = data.iloc[1][3].shape[0]


## Compute the net flow scores

In [None]:
def compute_ranking(PI):
    """
    Compute the ranking of the countries based on the PI matrix
    - The ranking is the sum of the PI values for each row
    Parameters:
        PI: np.array
    Returns:
        np.array
    """
    return np.argsort(np.sum(PI, axis=1))

ranking = compute_ranking(PI_mat) # Ranking of the countries based on the PI matrix

print(ranking)

[[30 31 29 28 27 26 25 24 23 22 21 20 13 17 12 19 15 14 18 11 16 10  7  9
   5  6  8  4  3  2  0  1]
 [ 1  0  2  3  4  6  5 17 18 16 19  7 21 13 20 14 15 22 24 25 23 12 26 11
  10 27  8  9 28 29 30 31]
 [30 31 28 29 27 24 26 25 22 20 23 21 17 19 16  3 11 18 13 14  5  4 15  0
   1 12  2  6  9 10  7  8]
 [24 23 14  3 22 27 21 25 30 28 15 20 16 19 26 17 18 29  9 31 13 11  0 10
   1  7  8  4  2  5  6 12]
 [19 29 30 31 23 28 25 24 18 20 26 27 22 21 16 17 15 14 13 12  6 11  9  2
  10  0  1  5  3  4  7  8]
 [27 28 29 26 31 25 30 24 23 22 21 20 18 14 15 19 17 11 13 16 10 12  7  9
   5  6  4  3  0  2  8  1]
 [ 0  1  2  3  4  5  6  7  9  8 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  31 24 25 29 30 26 28 27]
 [ 0  2  1  3  7  5  4  6  8 10  9 11 13 12 31 30 14 25 23 29 18 15 20 26
  19 21 24 16 22 27 17 28]
 [30 31 29 28 27 26 25 24 23 22 21 20 19 18 17 16 14 15 12 10 13 11  9  0
   5  7  6  4  3  2  8  1]
 [ 2  3  0  4  1  7  9 10  6  5 11  8 12 20 21 13 22 16 23 15 18 14 17 19
  24 26 27 25 28 