In [1]:
import pandas as pd
import numpy as np
import random
import operator
import math
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.cluster.hierarchy import dendrogram

In [4]:
# Read the dataset
file_path = 'RWB_02112023195802159.csv'
data = pd.read_csv(file_path)

# Filter the dataset for 'Disposable income per capita' and 'Life expectancy at birth'
income_life_exp_data = data[data['Indicator'].isin(['Disposable income per capita', 'Life expectancy at birth'])]

# Pivot the data to have one row per region with both indicators
pivot_data = income_life_exp_data.pivot_table(index=['REG_ID', 'Regions', 'TIME', 'Time'], 
                                              columns='Indicator', 
                                              values='Value').reset_index()

# Normalize the features
scaler = MinMaxScaler()
features_to_cluster = pivot_data[['Disposable income per capita', 'Life expectancy at birth']]
normalized_features = scaler.fit_transform(features_to_cluster)
normalized_df = pd.DataFrame(normalized_features, columns=features_to_cluster.columns)


In [54]:
import numpy as np
from scipy.stats import binned_statistic

def entropy(values, no_bins):
    entropy = 0
    range_of_bins = (max(values) - min(values)) / no_bins
    bin_groupings = binned_statistic(values, values, bins=no_bins, range=(0, 1))[2]    
    for i in range(0, len(values)):
        prob = bin_groupings.tolist().count(bin_groupings[i]) / len(bin_groupings) 
        entropy -= prob * math.log2(prob)
    return entropy

# def information_needed(values):
    

In [44]:
normalized_df.iloc[:,1]

0      0.711111
1      0.703704
2      0.644444
3      0.755556
4      0.755556
         ...   
385    0.637037
386    0.303704
387    0.644444
388    0.518519
389    0.540741
Name: Life expectancy at birth, Length: 390, dtype: float64

In [45]:
v = [np.std(normalized_df.iloc[:,i]) / np.average(normalized_df.iloc[:,i]) for i in range(0, len(normalized_df.columns))]

In [46]:
v

[0.635103346305578, 0.33645365983452524]

In [47]:
w = [v[i] / sum(v) for i in range(0, len(v))]

In [48]:
w

[0.653696429845922, 0.34630357015407803]

In [56]:
entropy(normalized_df.iloc[:,1], 6)

178.44485989569915

In [59]:
def ewm(df):
    df = df.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x))))
    print(f"df {df}")
    rows, cols = df.shape 
    k = 1.0 / math.log(rows)
    
    p = df / df.sum(axis=0)
    lnf = -np.log(p , where = df!=0 )*p*k
    
    d = 1 - lnf.sum(axis=0)
    w = d / d.sum()
    
    w = pd.DataFrame(w)
    w = w.round(5)
    w.index = df.columns
    w.columns =['weight']
    return w

In [60]:
ewm(normalized_df)

df Indicator  Disposable income per capita  Life expectancy at birth
0                              0.364875                  0.711111
1                              0.381422                  0.703704
2                              0.357903                  0.644444
3                              0.351231                  0.755556
4                              0.358270                  0.755556
..                                  ...                       ...
385                            0.647735                  0.637037
386                            0.478299                  0.303704
387                            0.575244                  0.644444
388                            0.696057                  0.518519
389                            0.591757                  0.540741

[390 rows x 2 columns]


Unnamed: 0_level_0,weight
Indicator,Unnamed: 1_level_1
Disposable income per capita,0.77113
Life expectancy at birth,0.22887


In [82]:
# Standardizing values at each index
p = normalized_df.copy(deep=True)
for c in range(0, len(normalized_df.columns)):
    for d in range(0, len(normalized_df)):
        p.iloc[d, c] = normalized_df.iloc[d, c] / sum(normalized_df.iloc[:, c])
        
# Entropy value at each index
E = []
for c in range(0, len(p.columns)):
    s = 0
    for d in range(0, len(p)):
        if (p.iloc[d, c] != 0):
            s += p.iloc[d, c] * math.log(p.iloc[d, c])
    E.append(-s / len(normalized_df))

w = []
denom_w = 0
for i in range(0, len(E)):
    denom_w += 1 - E[i]
for i in range(0, len(E)):
    w.append((1 - E[i]) / denom_w)

In [None]:
w