# Analyse étendue de la concentration du pouvoir de vote

Ce notebook calcule plusieurs indicateurs de concentration du pouvoir pour chaque fichier de vote. Les mesures incluent :
- Coefficient de Gini
- Entropie de Shannon
- Indice de Theil
- Ratio du vote contrôlé par le top 1 %, 10 % et 50 %
- Indice de Herfindahl-Hirschman (HHI)

Les indicateurs sont ensuite corrélés entre eux, DAO par DAO (corrélation de Pearson avec niveaux de significativité), et notamment avec le coefficient de Gini, pour vérifier la cohérence des mesures. Seuls les fichiers contenant plus de **10 votants** sont pris en compte.


In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from tqdm import tqdm
from pandas.errors import ParserError, EmptyDataError

# Path to the directory holding the vote files (CSV).
# Each file must provide a 'voting_power' column with the voting weight of each voter.
VOTES_DIR = 'votes_data_karina'

# Files with this many usable voters or fewer are ignored.
MIN_VOTERS = 10

# Columns of the metrics table. Declared explicitly so the frame still has
# them (notably 'dao', which the correlation cell groups on) when no file
# qualifies and `records` stays empty.
METRIC_COLUMNS = [
    'proposal',
    'dao',
    'gini',
    'shannon_entropy',
    'theil',
    'hhi',
    'top1_share',
    'top10_share',
    'top50_share',
]


def gini_coefficient(array):
    """Return the Gini coefficient of the strictly positive entries of *array*.

    Entries that are zero, negative or NaN are discarded first. Returns
    ``None`` when no strictly positive entry remains.
    """
    values = np.asarray(array, dtype=float)
    values = np.sort(values[values > 0])
    count = len(values)
    if count == 0:
        return None
    ranks = np.arange(1, count + 1)
    return (2 * np.sum(ranks * values)) / (count * np.sum(values)) - (count + 1) / count


def _top_share(ranked_power, fraction, total_power):
    """Return the share of *total_power* held by the top *fraction* of voters.

    *ranked_power* must be sorted in descending order. At least one voter is
    always counted, even when ``fraction * len(ranked_power)`` is below 1.
    """
    top_count = max(1, int(fraction * len(ranked_power)))
    return ranked_power.iloc[:top_count].sum() / total_power


def _concentration_metrics(voting_power):
    """Compute the concentration indicators for one vote file.

    *voting_power* is a float Series without NaN. The Gini coefficient uses
    only the strictly positive entries (see ``gini_coefficient``); every other
    indicator uses all entries.

    Returns a dict keyed by metric name, or ``None`` when the file has no
    strictly positive voting power or its total is not positive (shares would
    be undefined or meaningless).
    """
    gini = gini_coefficient(voting_power.to_numpy())
    total_power = voting_power.sum()
    if gini is None or not total_power > 0:
        return None

    shares = voting_power / total_power
    mean_power = voting_power.mean()
    ranked = voting_power.sort_values(ascending=False)

    return {
        'gini': gini,
        # The 1e-12 offset keeps log() finite for zero-valued entries.
        'shannon_entropy': -(shares * np.log(shares + 1e-12)).sum(),
        'theil': (
            voting_power / mean_power * np.log((voting_power + 1e-12) / mean_power)
        ).mean(),
        'hhi': (shares ** 2).sum(),
        'top1_share': _top_share(ranked, 0.01, total_power),
        'top10_share': _top_share(ranked, 0.10, total_power),
        'top50_share': _top_share(ranked, 0.50, total_power),
    }


records = []
# sorted() makes the row order reproducible; os.listdir() returns entries in
# arbitrary order.
for filename in sorted(os.listdir(VOTES_DIR)):
    if not filename.endswith('.csv'):
        continue
    try:
        df = pd.read_csv(os.path.join(VOTES_DIR, filename))
    except (ParserError, EmptyDataError):
        # Malformed or empty files are skipped.
        continue
    if 'voting_power' not in df.columns:
        continue

    # Non-numeric cells become NaN and are dropped.
    voting_power = (
        pd.to_numeric(df['voting_power'], errors='coerce').dropna().astype(float)
    )
    if len(voting_power) <= MIN_VOTERS:
        continue

    file_metrics = _concentration_metrics(voting_power)
    if file_metrics is None:
        continue

    records.append({
        'proposal': filename,
        # The DAO identifier is the part of the file name before the first '_'.
        'dao': filename.split('_')[0],
        **file_metrics,
    })


df_metrics = pd.DataFrame(records, columns=METRIC_COLUMNS)
df_metrics.head()


In [None]:
# Correlation matrix per DAO
metrics = ['gini', 'shannon_entropy', 'theil', 'hhi', 'top1_share', 'top10_share', 'top50_share']

# Significance thresholds, strictest first, with the marker appended to the
# coefficient when the p-value is below the threshold.
_SIGNIF_LEVELS = ((0.001, '***'), (0.01, '**'), (0.05, '*'))


def _significance_mark(p_value):
    """Return the star marker for *p_value* ('' when it is >= 0.05 or NaN)."""
    for threshold, stars in _SIGNIF_LEVELS:
        if p_value < threshold:
            return stars
    return ''


def corr_with_significance(df, columns=None):
    """Return a Pearson correlation table annotated with significance stars.

    Each cell is the coefficient formatted with two decimals, followed by
    '***' (p < 0.001), '**' (p < 0.01), '*' (p < 0.05) or nothing.

    Parameters
    ----------
    df : DataFrame holding the numeric columns to correlate.
    columns : optional list of column names; defaults to the module-level
        ``metrics`` list.

    Returns
    -------
    DataFrame of strings indexed (rows and columns) by the column names.

    Notes
    -----
    The coefficient and its p-value both come from the same ``pearsonr``
    call, run on the rows where both columns are finite, so the stars always
    describe the coefficient shown. A pair with fewer than two usable rows is
    rendered as 'nan'.
    """
    cols = list(metrics if columns is None else columns)
    data = {name: df[name].to_numpy(dtype=float) for name in cols}

    cells = {name: {} for name in cols}
    for pos, first in enumerate(cols):
        # The matrix is symmetric: compute each pair once and mirror it.
        for second in cols[pos:]:
            x, y = data[first], data[second]
            usable = np.isfinite(x) & np.isfinite(y)
            if usable.sum() >= 2:
                r_val, p_val = pearsonr(x[usable], y[usable])
            else:
                r_val, p_val = np.nan, np.nan
            cell = f"{r_val:.2f}{_significance_mark(p_val)}"
            cells[first][second] = cell
            cells[second][first] = cell

    return pd.DataFrame(cells, index=cols, columns=cols)

# Build one annotated correlation table per DAO, keyed by DAO name.
dao_corr = {}
for dao, grp in df_metrics.groupby('dao'):
    # A DAO with a single row cannot be correlated; leave it out.
    if len(grp) <= 1:
        continue
    dao_corr[dao] = corr_with_significance(grp)

# Trailing expression so the notebook displays the result.
dao_corr


In [None]:
# Sauvegarde éventuelle des résultats
# df_metrics.to_csv('voting_power_metrics.csv', index=False)
