# Remake the EDA notebook for the streamlit app

### Librairy imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy as sp
import plotly.express as xp

### Load the cleaned csv file

In [None]:
# Import the csv file
df = pd.read_csv("../app/clean_dataset_legislative_2024.csv", sep=";")



## Number of candidates analysis (Univariate and Multivariate)

In [None]:
count_by_candidates = df.groupby("Nombre_candidats").agg(
    Number_of_Cities= ("Libellé_commune", "size"),
    Average_Inscrits = ("Inscrits", "mean")
).reset_index()
count_by_candidates.head(15)

In [4]:
import plotly.express as px

# Assuming 'count_by_candidates' is a Pandas DataFrame
fig = px.bar(count_by_candidates, 
            x="Nombre_candidats", 
            y="Number_of_Cities",
            title="Number of Cities vs. Nombre de Candidats",
            labels={'Nombre_candidats':'Nombre de Candidats', 
            'Number_of_Cities':'Number of Cities'}
            )
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [5]:
# Most second turn confrontations that happen are either duels or triangles followed by 4 candidates
df_filtered = df[df["Nombre_candidats"] > 7].sort_values(by="Inscrits", ascending=False).reset_index()
df_filtered.head(15)

Unnamed: 0,index,Code_département,Libellé_département,Code_commune,Libellé_commune,Inscrits,Votants,Pourcentage_Votants,Abstentions,Pourcentage_Abstentions,...,Numéro_de_panneau_18,Nuance_candidat_18,Nom_candidat_18,Prénom_candidat_18,Sexe_candidat_18,Voix_18,Pourcentage_Voix/inscrits_18,Pourcentage_Voix/exprimés_18,Elu_18,Nombre_candidats
0,26004,75,Paris,75056,Paris,687417,459856,66.9,227561,33.1,...,12.0,UG,ROSSET,Marine,FEMININ,21784.0,3.17,5.0,False,18
1,3167,13,Bouches-du-Rhône,13055,Marseille,400366,258783,64.64,141583,35.36,...,,,,,,,,,False,10
2,24074,69,Rhône,69123,Lyon,225440,166294,73.76,59146,26.24,...,,,,,,,,,False,8
3,10580,31,Haute-Garonne,31555,Toulouse,196238,135652,69.13,60586,30.87,...,,,,,,,,,False,12
4,11133,33,Gironde,33063,Bordeaux,166625,114734,68.86,51891,31.14,...,,,,,,,,,False,8
5,23522,67,Bas-Rhin,67482,Strasbourg,142722,97767,68.5,44955,31.5,...,,,,,,,,,False,8
6,20039,59,Nord,59350,Lille,122500,81459,66.5,41041,33.5,...,,,,,,,,,False,13
7,14964,44,Loire-Atlantique,44109,Nantes,106987,73390,68.6,33597,31.4,...,,,,,,,,,False,12
8,11688,34,Hérault,34172,Montpellier,101456,64608,63.68,36848,36.32,...,,,,,,,,,False,8
9,17221,51,Marne,51454,Reims,101169,59940,59.25,41229,40.75,...,,,,,,,,,False,8


### Add the region column

In [6]:
# Use mapping to add the regions of the departement
departements_regions = {
    "01": "Auvergne-Rhône-Alpes",
    "02": "Hauts-de-France",
    "03": "Auvergne-Rhône-Alpes",
    "04": "Provence-Alpes-Côte d'Azur",
    "05": "Provence-Alpes-Côte d'Azur",
    "06": "Provence-Alpes-Côte d'Azur",
    "07": "Auvergne-Rhône-Alpes",
    "08": "Grand Est",
    "09": "Occitanie",
    "10": "Grand Est",
    "11": "Occitanie",
    "12": "Occitanie",
    "13": "Provence-Alpes-Côte d'Azur",
    "14": "Normandie",
    "15": "Auvergne-Rhône-Alpes",
    "16": "Nouvelle-Aquitaine",
    "17": "Nouvelle-Aquitaine",
    "18": "Centre-Val de Loire",
    "19": "Nouvelle-Aquitaine",
    "2A": "Corse",
    "2B": "Corse",
    "21": "Bourgogne-Franche-Comté",
    "22": "Bretagne",
    "23": "Nouvelle-Aquitaine",
    "24": "Nouvelle-Aquitaine",
    "25": "Bourgogne-Franche-Comté",
    "26": "Auvergne-Rhône-Alpes",
    "27": "Normandie",
    "28": "Centre-Val de Loire",
    "29": "Bretagne",
    "30": "Occitanie",
    "31": "Occitanie",
    "32": "Occitanie",
    "33": "Nouvelle-Aquitaine",
    "34": "Occitanie",
    "35": "Bretagne",
    "36": "Centre-Val de Loire",
    "37": "Centre-Val de Loire",
    "38": "Auvergne-Rhône-Alpes",
    "39": "Bourgogne-Franche-Comté",
    "40": "Nouvelle-Aquitaine",
    "41": "Centre-Val de Loire",
    "42": "Auvergne-Rhône-Alpes",
    "43": "Auvergne-Rhône-Alpes",
    "44": "Pays de la Loire",
    "45": "Centre-Val de Loire",
    "46": "Occitanie",
    "47": "Nouvelle-Aquitaine",
    "48": "Occitanie",
    "49": "Pays de la Loire",
    "50": "Normandie",
    "51": "Grand Est",
    "52": "Grand Est",
    "53": "Pays de la Loire",
    "54": "Grand Est",
    "55": "Grand Est",
    "56": "Bretagne",
    "57": "Grand Est",
    "58": "Bourgogne-Franche-Comté",
    "59": "Hauts-de-France",
    "60": "Hauts-de-France",
    "61": "Normandie",
    "62": "Hauts-de-France",
    "63": "Auvergne-Rhône-Alpes",
    "64": "Nouvelle-Aquitaine",
    "65": "Occitanie",
    "66": "Occitanie",
    "67": "Grand Est",
    "68": "Grand Est",
    "69": "Auvergne-Rhône-Alpes",
    "70": "Bourgogne-Franche-Comté",
    "71": "Bourgogne-Franche-Comté",
    "72": "Pays de la Loire",
    "73": "Auvergne-Rhône-Alpes",
    "74": "Auvergne-Rhône-Alpes",
    "75": "Île-de-France",
    "76": "Normandie",
    "77": "Île-de-France",
    "78": "Île-de-France",
    "79": "Nouvelle-Aquitaine",
    "80": "Hauts-de-France",
    "81": "Occitanie",
    "82": "Occitanie",
    "83": "Provence-Alpes-Côte d'Azur",
    "84": "Provence-Alpes-Côte d'Azur",
    "85": "Pays de la Loire",
    "86": "Nouvelle-Aquitaine",
    "87": "Nouvelle-Aquitaine",
    "88": "Grand Est",
    "89": "Bourgogne-Franche-Comté",
    "90": "Bourgogne-Franche-Comté",
    "91": "Île-de-France",
    "92": "Île-de-France",
    "93": "Île-de-France",
    "94": "Île-de-France",
    "95": "Île-de-France",
    "971": "Outre-Mer",
    "972": "Outre-Mer",
    "973": "Outre-Mer",
    "974": "Outre-Mer",
    "975": "Outre-Mer",
    "976": "Outre-Mer",
    "987": "Outre-Mer",
    "988": "Outre-Mer",
    "ZX": "Outre-Mer",
    "ZZ": "Étranger",  
}

In [7]:
# Adding a "0" value before the one digit numbers
df["Code_département"] = df["Code_département"].astype(str).apply(lambda x: "0" + x if len(x) == 1 else x)
uniques_departements = df["Code_département"].unique()
print(f"List of the uniques_departements: {uniques_departements}")
# Create the new Region column with mapping
df['Libellé_Région'] = df['Code_département'].map(departements_regions)
# Display the unique regions 
unique_regions = df['Libellé_Région'].unique()
print(f"List of the unique_regions: {unique_regions}")

List of the uniques_departements: ['01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13' '14'
 '15' '16' '17' '18' '19' '21' '22' '23' '24' '25' '26' '27' '28' '29'
 '2A' '2B' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55'
 '56' '57' '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69'
 '70' '71' '72' '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83'
 '84' '85' '86' '87' '88' '89' '90' '91' '92' '93' '94' '95' '971' '972'
 '973' '974' '975' '976' '987' '988' 'ZX' 'ZZ']
List of the unique_regions: ['Auvergne-Rhône-Alpes' 'Hauts-de-France' "Provence-Alpes-Côte d'Azur"
 'Grand Est' 'Occitanie' 'Normandie' 'Nouvelle-Aquitaine'
 'Centre-Val de Loire' 'Bourgogne-Franche-Comté' 'Bretagne' 'Corse'
 'Pays de la Loire' 'Île-de-France' 'Outre-Mer' 'Étranger']


In [23]:
# Create a new dataset to store the values we wanna proceed in the analysis
df_candidates = pd.concat([
                                df['Nom_candidat_' + str(i)] + ' ' + 
                                df['Prénom_candidat_' + str(i)] + '#' +
                                df['Sexe_candidat_' + str(i)] + '#' +
                                df['Nuance_candidat_' + str(i)] + '#' +
                                df['Voix_' + str(i)].astype(str) + '#' +
                                df['Pourcentage_Voix/exprimés_' + str(i)].astype(str) + "#" +
                                df['Elu_' + str(i)].astype(str) + '#' + 
                                df['Libellé_Région'] + '#' +
                                df['Libellé_département'] + '#' +
                                df['Libellé_commune'] 
                                for i in range(1, 19)], ignore_index=True)
# Drop any NaN values
df_candidates.dropna(inplace=True)
# Split the combined data and keep only 'Nom_complet' and 'Sexe'
df_candidates = df_candidates.str.split('#', expand=True)
df_candidates.columns = ['Nom_complet', 'Sexe', 'Nuance', 'Voix', 'Pourcentage_Voix', 'Elu', 'Région', 'Département', 'Commune']
df_candidates.head(5)

Unnamed: 0,Nom_complet,Sexe,Nuance,Voix,Pourcentage_Voix,Elu,Région,Département,Commune
0,COQUELET Christophe,MASCULIN,ENS,203,42.65,False,Auvergne-Rhône-Alpes,Ain,L'Abergement-Clémenciat
1,PISANI Florence,FEMININ,UG,93,56.02,False,Auvergne-Rhône-Alpes,Ain,L'Abergement-de-Varey
2,PISANI Florence,FEMININ,UG,2606,48.73,False,Auvergne-Rhône-Alpes,Ain,Ambérieu-en-Bugey
3,COQUELET Christophe,MASCULIN,ENS,388,41.9,False,Auvergne-Rhône-Alpes,Ain,Ambérieux-en-Dombes
4,GIVERNET Olga,FEMININ,ENS,32,54.24,True,Auvergne-Rhône-Alpes,Ain,Ambléon


In [28]:
# transform the 'Elu' column to boolean
df_candidates['Elu'] = df_candidates['Elu'].astype('bool')
# transform the 'Pourcentage_Voix' column to float
df_candidates['Pourcentage_Voix'] = df_candidates['Pourcentage_Voix'].astype('float32')
# transform the 'Voix' column to int
df_candidates['Voix'] = df_candidates['Voix'].astype('int32')
# display the info of the dataset
df_candidates.info()
# display the shape of the dataset
df_candidates.shape
# display the dataset
df_candidates.head(5)

<class 'pandas.core.frame.DataFrame'>
Index: 66472 entries, 0 to 559668
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Nom_complet       66472 non-null  object 
 1   Sexe              66472 non-null  object 
 2   Nuance            66472 non-null  object 
 3   Voix              66472 non-null  int32  
 4   Pourcentage_Voix  66472 non-null  float32
 5   Elu               66472 non-null  bool   
 6   Région            66472 non-null  object 
 7   Département       66472 non-null  object 
 8   Commune           66472 non-null  object 
dtypes: bool(1), float32(1), int32(1), object(6)
memory usage: 4.1+ MB


Unnamed: 0,Nom_complet,Sexe,Nuance,Voix,Pourcentage_Voix,Elu,Région,Département,Commune
0,COQUELET Christophe,MASCULIN,ENS,203,42.650002,True,Auvergne-Rhône-Alpes,Ain,L'Abergement-Clémenciat
1,PISANI Florence,FEMININ,UG,93,56.02,True,Auvergne-Rhône-Alpes,Ain,L'Abergement-de-Varey
2,PISANI Florence,FEMININ,UG,2606,48.73,True,Auvergne-Rhône-Alpes,Ain,Ambérieu-en-Bugey
3,COQUELET Christophe,MASCULIN,ENS,388,41.900002,True,Auvergne-Rhône-Alpes,Ain,Ambérieux-en-Dombes
4,GIVERNET Olga,FEMININ,ENS,32,54.240002,True,Auvergne-Rhône-Alpes,Ain,Ambléon


In [49]:
# Add the tendency of the candidates
# Create a mapping for the tendency of all the parties from Gauche Radicale to Extrême Droite
tendency_mapping = {
    "ENS": "Centre-Droite",
    "UG": "Gauche",
    "RN": "Extrême-Droite",
    "LR": "Droite",
    "UXD": "Extrême-Droite",
    "DVD": "Droite",
    "HOR": "Centre-Droite",
    "DVC": "Centre",
    "REG": "Autres",
    "EXD": "Extrême-Droite",
    "DIV": "Autres",
    "ECO": "Gauche-Radicale",
    "DVG": "Gauche",
    "UDI": "Droite",
    "SOC": "Gauche",
    "DSV": "Extrême-Droite",
    "FI": "Gauche-Radicale"  
}

# Create the new Region column with mapping
df_candidates['Tendency'] = df_candidates['Nuance'].map(tendency_mapping)

In [51]:
# save the dataset to a csv file
df_candidates.to_csv("../app/candidates.csv", sep=";", header=True, index=False)

In [50]:
df_candidates.head()

Unnamed: 0,Nom_complet,Sexe,Nuance,Voix,Pourcentage_Voix,Elu,Région,Département,Commune,Tendency
0,COQUELET Christophe,MASCULIN,ENS,203,42.650002,True,Auvergne-Rhône-Alpes,Ain,L'Abergement-Clémenciat,Centre-Droite
1,PISANI Florence,FEMININ,UG,93,56.02,True,Auvergne-Rhône-Alpes,Ain,L'Abergement-de-Varey,Gauche
2,PISANI Florence,FEMININ,UG,2606,48.73,True,Auvergne-Rhône-Alpes,Ain,Ambérieu-en-Bugey,Gauche
3,COQUELET Christophe,MASCULIN,ENS,388,41.900002,True,Auvergne-Rhône-Alpes,Ain,Ambérieux-en-Dombes,Centre-Droite
4,GIVERNET Olga,FEMININ,ENS,32,54.240002,True,Auvergne-Rhône-Alpes,Ain,Ambléon,Centre-Droite


In [75]:
# count the number of votes for each political parties
count_votes_per_party = df_candidates.groupby(['Tendency', 'Nuance'])['Voix'].sum()
count_votes_per_party.head(count_votes_per_party.shape[0])
# create a dataframe with the count of votes for each political parties
votes_df = pd.DataFrame(count_votes_per_party).reset_index()
# display the dataframe
votes_df.head(votes_df.shape[0])


Unnamed: 0,Tendency,Nuance,Voix
0,Autres,DIV,38025
1,Autres,REG,288202
2,Centre,DVC,177167
3,Centre-Droite,ENS,6313808
4,Centre-Droite,HOR,258139
5,Droite,DVD,980818
6,Droite,LR,1474650
7,Droite,UDI,119672
8,Extrême-Droite,DSV,18672
9,Extrême-Droite,EXD,23217


In [76]:
# add all the counts 
total_votes = count_votes_per_party.sum()
print(f"Total number of votes: {total_votes}")

Total number of votes: 27279714


In [78]:
# calculate the percentage of votes for each political parties
count_votes_per_tendency = df_candidates.groupby(['Tendency'])['Voix'].sum()
count_votes_per_tendency.apply(lambda x: x / total_votes * 100)
count_votes_per_tendency.head(count_votes_per_party.shape[0])

Tendency
Autres               326227
Centre               177167
Centre-Droite       6571947
Droite              2575140
Extrême-Droite     10150933
Gauche              7432131
Gauche-Radicale       46169
Name: Voix, dtype: int32

In [90]:
# plot the number of votes for each political parties with plotly
palette_nuances = {
                    'Gauche-Radicale': '#E4572E',
                    'Gauche': '#FE4A49', 
                    "Centre": "#B1EDE8", 
                    "Centre-Droite": "#4EA5FF",
                    "Droite": "#207BFF",
                    "Extrême-Droite": "#053C5E",
                    "Autres": "#773344",  
                }
tendencies_plot_order = ['Gauche-Radicale','Gauche', 'Centre', 'Centre-Droite', 'Droite', 'Extrême-Droite', 'Autres']
        

# Create the Plotly bar chart 
fig = px.bar(votes_df,
    x='Nuance',                 
    y='Voix',             
    color='Tendency',
    color_discrete_map=palette_nuances,
    title="Distribution of Votes by Political Tendency", 
    labels={'x': 'Political Tendency', 'y': 'Number of Votes'},
    category_orders={"Tendency": tendencies_plot_order}
)

fig.update_layout(
    xaxis_title="Political Tendency",
    yaxis_title="Percentage of Votes (%)",
    showlegend=True,
    plot_bgcolor='white',
    legend_title_text='Tendencies'
)

fig.update_traces(texttemplate='%{y:.0f}', textposition='outside') 
fig.show()

fig = px.pie(
            count_votes_per_tendency, 
            values='Voix', 
            names=count_votes_per_tendency.index,
            color=count_votes_per_tendency.index,
            color_discrete_map=palette_nuances,  
            hole=0.5, 
            title='Distribution of Political Tendencies' ,
            category_orders={"Tendency": tendencies_plot_order}
)
fig.show()


In [None]:
fig = px.bar(
    count_votes_per_party,
    x=count_votes_per_party.index,
    y=count_votes_per_party.values,
    text=count_votes_per_party.values.round(2),
    labels={'x': 'Political Tendency', 'y': 'Percentage of Votes (%)'},
    title="Vote Percentage by Political Tendency",

)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(yaxis_ticksuffix="%")
st.plotly_chart(fig)

# 2. Optional: Pie Chart with Custom Colors and Order
fig_pie = px.pie(
    values=count_votes_per_party.values,
    names=count_votes_per_party.index,
    title='Overall Vote Share by Political Tendency',
    color_discrete_sequence=list(palette_nuances.values())  # Apply colors to pie slices
)
st.plotly_chart(fig_pie)