In [89]:
import altair as alt
import pandas as pd
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
import toolz

import os

def custom(data):
    """Altair data transformer that stores chart data in a JSON file.

    The data is piped through ``alt.to_json`` using a filename template under
    the ``altdata/`` directory (Altair fills in the ``{prefix}``, ``{hash}``
    and ``{extension}`` placeholders).

    Args:
        data: The chart data handed over by Altair.

    Returns:
        Whatever ``alt.to_json`` returns for ``data``.
    """
    template = 'altdata/{prefix}-{hash}.{extension}'
    # Make sure the output directory exists before the JSON file is written;
    # otherwise the first chart rendered on a fresh checkout fails.
    os.makedirs(os.path.dirname(template), exist_ok=True)
    return toolz.curried.pipe(data, alt.to_json(filename=template))
# Expose the transformer above to Altair under the name 'custom', then make
# it the active data transformer.
_transformers = alt.data_transformers
_transformers.register('custom', custom)
_transformers.enable('custom')

# No-op: the cell ends on a statement instead of the enable(...) expression,
# presumably so the notebook does not display its return value.
pass

In [90]:
# Load the ";"-separated names file and normalise it in a single chain:
#   1. discard the '_PRENOMS_RARES' rows and the rows whose dpt is 'XX'
#      (must happen before the integer casts below),
#   2. cast the year and count columns to int,
#   3. recode sexe 1/2 as 'M'/'F',
#   4. rename the year column 'annais' to 'annee'.
df = (
    pd.read_csv("dpt2020.csv", sep=";")
    .loc[lambda t: (t['preusuel'] != '_PRENOMS_RARES') & (t['dpt'] != 'XX')]
    .astype({'annais': int, 'nombre': int})
    .assign(sexe=lambda t: t['sexe'].replace({1: 'M', 2: 'F'}))
    .rename(columns={"annais": "annee"})
)

In [91]:
# Most stable names among the most popular ones.
#
# For each (name, sex) pair in the per-sex popularity pool, compute the mean
# absolute year-over-year change in births, normalised by the pair's mean
# yearly births, and plot the pairs with the smallest value for each sex.

# Study window (inclusive bounds on the birth year) and selection sizes.
periode_debut = 2010
periode_fin = 2015
pool_size = 250      # most-given names kept per sex before measuring stability
display_count = 10   # names shown per sex in the chart
group_keys = ['preusuel', 'sexe']

df_filtered = df[df['annee'].between(periode_debut, periode_fin)]

# Total births per (name, sex) over the window.
df_total = df_filtered.groupby(group_keys, as_index=False)['nombre'].sum()

# Restrict to the `pool_size` most given names of each sex, so that names
# given only a handful of times do not enter the ranking.
top_250_boys = df_total[df_total['sexe'] == 'M'].nlargest(pool_size, 'nombre')
top_250_girls = df_total[df_total['sexe'] == 'F'].nlargest(pool_size, 'nombre')
selected_pairs = pd.concat([top_250_boys, top_250_girls])[group_keys]
filtered_names = selected_pairs['preusuel'].unique()

# Filter on the (name, sex) pair, not on the name alone: a name-only filter
# would also keep, for example, the male rows of a name that only made the
# girls' list, bringing pairs outside the pool into the ranking.
df_filtered = df_filtered.merge(selected_pairs, on=group_keys, how='inner')

# Births per (name, sex, year).
df_grouped = df_filtered.groupby(group_keys + ['annee'], as_index=False)['nombre'].sum()

# Mean yearly births of each (name, sex) over the window.
df_mean = (
    df_grouped.groupby(group_keys, as_index=False)['nombre']
    .mean()
    .rename(columns={'nombre': 'nombre_moyen'})
)

# Year-over-year change for each (name, sex). The rows come out of the groupby
# above ordered by name, sex, year, so diff() compares each year with the
# previous recorded one; the first year of every pair is NaN.
df_grouped['variation'] = df_grouped.groupby(group_keys)['nombre'].diff()
df_grouped['variation_absolue'] = df_grouped['variation'].abs()

# Attach the mean yearly births and normalise the absolute change by it.
df_grouped = pd.merge(df_grouped, df_mean, on=group_keys)
df_grouped['variation_normalisee'] = df_grouped['variation_absolue'] / df_grouped['nombre_moyen']

# Average normalised change per (name, sex); mean() skips the NaN first year.
df_aggregated = df_grouped.groupby(group_keys, as_index=False)['variation_normalisee'].mean()

# The `display_count` pairs with the smallest variation per sex, displayed
# from the largest to the smallest of those values.
top_10_boys = (
    df_aggregated[df_aggregated['sexe'] == 'M']
    .nsmallest(display_count, 'variation_normalisee')
    .sort_values(by='variation_normalisee', ascending=False)
)
top_10_girls = (
    df_aggregated[df_aggregated['sexe'] == 'F']
    .nsmallest(display_count, 'variation_normalisee')
    .sort_values(by='variation_normalisee', ascending=False)
)

# Interleave boys and girls (boy 1, girl 1, boy 2, girl 2, ...). The stable
# sort keeps the boy ahead of the girl of the same rank, and the columns keep
# their dtypes (no row-by-row reconstruction).
df_alternated = (
    pd.concat([
        top_10_boys.assign(_rank=range(len(top_10_boys))),
        top_10_girls.assign(_rank=range(len(top_10_girls))),
    ])
    .sort_values('_rank', kind='stable')
    .drop(columns='_rank')
)

chart = alt.Chart(df_alternated).mark_bar().encode(
    x=alt.X('preusuel:N', title='Prénom', sort=None),  # sort=None keeps the data order
    y=alt.Y('variation_normalisee:Q', title='Variation Normalisée'),
    color=alt.Color('sexe:N', scale=alt.Scale(domain=['M', 'F'], range=['blue', 'pink']), title='Sexe'),
    xOffset=alt.XOffset("sexe:N")
).properties(
    width=800
)

chart

In [92]:
# Most stable names among the least popular ones.
#
# Same measure as for the popular names (mean absolute year-over-year change
# in births, normalised by the mean yearly births), but computed on the
# least-given names of each sex and ignoring years without any change.

# Study window (inclusive bounds on the birth year) and selection sizes.
periode_debut = 2010
periode_fin = 2015
pool_size = 250      # least-given names kept per sex
display_count = 10   # names shown per sex in the chart
group_keys = ['preusuel', 'sexe']

df_filtered = df[df['annee'].between(periode_debut, periode_fin)]

# Total births per (name, sex) over the window.
df_total = df_filtered.groupby(group_keys, as_index=False)['nombre'].sum()

bottom_250_boys = df_total[df_total['sexe'] == 'M'].nsmallest(pool_size, 'nombre')
bottom_250_girls = df_total[df_total['sexe'] == 'F'].nsmallest(pool_size, 'nombre')
selected_pairs = pd.concat([bottom_250_boys, bottom_250_girls])[group_keys]
filtered_names = selected_pairs['preusuel'].unique()

# Filter on the (name, sex) pair, not on the name alone: a name that is rare
# for one sex can be common for the other, and a name-only filter would let
# those common rows into the "rare names" pool.
df_filtered = df_filtered.merge(selected_pairs, on=group_keys, how='inner')

# Births per (name, sex, year).
df_grouped = df_filtered.groupby(group_keys + ['annee'], as_index=False)['nombre'].sum()

# Mean yearly births of each (name, sex) over the window.
df_mean = (
    df_grouped.groupby(group_keys, as_index=False)['nombre']
    .mean()
    .rename(columns={'nombre': 'nombre_moyen'})
)

# Year-over-year change for each (name, sex); the first recorded year of every
# pair is NaN. Only years present in the data are compared with each other.
df_grouped['variation'] = df_grouped.groupby(group_keys)['nombre'].diff()
df_grouped['variation_absolue'] = df_grouped['variation'].abs()

# Attach the mean yearly births and normalise the absolute change by it.
df_grouped = pd.merge(df_grouped, df_mean, on=group_keys)
df_grouped['variation_normalisee'] = df_grouped['variation_absolue'] / df_grouped['nombre_moyen']

# Drop the years with exactly zero change. NaN rows pass this filter
# (NaN != 0 is True) and are skipped by mean() below.
df_grouped_non_null = df_grouped[df_grouped['variation_normalisee'] != 0]

df_aggregated = df_grouped_non_null.groupby(group_keys, as_index=False)['variation_normalisee'].mean()

# The `display_count` pairs with the smallest variation per sex, displayed
# from the largest to the smallest of those values.
top_10_boys = (
    df_aggregated[df_aggregated['sexe'] == 'M']
    .nsmallest(display_count, 'variation_normalisee')
    .sort_values(by='variation_normalisee', ascending=False)
)
top_10_girls = (
    df_aggregated[df_aggregated['sexe'] == 'F']
    .nsmallest(display_count, 'variation_normalisee')
    .sort_values(by='variation_normalisee', ascending=False)
)

# Interleave boys and girls (boy 1, girl 1, boy 2, girl 2, ...). The stable
# sort keeps the boy ahead of the girl of the same rank, and the columns keep
# their dtypes (no row-by-row reconstruction).
df_alternated2 = (
    pd.concat([
        top_10_boys.assign(_rank=range(len(top_10_boys))),
        top_10_girls.assign(_rank=range(len(top_10_girls))),
    ])
    .sort_values('_rank', kind='stable')
    .drop(columns='_rank')
)

chart = alt.Chart(df_alternated2).mark_bar().encode(
    x=alt.X('preusuel:N', title='Prénom', sort=None),  # sort=None keeps the data order
    y=alt.Y('variation_normalisee:Q', title='Variation Normalisée'),
    color=alt.Color('sexe:N', scale=alt.Scale(domain=['M', 'F'], range=['blue', 'pink']), title='Sexe'),
    xOffset=alt.XOffset("sexe:N")
).properties(
    width=800
)

chart


In [93]:
# Combined chart: popular names on the left, rare names (mirrored) on the
# right, separated by a gap. Each half is rescaled so that both reach the same
# maximum, which means the y values are only comparable within a half.
max_variation_top = df_alternated['variation_normalisee'].max()
max_variation_bottom = df_alternated2['variation_normalisee'].max()

common_max = max(max_variation_top, max_variation_bottom)
scale_factor_top = common_max / max_variation_top
scale_factor_bottom = common_max / max_variation_bottom

# Bring both halves to the same scale on copies: df_alternated and
# df_alternated2 are left untouched, so re-running this cell gives the same
# chart and the frames behind the earlier charts keep their values.
df_left = df_alternated.assign(
    variation_normalisee=df_alternated['variation_normalisee'] * scale_factor_top
)
# The right-hand half is additionally reversed so that it mirrors the left one.
df_right = df_alternated2.assign(
    variation_normalisee=df_alternated2['variation_normalisee'] * scale_factor_bottom
).iloc[::-1]

# Empty slots between the two halves. Every spacer needs its own label:
# identical labels share a single band on a nominal x axis, which would shrink
# the gap to one slot.
spacer_count = 5
space = pd.DataFrame({
    'preusuel': [' ' * (k + 1) for k in range(spacer_count)],
    'sexe': '',
    'variation_normalisee': 0,
})

df_combined = pd.concat([df_left, space, df_right], ignore_index=True)

# TODO: legends still need to be added.
chart = alt.Chart(df_combined).mark_bar().encode(
    x=alt.X('preusuel:N', title='Prénom', sort=None),  # sort=None keeps the data order
    y=alt.Y('variation_normalisee:Q', title='Variation Normalisée'),
    color=alt.Color('sexe:N', scale=alt.Scale(domain=['M', 'F'], range=['blue', 'pink']), title='Sexe'),
    xOffset=alt.XOffset("sexe:N")
).properties(
    width=800
)

chart
