In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from IPython.display import display, HTML

In [62]:
# Function to create scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    """
    Build an HTML snippet showing ``df`` inside a fixed-height, scrollable box.

    Args:
    - df (DataFrame): The DataFrame to render (via ``DataFrame.to_html``).
    - table_id (str): Value of the ``id`` attribute of the wrapping ``<div>``.
    - title (str): Plain-text heading shown above the table in an ``<h3>``.

    Returns:
    - str: ``<h3>`` heading followed by a 500px-high ``<div>`` (overflow:auto)
      that contains the HTML table.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from html import escape

    # Escape caller-supplied text: a title such as "A & B" or an id containing a
    # double quote would otherwise produce broken markup.
    heading = f'<h3>{escape(str(title))}</h3>'
    container_open = (
        f'<div id="{escape(str(table_id), quote=True)}" '
        'style="height:500px; overflow:auto;">'
    )
    return ''.join([heading, container_open, df.to_html(), '</div>'])

In [63]:
def display_missing_values(df, column_name):
    """
    Report and display the rows of ``df`` whose ``column_name`` value is missing.

    Prints the number of missing values in the column, then renders the
    affected rows as a scrollable HTML table in the notebook output.

    Args:
    - df (DataFrame): The DataFrame to inspect.
    - column_name (str): Name of the column to check for missing values.

    Returns:
    - None: the table is rendered with ``display`` as a side effect.
    """
    is_missing = df[column_name].isna()
    # Interpolate the count itself; formatting ``value_counts()`` put the repr
    # of a whole Series (False/True counts, Name, dtype) into the message.
    print(f'Nombre de lignes manquantes pour {column_name} : {is_missing.sum()}')

    missing_values = df.loc[is_missing]
    label = f'Missing {column_name} values'
    # An HTML id must not contain whitespace, so derive it from the label by
    # joining its words with hyphens; the visible title keeps the spaces.
    table_id = '-'.join(label.split())
    missing_values_table = create_scrollable_table(missing_values, table_id, label)

    display(HTML(missing_values_table))

In [64]:
# Load the World Happiness Report CSV and preview its first 50 rows
# inside a scrollable table.
df = pd.read_csv('../data/world-happiness-report.csv')
html_values = create_scrollable_table(df=df.head(n=50), table_id='Dataset', title='Dataset')
display(HTML(html_values))

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268
5,Afghanistan,2013,3.572,7.725,0.484,52.56,0.578,0.061,0.823,0.621,0.273
6,Afghanistan,2014,3.131,7.718,0.526,52.88,0.509,0.104,0.871,0.532,0.375
7,Afghanistan,2015,3.983,7.702,0.529,53.2,0.389,0.08,0.881,0.554,0.339
8,Afghanistan,2016,4.22,7.697,0.559,53.0,0.523,0.042,0.793,0.565,0.348
9,Afghanistan,2017,2.662,7.697,0.491,52.8,0.427,-0.121,0.954,0.496,0.371


In [65]:
# Number of distinct countries present in the dataset.
print('Nombre de pays dans le dataset : ' + str(df["Country name"].nunique()))

Nombre de pays dans le dataset : 166


In [66]:
# Print the column dtypes and non-null counts, which shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1949 non-null   object 
 1   year                              1949 non-null   int64  
 2   Life Ladder                       1949 non-null   float64
 3   Log GDP per capita                1913 non-null   float64
 4   Social support                    1936 non-null   float64
 5   Healthy life expectancy at birth  1894 non-null   float64
 6   Freedom to make life choices      1917 non-null   float64
 7   Generosity                        1860 non-null   float64
 8   Perceptions of corruption         1839 non-null   float64
 9   Positive affect                   1927 non-null   float64
 10  Negative affect                   1933 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 167.6+ KB


In [67]:
# GDP: show the rows where "Log GDP per capita" is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Log GDP per capita")

Nombre de lignes manquantes pour Log GDP per capita : False    1913
True       36
Name: Log GDP per capita, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
423,Cuba,2006,5.418,,0.97,68.44,0.281,,,0.647,0.277
436,Cyprus,2020,6.26,,0.806,74.1,0.763,,0.816,0.759,0.284
709,Hong Kong S.A.R. of China,2020,5.295,,0.813,,0.705,,0.38,0.609,0.21
771,Iran,2018,4.278,,0.674,66.0,0.603,,0.703,0.553,0.493
772,Iran,2019,5.006,,0.698,66.3,0.623,,0.728,0.6,0.449
773,Iran,2020,4.865,,0.757,66.6,0.6,,0.71,0.582,0.47
918,Kosovo,2020,6.294,,0.792,,0.88,,0.91,0.726,0.201
1087,Malta,2020,6.157,,0.938,72.2,0.931,,0.675,0.601,0.411
1280,North Cyprus,2012,5.463,,0.871,,0.693,,0.855,0.709,0.405
1281,North Cyprus,2013,5.567,,0.869,,0.775,,0.715,0.622,0.443


In [None]:
# TODO : remplacement par la médiane

In [68]:
# Social support: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Social support")

Nombre de lignes manquantes pour Social support : False    1936
True       13
Name: Social support, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
25,Algeria,2010,5.464,9.287,,64.5,0.593,-0.205,0.618,,
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
294,Canada,2007,7.482,10.739,,71.66,0.93,0.249,0.406,0.872,0.257
466,Djibouti,2010,5.006,7.812,,54.3,0.764,-0.058,0.597,,
925,Kuwait,2014,6.18,10.945,,65.8,,,,,
1164,Morocco,2010,4.383,8.746,,63.5,0.663,-0.162,0.9,,
1310,Oman,2011,6.853,10.382,,65.5,0.916,0.025,,,0.295
1423,Qatar,2010,6.85,11.52,,66.7,,0.104,,,
1426,Qatar,2015,6.375,11.486,,68.3,,,,,
1741,Tunisia,2009,5.025,9.197,,64.96,0.781,-0.119,0.722,,


In [69]:
# Healthy life expectancy at birth: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Healthy life expectancy at birth")

Nombre de lignes manquantes pour Healthy life expectancy at birth : False    1894
True       55
Name: Healthy life expectancy at birth, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
699,Hong Kong S.A.R. of China,2006,5.511,10.746,0.812,,0.91,0.156,0.356,0.723,0.236
700,Hong Kong S.A.R. of China,2008,5.137,10.816,0.84,,0.922,0.296,0.274,0.719,0.237
701,Hong Kong S.A.R. of China,2009,5.397,10.788,0.835,,0.918,0.308,0.272,0.762,0.21
702,Hong Kong S.A.R. of China,2010,5.643,10.847,0.857,,0.89,0.332,0.256,0.71,0.183
703,Hong Kong S.A.R. of China,2011,5.474,10.887,0.846,,0.894,0.235,0.245,0.734,0.196
704,Hong Kong S.A.R. of China,2012,5.484,10.893,0.826,,0.88,0.222,0.38,0.715,0.183
705,Hong Kong S.A.R. of China,2014,5.458,10.94,0.834,,0.843,0.224,0.423,0.684,0.243
706,Hong Kong S.A.R. of China,2016,5.498,10.97,0.832,,0.8,0.1,0.403,0.664,0.213
707,Hong Kong S.A.R. of China,2017,5.362,11.0,0.831,,0.831,0.14,0.416,0.64,0.201
708,Hong Kong S.A.R. of China,2019,5.659,11.0,0.856,,0.727,0.067,0.432,0.599,0.358


In [70]:
# Freedom to make life choices: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Freedom to make life choices")

Nombre de lignes manquantes pour Freedom to make life choices : False    1917
True       32
Name: Freedom to make life choices, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
28,Algeria,2014,6.355,9.335,0.818,65.14,,,,0.626,0.177
29,Algeria,2016,5.341,9.362,0.749,65.5,,,,0.661,0.377
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
263,Cambodia,2006,3.569,7.746,0.793,55.3,,0.255,0.829,0.719,0.341
342,China,2006,4.56,8.696,0.747,66.88,,,,0.809,0.17
343,China,2007,4.863,8.824,0.811,67.06,,-0.176,,0.817,0.159
350,China,2014,5.196,9.386,0.82,68.24,,-0.217,,0.854,0.112
351,China,2015,5.304,9.449,0.794,68.4,,-0.244,,0.809,0.171
352,China,2016,5.325,9.51,0.742,68.7,,-0.228,,0.826,0.146
500,Egypt,2008,4.632,9.186,0.738,59.88,,-0.087,0.914,0.683,0.301


In [None]:
# Freedom to make life choices
# NOTE(review): this unexecuted cell repeats the previous
# "Freedom to make life choices" cell; it is a candidate for deletion.
# display_missing_values() returns None, so the result is neither assigned nor echoed.
display_missing_values(df, "Freedom to make life choices")

In [72]:
# Generosity: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Generosity")

Nombre de lignes manquantes pour Generosity : False    1860
True       89
Name: Generosity, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
28,Algeria,2014,6.355,9.335,0.818,65.14,,,,0.626,0.177
29,Algeria,2016,5.341,9.362,0.749,65.5,,,,0.661,0.377
66,Australia,2005,7.341,10.659,0.968,71.4,0.935,,0.39,0.843,0.238
110,Bahrain,2012,5.027,10.716,0.911,66.86,0.682,,0.438,0.589,0.381
111,Bahrain,2013,6.69,10.757,0.884,67.14,0.809,,0.525,0.768,0.306
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
147,Belgium,2005,7.262,10.745,0.935,69.9,0.924,,0.598,0.796,0.26
218,Brazil,2005,6.637,9.438,0.883,63.3,0.882,,0.745,0.818,0.302
224,Brazil,2012,6.66,9.647,0.89,65.02,0.849,,0.623,0.755,0.35
342,China,2006,4.56,8.696,0.747,66.88,,,,0.809,0.17


In [73]:
# Perceptions of corruption: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Perceptions of corruption")

Nombre de lignes manquantes pour Perceptions of corruption : False    1839
True      110
Name: Perceptions of corruption, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
28,Algeria,2014,6.355,9.335,0.818,65.14,,,,0.626,0.177
29,Algeria,2016,5.341,9.362,0.749,65.5,,,,0.661,0.377
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
113,Bahrain,2015,6.007,10.785,0.853,67.7,0.85,0.112,,0.716,0.303
114,Bahrain,2016,6.17,10.781,0.863,68.1,0.889,0.088,,0.787,0.283
115,Bahrain,2017,6.227,10.771,0.876,68.5,0.906,0.136,,0.814,0.29
116,Bahrain,2019,7.098,10.715,0.878,69.3,0.907,0.048,,0.762,0.317
117,Bahrain,2020,6.173,10.62,0.848,69.7,0.945,0.132,,0.79,0.297
275,Cambodia,2018,5.122,8.333,0.795,61.6,0.958,0.036,,0.845,0.414
342,China,2006,4.56,8.696,0.747,66.88,,,,0.809,0.17


In [74]:
# Positive affect: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Positive affect")

Nombre de lignes manquantes pour Positive affect : False    1927
True       22
Name: Positive affect, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
25,Algeria,2010,5.464,9.287,,64.5,0.593,-0.205,0.618,,
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
126,Bangladesh,2014,4.636,8.164,0.577,62.38,0.736,-0.098,0.789,,0.231
135,Belarus,2008,5.463,9.677,0.904,61.7,0.64,-0.22,0.696,,0.246
264,Cambodia,2007,4.156,7.829,0.675,56.1,0.819,0.116,0.879,,0.32
466,Djibouti,2010,5.006,7.812,,54.3,0.764,-0.058,0.597,,
779,Iraq,2013,4.725,9.28,0.728,58.88,,-0.05,0.71,,0.554
872,Jordan,2018,4.639,9.196,0.8,66.8,0.762,-0.186,,,
873,Jordan,2019,4.453,9.201,0.793,67.0,0.726,-0.165,,,
874,Jordan,2020,4.094,9.15,0.709,67.2,0.779,-0.15,,,


In [75]:
# Negative affect: show the rows where the value is missing.
# display_missing_values() renders the table itself and returns None,
# so there is nothing to assign or echo.
display_missing_values(df, "Negative affect")

Nombre de lignes manquantes pour Negative affect : False    1933
True       16
Name: Negative affect, dtype: int64


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
25,Algeria,2010,5.464,9.287,,64.5,0.593,-0.205,0.618,,
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
466,Djibouti,2010,5.006,7.812,,54.3,0.764,-0.058,0.597,,
872,Jordan,2018,4.639,9.196,0.8,66.8,0.762,-0.186,,,
873,Jordan,2019,4.453,9.201,0.793,67.0,0.726,-0.165,,,
874,Jordan,2020,4.094,9.15,0.709,67.2,0.779,-0.15,,,
925,Kuwait,2014,6.18,10.945,,65.8,,,,,
1062,Maldives,2018,5.198,9.826,0.913,70.6,0.855,0.024,,,
1164,Morocco,2010,4.383,8.746,,63.5,0.663,-0.162,0.9,,
1380,Philippines,2006,4.67,8.562,0.795,59.8,0.828,0.063,0.841,0.832,


In [87]:
# Rows that have at least one missing value in any column.
# The mask is evaluated once and the filtered frame is reused for both the count and the table.
missing_values = df[df.isna().any(axis=1)]
print(f'Nombre de lignes manquantes total : {missing_values.shape[0]}')

# An HTML id must not contain whitespace, so the element id differs from the visible title.
missing_values_table = create_scrollable_table(missing_values, 'total-missing-values', 'Total missing values')
display(HTML(missing_values_table))

Nombre de lignes manquantes total : 241


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
25,Algeria,2010,5.464,9.287,,64.5,0.593,-0.205,0.618,,
28,Algeria,2014,6.355,9.335,0.818,65.14,,,,0.626,0.177
29,Algeria,2016,5.341,9.362,0.749,65.5,,,,0.661,0.377
66,Australia,2005,7.341,10.659,0.968,71.4,0.935,,0.39,0.843,0.238
110,Bahrain,2012,5.027,10.716,0.911,66.86,0.682,,0.438,0.589,0.381
111,Bahrain,2013,6.69,10.757,0.884,67.14,0.809,,0.525,0.768,0.306
112,Bahrain,2014,6.165,10.783,,67.42,,,,,
113,Bahrain,2015,6.007,10.785,0.853,67.7,0.85,0.112,,0.716,0.303
114,Bahrain,2016,6.17,10.781,0.863,68.1,0.889,0.088,,0.787,0.283
115,Bahrain,2017,6.227,10.771,0.876,68.5,0.906,0.136,,0.814,0.29


In [108]:
 pd.set_option('display.max_rows', None)  # None lifts the row limit: pandas shows every row instead of truncating
# pd.set_option('display.min_rows', None)

# NOTE(review): this changes a global display option for the rest of the session.
# NOTE(review): the leading space before pd.set_option appears to be accidental; confirm it is tolerated when the notebook is exported as a script.
# Uncomment the two lines below to restore the pandas defaults:
# pd.reset_option('display.max_rows')
# pd.reset_option('display.min_rows')

In [109]:
# Number of rows that contain at least one missing value.
print(f'Nombre de lignes manquantes total : {len(df[df.isna().any(axis=1)])}')

# Per-country count of rows with at least one missing value.
# `missing_values` is the filtered frame built in the "Total missing values" cell.
num_missing_per_country = missing_values.groupby(by='Country name').size()

# Rank the countries from the most to the fewest incomplete rows.
sorted_num_rows_with_na_per_country = num_missing_per_country.sort_values(ascending=False)

print(sorted_num_rows_with_na_per_country)

Nombre de lignes manquantes total : 241
Country name
China                        15
Kosovo                       14
Jordan                       13
Saudi Arabia                 13
Hong Kong S.A.R. of China    11
United Arab Emirates         10
Turkmenistan                 10
Taiwan Province of China     10
Palestinian Territories      10
Bahrain                       8
Kuwait                        8
North Cyprus                  7
Egypt                         7
Malta                         6
Yemen                         5
Iran                          5
Qatar                         4
South Sudan                   4
Uzbekistan                    4
Venezuela                     4
Vietnam                       4
Somaliland region             4
Somalia                       3
Tajikistan                    3
Algeria                       3
Cambodia                      3
Germany                       2
Japan                         2
United States                 2
Brazil             

In [71]:
# Print the column dtypes and non-null counts.
# NOTE(review): repeats the earlier df.info() cell; its execution count (71) is lower than
# that of the cells above it, i.e. it was run out of document order.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1949 non-null   object 
 1   year                              1949 non-null   int64  
 2   Life Ladder                       1949 non-null   float64
 3   Log GDP per capita                1913 non-null   float64
 4   Social support                    1936 non-null   float64
 5   Healthy life expectancy at birth  1894 non-null   float64
 6   Freedom to make life choices      1917 non-null   float64
 7   Generosity                        1860 non-null   float64
 8   Perceptions of corruption         1839 non-null   float64
 9   Positive affect                   1927 non-null   float64
 10  Negative affect                   1933 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 167.6+ KB
