In [5]:
import pandas as pd

# Paths to CSV files

In [6]:
# Each dataset name doubles as the stem of its file inside the CSVs/ directory.
file_paths = {
    dataset: f"CSVs/{dataset}.csv"
    for dataset in (
        "world_religious",
        "world_population_and_health",
        "world_happiness_report_2024",
        "hdr_general",
        "gdp",
        "CPI2023_global_results_trends",
        "civil_liberties_score_fh_new",
    )
}

In [10]:
import os
import pandas as pd

# Source CSV locations keyed by dataset name; every key is also the file stem
# of its CSV inside the CSVs/ directory.
file_paths = {
    dataset: f"CSVs/{dataset}.csv"
    for dataset in (
        "world_religious",
        "world_population_and_health",
        "world_happiness_report_2024",
        "hdr_general",
        "gdp",
        "CPI2023_global_results_trends",
        "civil_liberties_score_fh_new",
    )
}

# Destination directory for the cleaned CSVs (created when it does not exist yet)
output_folder = "CSVs_cleaned"
os.makedirs(output_folder, exist_ok=True)

# CSV loader with an encoding fallback
def try_loading_csv(name, path, **kwargs):
    """Load a CSV file into a DataFrame, retrying with latin-1 on a decode error.

    Args:
        name: Dataset label, used only in the progress messages.
        path: Location of the CSV file, passed to ``pd.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv`` on both
            attempts. An ``encoding`` entry selects the encoding of the first
            attempt only (utf-8 when absent or None).

    Returns:
        The loaded DataFrame, or None when the latin-1 retry raises.

    Raises:
        Any exception other than UnicodeDecodeError raised by the first
        attempt propagates to the caller unchanged.
    """
    # Take `encoding` out of kwargs: forwarding it to the latin-1 retry would
    # supply the keyword twice and raise TypeError, turning a recoverable
    # decode error into a failed load.
    first_encoding = kwargs.pop("encoding", None) or "utf-8"

    try:
        df = pd.read_csv(path, encoding=first_encoding, **kwargs)
    except UnicodeDecodeError as e:
        print(f"Erreur d'encodage pour {name} avec {first_encoding} : {e}")
    else:
        print(f"Chargement de {name} réussi avec {first_encoding}.")
        return df

    # Fallback attempt: any failure here is reported and signalled with None.
    try:
        df = pd.read_csv(path, encoding='latin-1', **kwargs)
    except Exception as e2:
        print(f"Échec du chargement de {name} avec latin-1 : {e2}")
        return None
    print(f"Chargement de {name} réussi avec latin-1.")
    return df

# Successfully loaded DataFrames, keyed by dataset name
dataframes = {}

# Load every dataset listed in file_paths
for name, path in file_paths.items():
    # Only the CPI file is read with "|" as separator; the others use read_csv's default
    kwargs = {'sep': "|"} if name == "CPI2023_global_results_trends" else {}
    df = try_loading_csv(name, path, **kwargs)
    if df is None:
        print(f"Le dataset {name} n'a pas pu être chargé.\n")
        continue
    # Keep the frame and show a short preview
    dataframes[name] = df
    print(f"--- Dataset : {name} ---")
    print(df.head(), "\n")

# ====================================================
# 1. Cleaning world_religious
# ====================================================
# Columns removed from the cleaned output
cols_to_drop_religious = ["Area", "Sex", "Record Type", "Reliability", "Source Year", "Value Footnotes"]
# rename() returns a new frame, so the loaded frame in `dataframes` is left untouched;
# "Country or Area" becomes "Country" (no-op when that column is absent)
df_world_religious = dataframes["world_religious"].rename(columns={"Country or Area": "Country"})
df_world_religious_clean = df_world_religious.drop(columns=cols_to_drop_religious)
# Write the cleaned CSV without the index column
df_world_religious_clean.to_csv(
    os.path.join(output_folder, "world_religious_clean.csv"),
    index=False,
)
print("world_religious nettoyé et sauvegardé.")

# ====================================================
# 2. Cleaning world_population_and_health
# ====================================================
df_population = dataframes["world_population_and_health"].copy()
# Only float64 / int64 columns take part in the imputation
numeric_cols = df_population.select_dtypes(include=['float64', 'int64']).columns
# fillna() with a Series fills each listed column with its own value (here the
# column median) and leaves every column not in the Series untouched
df_population_clean = df_population.fillna(df_population[numeric_cols].median())
# Write the cleaned CSV without the index column
df_population_clean.to_csv(
    os.path.join(output_folder, "world_population_and_health_clean.csv"),
    index=False,
)
print("world_population_and_health nettoyé et sauvegardé.")

# ====================================================
# 3. Cleaning world_happiness_report_2024
# ====================================================
# rename() returns a new frame with "Country name" relabelled to "Country"
df_happiness = dataframes["world_happiness_report_2024"].rename(columns={"Country name": "Country"})
# Keep only the rows that have a "Life Ladder" value
df_happiness_clean = df_happiness[df_happiness["Life Ladder"].notna()]
# Write the cleaned CSV without the index column
df_happiness_clean.to_csv(
    os.path.join(output_folder, "world_happiness_report_2024_clean.csv"),
    index=False,
)
print("world_happiness_report_2024 nettoyé et sauvegardé.")

# ====================================================
# 4. Cleaning gdp
# ====================================================
df_gdp = dataframes["gdp"].copy()
# Wide -> long: "Country" identifies a row, every other column is melted into
# a (Year, GDP) pair
id_vars = ["Country"]
value_vars = df_gdp.columns[df_gdp.columns != "Country"].tolist()
df_gdp_long = pd.melt(
    df_gdp,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="Year",
    value_name="GDP",
)
# Former column labels become numbers; labels that cannot be parsed turn into NaN
df_gdp_long["Year"] = pd.to_numeric(df_gdp_long["Year"], errors="coerce")
# Write the cleaned CSV (long format) without the index column
df_gdp_long.to_csv(os.path.join(output_folder, "gdp_long_clean.csv"), index=False)
print("gdp nettoyé (format long) et sauvegardé.")

# ====================================================
# 5. Cleaning CPI2023_global_results_trends
# ====================================================
# Columns removed from the cleaned output
cols_to_drop_cpi = [
    "Standard error",
    "Lower CI",
    "Upper CI",
    "Number of sources",
    "Freedom House Nations in Transit",
    "PERC Asia Risk Guide",
]
# rename() returns a new frame with "Country / Territory" relabelled to "Country"
df_cpi = dataframes["CPI2023_global_results_trends"].rename(columns={"Country / Territory": "Country"})
df_cpi_clean = df_cpi.drop(columns=cols_to_drop_cpi)
# Write the cleaned CSV without the index column
df_cpi_clean.to_csv(
    os.path.join(output_folder, "CPI2023_global_results_trends_clean.csv"),
    index=False,
)
print("CPI2023_global_results_trends nettoyé et sauvegardé.")

# ====================================================
# 6. Cleaning civil_liberties_score_fh_new
# ====================================================
# rename() returns a new frame with "Entity" relabelled to "Country"
df_liberties = dataframes["civil_liberties_score_fh_new"].rename(columns={"Entity": "Country"})
# Write the cleaned CSV without the index column
df_liberties.to_csv(
    os.path.join(output_folder, "civil_liberties_score_fh_new_clean.csv"),
    index=False,
)
print("civil_liberties_score_fh_new nettoyé et sauvegardé.")

# ====================================================
# 7. hdr_general
# ====================================================
# hdr_general needs no cleaning step here; it is written out unchanged when it was loaded.
if "hdr_general" in dataframes:
    df_hdr = dataframes["hdr_general"].copy()
    df_hdr.to_csv(os.path.join(output_folder, "hdr_general_clean.csv"), index=False)
    print("hdr_general (déjà corrigé) sauvegardé.")
else:
    # Loading failed earlier: keep the name defined and report the problem
    df_hdr = None
    print("hdr_general n'a pas été chargé, vérifier le fichier d'origine.")

print("Nettoyage terminé et fichiers sauvegardés dans le dossier", output_folder)


Chargement de world_religious réussi avec utf-8.
--- Dataset : world_religious ---
   Country  Year   Area         Sex     Religion  \
0  Albania  2011  Total  Both Sexes        Total   
1  Albania  2011  Total  Both Sexes      Atheist   
2  Albania  2011  Total  Both Sexes     Catholic   
3  Albania  2011  Total  Both Sexes  Evangelical   
4  Albania  2011  Total  Both Sexes       Muslim   

                              Record Type             Reliability  \
0  Census - de jure - complete tabulation  Final figure, complete   
1  Census - de jure - complete tabulation  Final figure, complete   
2  Census - de jure - complete tabulation  Final figure, complete   
3  Census - de jure - complete tabulation  Final figure, complete   
4  Census - de jure - complete tabulation  Final figure, complete   

   Source Year      Value Value Footnotes  
0         2013  2800138.0             NaN  
1         2013    69995.0             NaN  
2         2013   280921.0             NaN  
3         201

In [8]:
import pandas as pd

# Prerequisite: the datasets must already be loaded into a dictionary named `dataframes`.
# Example: dataframes = {"world_religious": df_world_religious, ...}

# === 1. Cleaning world_religious ===
# Columns removed from the cleaned frame
cols_to_drop_religious = ["Record Type", "Reliability", "Source Year", "Value Footnotes"]
df_world_religious = dataframes["world_religious"]
# drop() returns a new frame; the loaded frame itself is not modified
df_world_religious_clean = df_world_religious.drop(cols_to_drop_religious, axis="columns")
print("world_religious - Colonnes restantes :", df_world_religious_clean.columns)

# === 2. Cleaning world_population_and_health ===
df_population = dataframes["world_population_and_health"]
# Only float64 / int64 columns take part in the imputation
numeric_cols = df_population.select_dtypes(include=['float64', 'int64']).columns
# fillna() with a Series fills each listed column with its own median and
# returns a new frame; columns not in the Series are left as they are
df_population_clean = df_population.fillna(df_population[numeric_cols].median())
print(
    "world_population_and_health - Valeurs manquantes après imputation :\n",
    df_population_clean.isnull().sum(),
)

# === 3. Cleaning world_happiness_report_2024 ===
df_happiness = dataframes["world_happiness_report_2024"]
# Keep only the rows that have a "Life Ladder" value (adapt the rule if needed)
df_happiness_clean = df_happiness[df_happiness["Life Ladder"].notna()]
print(
    "world_happiness_report_2024 - Forme après suppression des lignes sans 'Life Ladder' :",
    df_happiness_clean.shape,
)

# === 4. Cleaning gdp ===
df_gdp = dataframes["gdp"]
# Wide -> long: "Country" identifies a row, every other column is melted into
# a (Year, GDP) pair
id_vars = ["Country"]
value_vars = df_gdp.columns[df_gdp.columns != "Country"].tolist()
df_gdp_long = pd.melt(
    df_gdp,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="Year",
    value_name="GDP",
)
# Former column labels become numbers; labels that cannot be parsed turn into NaN
df_gdp_long["Year"] = pd.to_numeric(df_gdp_long["Year"], errors="coerce")
print("gdp - Exemple de données restructurées :")
print(df_gdp_long.head())

# === 5. Cleaning CPI2023_global_results_trends ===
# Rename "Country / Territory" to "Country" for consistency with the other datasets.
# rename() without inplace returns a new frame, so the raw frame stored in
# `dataframes` keeps its original header and this cell can be re-run safely.
df_cpi = dataframes["CPI2023_global_results_trends"].rename(
    columns={"Country / Territory": "Country"}
)
# Drop columns that are not needed downstream (adjust the list as required)
cols_to_drop_cpi = ["Standard error", "Lower CI", "Upper CI", "Number of sources"]
df_cpi_clean = df_cpi.drop(columns=cols_to_drop_cpi)
print("CPI2023_global_results_trends - Colonnes restantes :", df_cpi_clean.columns)

# === 6. Cleaning civil_liberties_score_fh_new ===
# Rename "Entity" to "Country" for consistency with the other datasets.
# rename() without inplace returns a new frame, so the raw frame stored in
# `dataframes` is not modified as a side effect of this cell.
df_liberties = dataframes["civil_liberties_score_fh_new"].rename(columns={"Entity": "Country"})
print("civil_liberties_score_fh_new - Colonnes :", df_liberties.columns)

# The cleaned dataframes (df_world_religious_clean, df_population_clean,
# df_happiness_clean, df_gdp_long, df_cpi_clean, df_liberties) can now be used for the descriptive analysis and the following steps.


world_religious - Colonnes restantes : Index(['Country or Area', 'Year', 'Area', 'Sex', 'Religion', 'Value'], dtype='object')
world_population_and_health - Valeurs manquantes après imputation :
 Country                  0
Year                     0
Population               0
country_code             0
health_exp               0
life_expect              0
maternal_mortality       0
infant_mortality         0
neonatal_mortality       0
under_5_mortality        0
prev_hiv                 0
inci_tuberc              0
prev_undernourishment    0
dtype: int64
world_happiness_report_2024 - Forme après suppression des lignes sans 'Life Ladder' : (2363, 11)
gdp - Exemple de données restructurées :
       Country  Year       GDP
0  Afghanistan  1980     0.000
1      Albania  1980   728.359
2      Algeria  1980  2268.607
3      Andorra  1980     0.000
4       Angola  1980   802.627
CPI2023_global_results_trends - Colonnes restantes : Index(['Country', 'ISO3', 'Region', 'CPI score 2023', 'Rank',
  

In [9]:
# Remove the "Area" and "Sex" columns from the cleaned frame.
# errors="ignore" keeps this self-referential cell idempotent: re-running it,
# or running it after a cell that already removed these columns, no longer
# raises KeyError.
df_world_religious_clean = df_world_religious_clean.drop(columns=["Area", "Sex"], errors="ignore")