In [None]:
!pip install wbdata




In [3]:
import wbdata
import pandas as pd
import datetime

In [11]:
import pandas as pd
import zipfile
import io
import requests

# URL du fichier zip World Bank
url = "http://api.worldbank.org/v2/en/indicator/SE.ADT.LITR.FE.ZS?downloadformat=csv"

# Télécharger le zip
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))

# Lister les fichiers contenus
print(z.namelist())

# Généralement, le fichier principal commence par "API_"
data_file = [f for f in z.namelist() if f.startswith("API_")][0]

# Charger uniquement ce fichier
df = pd.read_csv(z.open(data_file), header=2)

print(df.head())


['Metadata_Indicator_API_SE.ADT.LITR.FE.ZS_DS2_en_csv_v2_973736.csv', 'API_SE.ADT.LITR.FE.ZS_DS2_en_csv_v2_973736.csv', 'Metadata_Country_API_SE.ADT.LITR.FE.ZS_DS2_en_csv_v2_973736.csv']
                  Country Name Country Code  \
0                        Aruba          ABW   
1  Africa Eastern and Southern          AFE   
2                  Afghanistan          AFG   
3   Africa Western and Central          AFW   
4                       Angola          AGO   

                                      Indicator Name     Indicator Code  1960  \
0  Literacy rate, adult female (% of females ages...  SE.ADT.LITR.FE.ZS   NaN   
1  Literacy rate, adult female (% of females ages...  SE.ADT.LITR.FE.ZS   NaN   
2  Literacy rate, adult female (% of females ages...  SE.ADT.LITR.FE.ZS   NaN   
3  Literacy rate, adult female (% of females ages...  SE.ADT.LITR.FE.ZS   NaN   
4  Literacy rate, adult female (% of females ages...  SE.ADT.LITR.FE.ZS   NaN   

   1961  1962  1963  1964  1965  ...       

In [None]:
# Garder seulement Country Name, Country Code et les années
df_clean = df.drop(columns=["Indicator Name", "Indicator Code", "Unnamed: 69"], errors="ignore")

# Transformer les colonnes années en lignes (melt)
df_long = df_clean.melt(id_vars=["Country Name", "Country Code"],
                        var_name="Year", value_name="Literacy_Female_Adult")

# Conversion de Year en entier
df_long["Year"] = pd.to_numeric(df_long["Year"], errors="coerce")

# Liste simplifiée de pays africains
african_countries = [
    "DZA","AGO","BEN","BWA","BFA","BDI","CMR","CPV","CAF","TCD","COM","COG","CIV",
    "COD","DJI","EGY","GNQ","ERI","SWZ","ETH","GAB","GMB","GHA","GIN","GNB","KEN",
    "LSO","LBR","LBY","MDG","MWI","MLI","MRT","MUS","MAR","MOZ","NAM","NER","NGA",
    "RWA","STP","SEN","SYC","SLE","ZAF","SSD","SDN","TZA","TGO","TUN","UGA","ZMB","ZWE"
]

# Filtrer uniquement l’Afrique
df_africa = df_long[df_long["Country Code"].isin(african_countries)]

print(df_africa.head(10))


                Country Name Country Code  Year  Literacy_Female_Adult
4                     Angola          AGO  1960                    NaN
16                   Burundi          BDI  1960                    NaN
18                     Benin          BEN  1960                    NaN
19              Burkina Faso          BFA  1960                    NaN
33                  Botswana          BWA  1960                    NaN
34  Central African Republic          CAF  1960                    NaN
41             Cote d'Ivoire          CIV  1960                    NaN
42                  Cameroon          CMR  1960                    NaN
43          Congo, Dem. Rep.          COD  1960                    NaN
44               Congo, Rep.          COG  1960                    NaN


In [13]:
df_africa_recent = df_africa[df_africa["Year"] >= 2000]
print(df_africa_recent.head(20))


                   Country Name Country Code  Year  Literacy_Female_Adult
10644                    Angola          AGO  2000                    NaN
10656                   Burundi          BDI  2000              52.000000
10658                     Benin          BEN  2000                    NaN
10659              Burkina Faso          BFA  2000                    NaN
10673                  Botswana          BWA  2000                    NaN
10674  Central African Republic          CAF  2000              35.000000
10681             Cote d'Ivoire          CIV  2000              39.000000
10682                  Cameroon          CMR  2000              59.000000
10683          Congo, Dem. Rep.          COD  2000                    NaN
10684               Congo, Rep.          COG  2000                    NaN
10686                   Comoros          COM  2000              63.000000
10687                Cabo Verde          CPV  2000              69.444557
10696                  Djibouti       

In [14]:
# Garder uniquement les années >= 2000
df_africa_recent = df_africa[df_africa["Year"] >= 2000]

# Repasser en format large : colonnes = années
df_wide = df_africa_recent.pivot_table(
    index=["Country Name", "Country Code"],
    columns="Year",
    values="Literacy_Female_Adult"
)

# Réinitialiser l’index pour avoir un DataFrame classique
df_wide = df_wide.reset_index()

print(df_wide.head())


Year  Country Name Country Code  2000  2001  2002  2003  2004  2005  \
0          Algeria          DZA   NaN   NaN  60.0   NaN   NaN   NaN   
1           Angola          AGO   NaN  54.0   NaN   NaN   NaN   NaN   
2            Benin          BEN   NaN   NaN  23.0   NaN   NaN   NaN   
3         Botswana          BWA   NaN   NaN   NaN  82.0   NaN   NaN   
4     Burkina Faso          BFA   NaN   NaN   NaN  15.0   NaN  17.0   

Year       2006  2007  ...      2013  2014      2015  2016      2017  \
0     64.000000   NaN  ...       NaN   NaN       NaN   NaN       NaN   
1           NaN   NaN  ...       NaN  53.0  51.92598   NaN       NaN   
2     18.442909   NaN  ...       NaN   NaN       NaN   NaN  28.54985   
3           NaN   NaN  ...  87.45121   NaN       NaN   NaN       NaN   
4     16.000000  21.0  ...       NaN  26.0       NaN   NaN       NaN   

Year       2018       2019  2020       2021       2022  
0     75.322968  74.210197   NaN        NaN        NaN  
1           NaN        NaN

In [16]:
# Exemple avec df_wide (alphabétisation femmes adultes, format large)
df_wide.to_csv("literacy_female_adult_africa.csv", index=False)


In [17]:
import pandas as pd
import zipfile, io, requests

def download_wb_data(indicator):
    url = f"http://api.worldbank.org/v2/en/indicator/{indicator}?downloadformat=csv"
    r = requests.get(url)
    z = zipfile.ZipFile(io.BytesIO(r.content))

    # Prendre le bon fichier (API_...)
    data_file = [f for f in z.namelist() if f.startswith("API_")][0]

    df = pd.read_csv(z.open(data_file), header=2)
    return df

# Exemple : alphabétisation hommes adultes
df_male = download_wb_data("SE.ADT.LITR.MA.ZS")
print(df_male.head())


                  Country Name Country Code  \
0                        Aruba          ABW   
1  Africa Eastern and Southern          AFE   
2                  Afghanistan          AFG   
3   Africa Western and Central          AFW   
4                       Angola          AGO   

                                      Indicator Name     Indicator Code  1960  \
0  Literacy rate, adult male (% of males ages 15 ...  SE.ADT.LITR.MA.ZS   NaN   
1  Literacy rate, adult male (% of males ages 15 ...  SE.ADT.LITR.MA.ZS   NaN   
2  Literacy rate, adult male (% of males ages 15 ...  SE.ADT.LITR.MA.ZS   NaN   
3  Literacy rate, adult male (% of males ages 15 ...  SE.ADT.LITR.MA.ZS   NaN   
4  Literacy rate, adult male (% of males ages 15 ...  SE.ADT.LITR.MA.ZS   NaN   

   1961  1962  1963  1964  1965  ...       2016       2017       2018  \
0   NaN   NaN   NaN   NaN   NaN  ...        NaN        NaN        NaN   
1   NaN   NaN   NaN   NaN   NaN  ...  77.493492  77.130371  77.390427   
2   NaN   N

In [None]:
import pandas as pd
import zipfile, io, requests

african_countries = [
    "DZA","AGO","BEN","BWA","BFA","BDI","CMR","CPV","CAF","TCD","COM","COG","CIV",
    "COD","DJI","EGY","GNQ","ERI","SWZ","ETH","GAB","GMB","GHA","GIN","GNB","KEN",
    "LSO","LBR","LBY","MDG","MWI","MLI","MRT","MUS","MAR","MOZ","NAM","NER","NGA",
    "RWA","STP","SEN","SYC","SLE","ZAF","SSD","SDN","TZA","TGO","TUN","UGA","ZMB","ZWE"
]

def download_and_clean(indicator, colname):
    url = f"http://api.worldbank.org/v2/en/indicator/{indicator}?downloadformat=csv"
    r = requests.get(url)

    try:
        z = zipfile.ZipFile(io.BytesIO(r.content))
        data_file = [f for f in z.namelist() if f.startswith("API_")][0]
        df = pd.read_csv(z.open(data_file), header=2)
        df = df.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")
        df = df.melt(id_vars=["Country Name", "Country Code"],
                     var_name="Year", value_name=colname)
        df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
        return df
    except Exception as e:
        print(f" Indicateur {indicator} ({colname}) introuvable ou indisponible.")
        return None

indicators = {
    "Literacy_Female_Adult": "SE.ADT.LITR.FE.ZS",
    "Literacy_Male_Adult": "SE.ADT.LITR.MA.ZS",
    "Literacy_Female_Youth": "SE.ADT.1524.LT.FE.ZS",
    "Literacy_Male_Youth": "SE.ADT.1524.LT.MA.ZS",
    "GDP_per_capita": "NY.GDP.PCAP.KD",
    "Education_Expenditure": "SE.XPD.TOTL.GD.ZS",
    "Urban_Population": "SP.URB.TOTL.IN.ZS",
    "Poverty": "SI.POV.DDAY",
    "Child_Marriage_Under18": "SP.M18.2024.FE.ZS",
    "Child_Marriage_Under15": "SP.M15.2024.FE.ZS",
    "Net_Migration": "SM.POP.NETM",
    "Net_Migration_Percent": "SM.POP.NETM.ZS",
    "Fertility_Rate": "SP.DYN.TFRT.IN"
}

merged = None
for colname, code in indicators.items():
    df = download_and_clean(code, colname)
    if df is not None:
        df = df[df["Country Code"].isin(african_countries)]
        if merged is None:
            merged = df
        else:
            merged = merged.merge(df, on=["Country Name", "Country Code", "Year"], how="outer")

# Garder 2000–2023
merged = merged[(merged["Year"] >= 2000) & (merged["Year"] <= 2023)]

# Sauvegarde
merged.to_csv("Africa_Education_Development.csv", index=False)

print(" Dataset final créé : 'Africa_Education_Development.csv'")
print("Dimensions :", merged.shape)
print(merged.head())


⚠️ Indicateur SM.POP.NETM.ZS (Net_Migration_Percent) introuvable ou indisponible.
✅ Dataset final créé : 'Africa_Education_Development.csv'
Dimensions : (1272, 15)
   Country Name Country Code    Year  Literacy_Female_Adult  \
40      Algeria          DZA  2000.0                    NaN   
41      Algeria          DZA  2001.0                    NaN   
42      Algeria          DZA  2002.0                   60.0   
43      Algeria          DZA  2003.0                    NaN   
44      Algeria          DZA  2004.0                    NaN   

    Literacy_Male_Adult  Literacy_Female_Youth  Literacy_Male_Youth  \
40                  NaN                    NaN                  NaN   
41                  NaN                    NaN                  NaN   
42                 79.0                   86.0                 94.0   
43                  NaN                    NaN                  NaN   
44                  NaN                    NaN                  NaN   

    GDP_per_capita  Education_