In [3]:
# Load the indicator catalogue and the World Bank data for a given topic area
def load_the_database(area):
    """Load the indicator catalogue for one topic and download its World Bank data.

    Reads ``CountryCodes.csv`` (``;``-separated) for the ISO country codes and
    ``tickers.csv`` (ISO-8859-1 encoded) for the indicator catalogue. The
    catalogue is restricted to rows whose ``Topic`` equals *area*; the
    indicators among them whose ``Source`` is ``"WB"`` are downloaded for
    every listed country.

    Args:
        area: Value matched against the ``Topic`` column of ``tickers.csv``.

    Returns:
        A tuple ``(worldbank_data, education)`` where ``worldbank_data`` is
        the stacked download result for 1990-2020 and ``education`` holds the
        catalogue rows for *area* from every source (not only ``"WB"``).
    """
    coucodes = pd.read_csv("CountryCodes.csv", sep=";")
    countries = coucodes["ISO Code"].tolist()

    education = pd.read_csv("tickers.csv", encoding="ISO-8859-1")
    education = education[education["Topic"] == area]
    wb_tickers = education.loc[
        education["Source"] == "WB", "Indicator ticker"
    ].tolist()

    # The data is requested in three chunks, newest first, and stacked in that
    # same order so the resulting row order matches the request order.
    periods = [(2010, 2020), (2000, 2009), (1990, 1999)]
    downloads = [
        pandas_datareader.wb.download(
            indicator=wb_tickers,
            country=countries,
            start=start,
            end=end,
            freq="A",
        )
        for start, end in periods
    ]
    worldbank_data = pd.concat(downloads)

    return (worldbank_data, education)

In [4]:
# Treat the missing data

def impute_missing():
    """Fill the gaps in the global ``worldbank_data_idx`` and return the result.

    The work happens in four steps:

    1. Within each country, interior gaps are linearly interpolated
       (``limit_area="inside"``, so leading/trailing gaps are left alone).
       The interpolated frame is written back to the global
       ``worldbank_data_idx``.
    2. The country is one-hot encoded so that it is passed to the imputer as
       a set of feature columns.
    3. Infinite values are turned into NaN and the remaining gaps are filled
       with ``IterativeImputer``.
    4. The country dummy columns are dropped again and the country becomes
       the index of the returned frame.

    Returns:
        A DataFrame indexed by ``country`` holding the imputed columns.
    """
    global worldbank_data_idx
    worldbank_data_idx = worldbank_data_idx.groupby("country").apply(
        lambda group: group.interpolate(method="linear", limit_area="inside")
    )

    by_country = worldbank_data_idx.set_index("country")
    by_country["country"] = by_country.index
    encoded = pd.get_dummies(by_country, columns=["country"])
    encoded.replace([np.inf, -np.inf], np.nan, inplace=True)

    imputed = IterativeImputer().fit_transform(encoded)

    # Rows of `imputed` are in the same order as rows of `encoded`, so the
    # country labels are attached positionally by reusing its index. Assigning
    # worldbank_data_idx["country"] as a column instead would align on index
    # labels and yield NaN countries whenever that frame's index is not 0..n-1.
    worldbank_data_filled = pd.DataFrame(
        imputed, columns=encoded.columns, index=encoded.index
    )

    # Drop every column whose name matches 'country' (the one-hot dummies).
    dummy_columns = list(worldbank_data_filled.filter(regex="country"))
    return worldbank_data_filled.drop(columns=dummy_columns)

In [5]:
# Scaling the variables
# A nonlinear (quantile) scaler is applied, then Holt smoothing is used to forecast

class SklearnWrapper:
    """Callable adapter that applies a fit/transform object to a DataFrame.

    Calling the instance runs ``fit_transform`` on the frame's raw values and
    wraps the result in a new DataFrame that keeps the input's index and
    column labels.
    """

    def __init__(self, transform: typing.Callable):
        # The stored object is used through its ``fit_transform`` method.
        self.transform = transform

    def __call__(self, df):
        """Return ``fit_transform(df.values)`` labelled like *df*."""
        values = self.transform.fit_transform(df.values)
        return pd.DataFrame(data=values, index=df.index, columns=df.columns)




def scale_and_forecast():
    """Quantile-scale the imputed data and append a five-step Holt forecast.

    Reads the module-level ``worldbank_data_filled`` frame, which is used here
    as a table indexed by country that contains a ``year`` column. Every
    column is passed through a ``QuantileTransformer``; the ``year`` column is
    then restored from the unscaled data. For each country and each column a
    damped Holt model is fitted and five further values are forecast, labelled
    2020-2024. No confidence bounds are produced.

    Returns:
        A DataFrame indexed by ``(country, year)`` containing the scaled
        history followed by the forecast rows.
    """
    forecast_steps = 5
    forecast_years = [2020, 2021, 2022, 2023, 2024]

    nonlinear_scaler = preprocessing.QuantileTransformer(random_state=0)
    x_scaled = nonlinear_scaler.fit_transform(worldbank_data_filled.values)

    worldbank_data_scaled = pd.DataFrame(
        x_scaled, columns=worldbank_data_filled.columns
    )
    # The scaler also transformed 'year'; put the real years back.
    worldbank_data_scaled["year"] = worldbank_data_filled["year"].values.astype(int)
    worldbank_data_scaled["country"] = worldbank_data_filled.index

    # Rows are reversed before fitting — presumably the input is ordered
    # newest-first and the model needs oldest-first; verify against the loader.
    # Exact zeros are replaced by 0.01 — presumably to keep the series strictly
    # positive for the smoother; TODO confirm.
    history = worldbank_data_scaled.iloc[::-1].replace(0, 0.01)

    forecast = []
    for c in history["country"].unique():
        # Non-inplace set_index: the filtered frame is a slice of `history`,
        # and mutating it in place triggers SettingWithCopyWarning.
        country_history = history[history["country"] == c].set_index(
            ["country", "year"]
        )

        cols = []
        results = []
        for column in country_history.columns:
            cols.append(column)
            # NOTE(review): newer statsmodels releases may name this argument
            # `damped_trend` — confirm against the installed version.
            model = Holt(np.asarray(country_history[column]), damped=True)
            results.append(model.fit().forecast(forecast_steps))

        # One row per forecast step, one column per indicator.
        output = pd.DataFrame(results).T
        output.columns = cols
        output["year"] = forecast_years
        output["country"] = c
        forecast.append(output)

    forecast = pd.concat(forecast, axis=0, ignore_index=True)

    worldbank_data_scaled = pd.concat([worldbank_data_scaled, forecast])
    worldbank_data_scaled.set_index(["country", "year"], inplace=True)

    return worldbank_data_scaled

In [7]:
def plot_variable():
    """Write one indicator/forecast chart per country as a PNG file.

    Reads the module-level ``merged_data`` frame (columns used: ``country``,
    ``year``, ``indicator``, ``ic1``, ``ic2``) and the module-level ``area``
    string. As a side effect ``merged_data`` is modified: a ``forecast``
    column is added holding the indicator for years after 2019, and
    ``indicator`` is blanked for years from 2020 on.

    For every distinct country a figure with the indicator, the forecast and
    the ``ic1``/``ic2`` band is written to ``<country>_<area>_indicator.png``
    in the current working directory.
    """
    # NOTE(review): using '' as the filler may coerce the numeric values to
    # strings inside np.where — confirm this is what the plots should receive.
    # NOTE(review): calling this function twice on the same merged_data blanks
    # the forecast, because 'indicator' was already emptied for those years.
    merged_data["forecast"] = np.where(merged_data['year'] > 2019, merged_data["indicator"], '')
    merged_data["indicator"] = np.where(merged_data['year'] < 2020, merged_data["indicator"], '')

    lista = merged_data["country"].unique()

    for iso in lista:
        # Non-inplace sort: the filtered frame is a slice of merged_data, and
        # sorting it in place triggers SettingWithCopyWarning.
        merged_data_c = merged_data[merged_data["country"] == iso].sort_values(by=['year'])

        fig = go.Figure()

        # NOTE(review): both line_color and line['color'] are supplied on the
        # next two traces — confirm which colour plotly actually applies.
        fig.add_trace(go.Scatter(
                            x=merged_data_c["year"],
                            y=merged_data_c['indicator'],
                            name="Indicator",
                            line_color='green',
                            opacity=1,
                            line=dict(color='deepskyblue', width=4
                                          )))
        fig.add_trace(go.Scatter(
                            x=merged_data_c["year"],
                            y=merged_data_c['forecast'],
                            name="Forecast",
                            line_color='rgb(264, 45, 45)',
                            line=dict(color='deepskyblue', width=4, dash='dot'
                                          )))

        # Lower and upper bounds are drawn as zero-width lines; the upper one
        # fills down to the previous trace to shade the band between them.
        fig.add_trace(go.Scatter(
                            x=merged_data_c["year"],
                            y=merged_data_c['ic1'],
                            name="Lower Bound",
                            marker=dict(color="#444"),
                            line=dict(width=0),
                            mode='lines'))

        fig.add_trace(go.Scatter(
                            x=merged_data_c["year"],
                            y=merged_data_c['ic2'],
                            name="Upper Bound",
                            marker=dict(color="#444"),
                            line=dict(width=0),
                            mode='lines',
                            fillcolor='rgba(68, 68, 68, 0.3)',
                            fill='tonexty'))

        fig.update_layout(
                autosize=False,
                width=1000,
                height=600,
                yaxis=dict(
                range=[0, 1]),
                xaxis = dict(
                tickangle=45)
            )

        fig.write_image(f"{iso}_{area}_indicator.png")