In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import plotly.express as px

In [None]:
def predict_next_5_years(df, future_years=None):
    """Extrapolate each indicator column of a single region with a linear trend.

    For every column other than ``geo`` and ``Year`` a straight line is fitted
    to the observed ``(Year, value)`` pairs and evaluated at ``future_years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one region. Must contain a ``geo`` column (its first unique
        value labels the output) and a ``Year`` column; every remaining column
        is treated as a target to forecast.
    future_years : sequence of int, optional
        Years to predict for. Defaults to 2022-2027.
        NOTE(review): the default spans six years although the function name
        says five; it is kept as-is so existing results do not change.

    Returns
    -------
    pandas.DataFrame
        One row per requested year with columns ``geo``, ``Year`` and one
        prediction column per target (rounded to 6 decimals), sorted by
        ``geo`` then ``Year``. A target with no usable observations is filled
        with NaN instead of raising.
    """
    if future_years is None:
        future_years = [2022, 2023, 2024, 2025, 2026, 2027]
    future_years = list(future_years)

    geo = df['geo'].unique()[0]
    geo_predictions_df = pd.DataFrame({
        'geo': [geo] * len(future_years),
        'Year': future_years,
    })
    # Predict on a frame that carries the same feature name used for fitting,
    # so scikit-learn does not warn about missing feature names.
    future_frame = pd.DataFrame({'Year': future_years})

    history = df.drop(columns='geo').sort_values(by='Year')

    for column in history.columns:
        if column == 'Year':
            continue

        # Drop NaNs per target so a gap in one indicator does not throw away
        # that year's observations for every other indicator.
        observed = history[['Year', column]].dropna()
        if observed.empty:
            # Nothing to fit on: keep the column, filled with NaN.
            geo_predictions_df[column] = np.nan
            continue

        # Degree-1 polynomial features + OLS == ordinary linear trend in Year.
        pipeline = make_pipeline(
            PolynomialFeatures(degree=1, include_bias=False),
            LinearRegression()
        )
        pipeline.fit(X=observed[['Year']], y=observed[column])
        future_predictions = pipeline.predict(future_frame)
        geo_predictions_df[column] = future_predictions.round(6)

    return geo_predictions_df.sort_values(by=['geo', 'Year'], ascending=[True, True])


In [None]:
# Load the per-capita Eurostat table and discard the index column that was
# written out with the CSV ("Unnamed: 0").
df_eurostat = pd.read_csv(
    "https://raw.githubusercontent.com/SDuncan5/Eurostat-Data/main/eurostat_pcap_no_nans.csv"
).drop(columns=["Unnamed: 0"])
df_eurostat

Unnamed: 0,geo,Year,CPI,Immigrants,Population,Housing Index,GDP,emigration,unemployment,total_deaths,Exports,Imports,Immigrants_pcap,GDP_pcap,Emigrants_pcap,Deaths_pcap,Exports_pcap,Imports_pcap
0,Austria,2011,93.35,82230.0,8391643.0,81.60,310128.7,51197.0,3.3,76479.0,127462.4,137512.5,0.009799,36956.851000,0.006101,0.009114,0.015189,0.016387
1,Austria,2012,95.75,91557.0,8429991.0,87.57,318653.0,51812.0,3.5,79436.0,129678.5,138942.4,0.010861,37799.921732,0.006146,0.009423,0.015383,0.016482
2,Austria,2013,97.77,101866.0,8479823.0,92.10,323910.2,54071.0,3.8,79526.0,131884.6,137999.8,0.012013,38197.754835,0.006376,0.009378,0.015553,0.016274
3,Austria,2014,99.20,116262.0,8546356.0,95.33,333146.1,53491.0,4.0,78252.0,134172.5,137001.2,0.013604,38981.069827,0.006259,0.009156,0.015699,0.016030
4,Austria,2015,100.00,166323.0,8642699.0,100.00,344269.2,56689.0,4.1,83073.0,137756.8,140699.2,0.019244,39833.528855,0.006559,0.009612,0.015939,0.016280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Slovakia,2017,100.90,7188.0,5439232.0,112.99,84669.9,3466.0,5.4,53914.0,73790.1,72191.9,0.001322,15566.517479,0.000637,0.009912,0.013566,0.013272
281,Slovakia,2018,103.46,7253.0,5446771.0,121.32,89874.7,3298.0,4.3,54293.0,79136.9,78727.4,0.001332,16500.546838,0.000605,0.009968,0.014529,0.014454
282,Slovakia,2019,106.33,7016.0,5454147.0,132.39,94429.7,3384.0,3.8,53234.0,79962.0,80407.4,0.001286,17313.376409,0.000620,0.009760,0.014661,0.014742
283,Slovakia,2020,108.47,6775.0,5458827.0,145.06,93444.1,2428.0,4.4,59089.0,75586.2,73700.2,0.001241,17117.981574,0.000445,0.010824,0.013847,0.013501


In [None]:
# Forecast each region separately, then stitch the results together once.
# Collecting the frames and concatenating a single time avoids re-copying the
# accumulated result on every iteration and avoids concatenating onto an
# empty placeholder frame.
# sort=False keeps the regions in their order of first appearance.
predicted_frames = [
    predict_next_5_years(geo_data)
    for _, geo_data in df_eurostat.groupby('geo', sort=False)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
predicted_df = (
    pd.concat(predicted_frames, ignore_index=True)
    if predicted_frames
    else pd.DataFrame()
)

In [None]:
# One line per region: predicted GDP against forecast year.
fig1 = px.line(
    predicted_df,
    x="Year",
    y="GDP",
    color="geo",
    markers=True,
    title="Predicted GDP",
    width=800,
    height=800,
)
fig1.show()