<a href="https://colab.research.google.com/github/Physic1990/Data_Science_final_project/blob/main/Future_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


ARIMA Model

In [2]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import plotly.graph_objects as go
import warnings

# Load the dataset with specified encoding
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Remove unnecessary columns
#data = data.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)

# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015', '2022', '2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Transpose the DataFrame to set 'Country' as index
data = data.set_index('Country').T

# Suppress ARIMA warnings
warnings.filterwarnings("ignore")

# Dictionary to store predictions for each country
predictions = {}

# Iterate over each country
for country in data.columns:
    try:
        # Copy the original data
        df = data[[country]].copy()

        # Convert the data type of the country column to numeric
        df[country] = pd.to_numeric(df[country], errors='coerce')

        # Train the ARIMA model
        model = ARIMA(df[country], order=(5, 1, 0))  # Example order, you may need to tune this
        model_fit = model.fit()

        # Make predictions for the next 50 years
        forecast = model_fit.forecast(steps=50)

        # Store the predictions
        predictions[country] = forecast
    except Exception as e:
        print(f"Error occurred while processing {country}: {e}")

# Create a figure
fig = go.Figure()

# Add traces for each country's predicted values
for country, forecast in predictions.items():
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name=country))

# Update layout
fig.update_layout(title='Predicted Hunger Index for Each Country Over the Next 50 Years',
                  xaxis_title='Year',
                  yaxis_title='Hunger Index')

# Show the plot
fig.show()

# Create individual line charts for each country
for country, forecast in predictions.items():
    # Create a figure
    fig = go.Figure()

    # Add trace for historical data
    fig.add_trace(go.Scatter(x=data.index, y=data[country], mode='lines', name='Historical Data'))

    # Add trace for predicted values
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name='Predicted Data'))

    # Update layout
    fig.update_layout(title=f'Hunger Index for {country} (Historical and Predicted)',
                      xaxis_title='Year',
                      yaxis_title='Hunger Index')

    # Show the plot
    fig.show()

Error occurred while processing Albania: LU decomposition error.
Error occurred while processing Sierra Leone: LU decomposition error.


ARIMA Model:
we're unable to find valid ARIMA parameters even with the expanded range, it might be beneficial to explore alternative modeling approaches.

In [1]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools
import numpy as np

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015','2022','2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Interpolate missing values
data.interpolate(method='linear', inplace=True)

# Model Evaluation and Hyperparameter Tuning
best_score = float('inf')
best_params = None

# Define a wider range of ARIMA parameters to search
p_values = range(0, 5)  # range for p
d_values = range(0, 2)  # range for d (keeping it lower to avoid over-differencing)
q_values = range(0, 5)  # range for q

# Iterate over all possible combinations of p, d, and q
for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        # Split the data into train and test sets
        train_size = int(len(data) * 0.8)
        train, test = data.iloc[:train_size], data.iloc[train_size:]

        # Fit the ARIMA model
        model = ARIMA(train, order=order)
        model_fit = model.fit()

        # Make predictions
        forecast = model_fit.forecast(steps=len(test))[0]

        # Calculate MAE and MSE
        mae = mean_absolute_error(test, forecast)
        mse = mean_squared_error(test, forecast)

        # Check if this is the best model so far
        if mse < best_score:
            best_score = mse
            best_params = order

        print(f'ARIMA Order: {order} | MAE: {mae:.2f} | MSE: {mse:.2f}')
    except:
        continue

# Check if valid ARIMA parameters were found
if best_params is not None:
    print(f'Best ARIMA Order: {best_params} | Best MSE: {best_score:.2f}')

    # Retrain the ARIMA model using the best parameters
    model = ARIMA(data, order=best_params)
    model_fit = model.fit()

    # Make predictions for the next 10 years
    forecast = model_fit.forecast(steps=10)

    # Print the forecasted values
    print("Forecasted GHI Scores for the Next 10 Years:")
    print(forecast)
else:
    print("No valid ARIMA parameters found even with the expanded range. Please consider other modeling approaches.")


No valid ARIMA parameters found even with the expanded range. Please consider other modeling approaches.


SARIMA model
The prediction for the next 10 years represents the forecasted Global Hunger Index (GHI) scores for a single time series, which is created by taking the mean across countries' GHI scores. Therefore, this prediction is not specifically for any individual country but rather represents an aggregate forecast for all countries included in the dataset.

The index values correspond to the predicted GHI scores for consecutive years. Each value indicates the forecasted GHI score for a specific year in the future, starting from the current year and extending for the next 10 years.

In [10]:
import pandas as pd
from pmdarima import auto_arima

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015','2022','2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Interpolate missing values
data.interpolate(method='linear', inplace=True)

# Drop rows with any remaining missing values
data.dropna(inplace=True)

# Ensure all values are numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Create a single time series from the data
time_series = data.mean(axis=1)  # Assuming the mean across countries as the overall time series

# Parameter Selection using auto_arima
sarima_model = auto_arima(time_series, seasonal=True, m=12, stepwise=True, suppress_warnings=True, error_action="ignore")

# Fit the SARIMA model
sarima_model.fit(time_series)

# Forecasting for the next 10 years
forecast = sarima_model.predict(n_periods=10)

# Print the forecasted values
print("Forecasted GHI Scores for the Next 10 Years:")
print(forecast)


Forecasted GHI Scores for the Next 10 Years:
115    13.035201
116    16.454690
117    18.454809
118    16.413364
119    17.457370
120    17.116333
121    17.156322
122    17.193030
123    17.159875
124    17.176031
dtype: float64


SARIMA Model

In [27]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go
import warnings

# Load the dataset with specified encoding
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Remove unnecessary columns
#data = data.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)

# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015', '2022', '2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Transpose the DataFrame to set 'Country' as index
data = data.set_index('Country').T

# Suppress SARIMA warnings
warnings.filterwarnings("ignore")

# Dictionary to store predictions for each country
predictions = {}

# Iterate over each country
for country in data.columns:
    try:
        # Copy the original data
        df = data[[country]].copy()

        # Convert the data type of the country column to numeric
        df[country] = pd.to_numeric(df[country], errors='coerce')

        # Train the SARIMA model
        model = SARIMAX(df[country], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Example orders, you may need to tune this
        model_fit = model.fit()

        # Make predictions for the next 50 years
        forecast = model_fit.forecast(steps=50)

        # Store the predictions
        predictions[country] = forecast
    except Exception as e:
        print(f"Error occurred while processing {country}: {e}")

# Create a figure
fig = go.Figure()

# Add traces for each country's predicted values
for country, forecast in predictions.items():
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name=country))

# Update layout
fig.update_layout(title='Predicted Hunger Index for Each Country Over the Next 50 Years',
                  xaxis_title='Year',
                  yaxis_title='Hunger Index')

# Show the plot
fig.show()

# Create individual line charts for each country
for country, forecast in predictions.items():
    # Create a figure
    fig = go.Figure()

    # Add trace for historical data
    fig.add_trace(go.Scatter(x=data.index, y=data[country], mode='lines', name='Historical Data'))

    # Add trace for predicted values
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name='Predicted Data'))

    # Update layout
    fig.update_layout(title=f'Hunger Index for {country} (Historical and Predicted)',
                      xaxis_title='Year',
                      yaxis_title='Hunger Index')

    # Show the plot
    fig.show()


The main difference between ARIMA (AutoRegressive Integrated Moving Average) and SARIMA (Seasonal AutoRegressive Integrated Moving Average) models lies in their ability to handle seasonality in time series data.

ARIMA (AutoRegressive Integrated Moving Average):

ARIMA models are designed to capture non-seasonal patterns in time series data.
They consist of three main components: AutoRegressive (AR), Integrated (I), and Moving Average (MA) terms.
The AR term captures the relationship between an observation and a certain number of lagged observations (auto-correlation).
The I term represents differencing of the series to make it stationary (i.e., removing trends).
The MA term captures the dependency between an observation and a residual error from a moving average model applied to lagged observations.


SARIMA (Seasonal AutoRegressive Integrated Moving Average):

SARIMA extends ARIMA to handle seasonal patterns in addition to non-seasonal ones.
In SARIMA, additional seasonal ARIMA terms are added to the model to capture seasonal variations in the data.
Seasonal ARIMA terms include seasonal auto-regressive (SAR), seasonal differencing (I), and seasonal moving average (SMA) terms.
The seasonal AR term captures the relationship between an observation and a lagged observation from the same season in previous years.
The seasonal I term represents differencing at the seasonal level to remove seasonal trends.
The seasonal MA term captures the dependency between an observation and a residual error from a moving average model applied to lagged observations within the same season.


In summary, while ARIMA is suitable for modeling non-seasonal time series data, SARIMA is more appropriate when dealing with time series data that exhibit seasonal patterns. By incorporating additional seasonal components, SARIMA models can provide more accurate forecasts for such data.


Code difference:



Model Evaluation: We'll calculate and print the RMSE for each country's predictions.
Residual Analysis: We'll plot the residuals to check for any patterns or anomalies.
Interactive Visualization: We'll use Plotly to create interactive plots for easier exploration of the data and predictions.

In [29]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go
import warnings
import numpy as np
from sklearn.metrics import mean_squared_error

# Load the dataset with specified encoding
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Remove unnecessary columns
#data = data.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)

# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015', '2022', '2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Transpose the DataFrame to set 'Country' as index
data = data.set_index('Country').T

# Suppress SARIMA warnings
warnings.filterwarnings("ignore")

# Dictionary to store predictions and residuals for each country
predictions = {}
residuals = {}

# Iterate over each country
for country in data.columns:
    try:
        # Copy the original data
        df = data[[country]].copy()

        # Convert the data type of the country column to numeric
        df[country] = pd.to_numeric(df[country], errors='coerce')

        # Train the SARIMA model
        model = SARIMAX(df[country], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Example orders, you may need to tune this
        model_fit = model.fit()

        # Make predictions for the next 50 years
        forecast = model_fit.forecast(steps=50)

        # Debug statement to check lengths
        print(f"Country: {country}, Input Data Length: {len(df[country])}, Forecast Length: {len(forecast)}")

        # Store the predictions
        predictions[country] = forecast
    except Exception as e:
        print(f"Error occurred while processing {country}: {e}")


# Plot residuals for each country
for country, res in residuals.items():
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=actual.index, y=res, mode='lines', name='Residuals'))
    fig.update_layout(title=f'Residuals for {country}',
                      xaxis_title='Year',
                      yaxis_title='Residuals')
    fig.show()

# Create a figure for predicted values
fig = go.Figure()

# Add traces for each country's predicted values
for country, forecast in predictions.items():
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name=country))

# Update layout
fig.update_layout(title='Predicted Hunger Index for Each Country Over the Next 50 Years',
                  xaxis_title='Year',
                  yaxis_title='Hunger Index')

# Show the plot
fig.show()

# Create individual line charts for each country
for country, forecast in predictions.items():
    # Create a figure
    fig = go.Figure()

    # Add trace for historical data
    fig.add_trace(go.Scatter(x=data.index, y=data[country], mode='lines', name='Historical Data'))

    # Add trace for predicted values
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name='Predicted Data'))

    # Update layout
    fig.update_layout(title=f'Hunger Index for {country} (Historical and Predicted)',
                      xaxis_title='Year',
                      yaxis_title='Hunger Index')

    # Show the plot
    fig.show()


Country: Afghanistan, Input Data Length: 6, Forecast Length: 50
Country: Albania, Input Data Length: 6, Forecast Length: 50
Country: Algeria, Input Data Length: 6, Forecast Length: 50
Country: Angola, Input Data Length: 6, Forecast Length: 50
Country: Argentina, Input Data Length: 6, Forecast Length: 50
Country: Armenia, Input Data Length: 6, Forecast Length: 50
Country: Azerbaijan, Input Data Length: 6, Forecast Length: 50
Country: Bangladesh, Input Data Length: 6, Forecast Length: 50
Country: Belarus, Input Data Length: 6, Forecast Length: 50
Country: Benin, Input Data Length: 6, Forecast Length: 50
Country: Bolivia (Plurinational State of), Input Data Length: 6, Forecast Length: 50
Country: Bosnia & Herzegovina, Input Data Length: 6, Forecast Length: 50
Country: Botswana, Input Data Length: 6, Forecast Length: 50
Country: Brazil, Input Data Length: 6, Forecast Length: 50
Country: Bulgaria, Input Data Length: 6, Forecast Length: 50
Country: Burkina Faso, Input Data Length: 6, Forecas

Gaussian Process Regression (GPR)

In [33]:
import pandas as pd
import plotly.graph_objects as go
import warnings
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import numpy as np

# Load the dataset with specified encoding
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
# Remove unnecessary columns
#data = data.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)

# Convert numeric columns to the appropriate data type
for year in ['2007', '2008', '2014', '2015', '2022', '2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Transpose the DataFrame to set 'Country' as index
data = data.set_index('Country').T

# Suppress warnings
warnings.filterwarnings("ignore")

# Dictionary to store predictions and residuals for each country
predictions = {}

# Define kernel for Gaussian Process Regression
kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Iterate over each country
for country in data.columns:
    try:
        # Copy the original data
        df = data[[country]].copy()

        # Convert the data type of the country column to numeric
        df[country] = pd.to_numeric(df[country], errors='coerce')

        # Prepare data for GPR
        X = np.array(df.index).reshape(-1, 1)
        y = df[country].values

        # Train the GPR model
        model = GaussianProcessRegressor(kernel=kernel, random_state=0)
        model.fit(X, y)

        # Make predictions for the next 50 years
        forecast_x = np.array(range(2023, 2073)).reshape(-1, 1)
        forecast = model.predict(forecast_x)

        # Debug statement to check lengths
       # print(f"Country: {country}, Input Data Length: {len(X)}, Forecast Length: {len(forecast)}")

        # Store the predictions
        predictions[country] = forecast
    except Exception as e:
        print(f"Error occurred while processing {country}: {e}")

# Create individual line charts for each country
for country, forecast in predictions.items():
    # Create a figure
    fig = go.Figure()

    # Add trace for historical data
    fig.add_trace(go.Scatter(x=data.index, y=data[country], mode='lines', name='Historical Data'))

    # Add trace for predicted values
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast, mode='lines', name='Predicted Data'))

    # Update layout
    fig.update_layout(title=f'Hunger Index for {country} (Historical and Predicted)',
                      xaxis_title='Year',
                      yaxis_title='Hunger Index')

    # Show the plot
    fig.show()


Country: Afghanistan, Input Data Length: 6, Forecast Length: 50
Country: Albania, Input Data Length: 6, Forecast Length: 50
Country: Algeria, Input Data Length: 6, Forecast Length: 50
Country: Angola, Input Data Length: 6, Forecast Length: 50
Country: Argentina, Input Data Length: 6, Forecast Length: 50
Country: Armenia, Input Data Length: 6, Forecast Length: 50
Country: Azerbaijan, Input Data Length: 6, Forecast Length: 50
Country: Bangladesh, Input Data Length: 6, Forecast Length: 50
Country: Belarus, Input Data Length: 6, Forecast Length: 50
Country: Benin, Input Data Length: 6, Forecast Length: 50
Country: Bolivia (Plurinational State of), Input Data Length: 6, Forecast Length: 50
Country: Bosnia & Herzegovina, Input Data Length: 6, Forecast Length: 50
Country: Botswana, Input Data Length: 6, Forecast Length: 50
Country: Brazil, Input Data Length: 6, Forecast Length: 50
Country: Bulgaria, Input Data Length: 6, Forecast Length: 50
Country: Burkina Faso, Input Data Length: 6, Forecas

In [4]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go
import warnings
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import numpy as np

# Load the dataset with specified encoding
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/more_years.csv', encoding='ISO-8859-1')

# Data Preprocessing
#data = data.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)

for year in ['2007', '2008', '2014', '2015', '2022', '2023']:
    data[year] = pd.to_numeric(data[year], errors='coerce')

# Transpose the DataFrame to set 'Country' as index
data = data.set_index('Country').T

# Suppress warnings
warnings.filterwarnings("ignore")

# Dictionary to store predictions for each model
predictions_arima = {}
predictions_sarima = {}
predictions_gpr = {}

# Iterate over each country
country = 'Afghanistan'  # Change this to the desired country
try:
    # Copy the original data
    df = data[[country]].copy()

    # Convert the data type of the country column to numeric
    df[country] = pd.to_numeric(df[country], errors='coerce')

    # Train the ARIMA model
    model_arima = ARIMA(df[country], order=(5, 1, 0))
    model_fit_arima = model_arima.fit()
    forecast_arima = model_fit_arima.forecast(steps=50)
    predictions_arima[country] = forecast_arima

    # Train the SARIMA model
    model_sarima = SARIMAX(df[country], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    model_fit_sarima = model_sarima.fit()
    forecast_sarima = model_fit_sarima.forecast(steps=50)
    predictions_sarima[country] = forecast_sarima

    # Train the Gaussian Process Regression model
    kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    model_gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    X = np.array(df.index).reshape(-1, 1)
    y = df[country].values
    model_gpr.fit(X, y)
    forecast_x_gpr = np.array(range(2023, 2073)).reshape(-1, 1)
    forecast_gpr = model_gpr.predict(forecast_x_gpr)
    predictions_gpr[country] = forecast_gpr

    # Create a figure for predicted values
    fig = go.Figure()

    # Add traces for each model's predicted values
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast_arima, mode='lines', name='ARIMA'))
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast_sarima, mode='lines', name='SARIMA'))
    fig.add_trace(go.Scatter(x=list(range(2023, 2073)), y=forecast_gpr, mode='lines', name='GPR'))

    # Update layout
    fig.update_layout(title=f'Predicted Hunger Index for {country} Over the Next 50 Years',
                      xaxis_title='Year',
                      yaxis_title='Hunger Index')

    # Show the plot
    fig.show()
except Exception as e:
    print(f"Error occurred while processing {country}: {e}")
