In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("FAO,DF_SDG_2_1_1,1.0+all.csv")

# Filter for focus countries
focus_countries = ["India", "Nigeria", "Brazil"]
df = df[df["Area"].isin(focus_countries)]

# Keep only percentage-based entries
df = df[df["UNIT_MEASURE"] == "PT"]

# Select and rename columns
df = df[["Area", "TIME_PERIOD", "OBS_VALUE"]]
df.columns = ["Country", "Year", "Undernourishment (%)"]

# Clean '<2.5' and convert Year
df["Year"] = df["Year"].astype(int)
df["Undernourishment (%)"] = df["Undernourishment (%)"].replace(r"<", "", regex=True)

# Preview
df.head()

Unnamed: 0,Country,Year,Undernourishment (%)
4079,Nigeria,2001,8.8
4080,Nigeria,2002,8.8
4081,Nigeria,2003,8.5
4082,Nigeria,2004,7.8
4083,Nigeria,2005,7.0


In [2]:
# SARIMA model for Nigeria
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Filter training and test data
train = df[(df["Country"] == "Nigeria") & (df["Year"] <= 2015)].copy()
test = df[(df["Country"] == "Nigeria") & (df["Year"] >= 2016)].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].astype(float)

# Check data characteristics
print("Nigeria data characteristics:")
print(f"Training data points: {len(train)}")
print(f"Range: {train['Undernourishment (%)'].min():.2f} to {train['Undernourishment (%)'].max():.2f}")
print(f"Mean: {train['Undernourishment (%)'].mean():.2f}")

# Fit SARIMA with enhanced seasonal configuration
model = auto_arima(
    train["Undernourishment (%)"],
    seasonal=True,       # Enable seasonality
    m=1,                 # Annual data
    start_p=0, max_p=3,  # Control AR terms
    start_q=0, max_q=3,  # Control MA terms
    d=None,              # Auto-detect differencing
    trace=True,          # Show model selection process
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True        # Use stepwise selection for efficiency
)

# Print model details
print(f"\nBest model: {model.order}")

# Forecast for 10 years
forecast = model.predict(n_periods=10)

# Create forecast DataFrame
nigeria_sarima_forecast = pd.DataFrame({
    "Year": list(range(2016, 2026)),
    "SARIMA": forecast
})

Nigeria data characteristics:
Training data points: 15
Range: 6.50 to 10.90
Mean: 8.74
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=60.215, Time=0.21 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=37.091, Time=0.17 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.24 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=110.085, Time=0.04 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=17.313, Time=0.26 sec
 ARIMA(3,0,0)(0,0,0)[0] intercept   : AIC=18.307, Time=0.27 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=18.296, Time=0.18 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=26.725, Time=0.30 sec
 ARIMA(3,0,1)(0,0,0)[0] intercept   : AIC=20.282, Time=0.48 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=28.159, Time=0.11 sec

Best model:  ARIMA(2,0,0)(0,0,0)[0] intercept
Total fit time: 2.342 seconds

Best model: (2, 0, 0)


  return get_prediction_index(
  return get_prediction_index(


In [3]:
# Prepare evaluation data
eval_df = df[(df["Country"] == "Nigeria") & (df["Year"].between(2016, 2022))].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].astype(float)

# Add SARIMA predictions
eval_df = eval_df.merge(
    nigeria_sarima_forecast[["Year", "SARIMA"]],
    on="Year",
    how="left"
)

# Create naive model (last observed value)
last_value = train["Undernourishment (%)"].iloc[-1]
eval_df["Naive"] = last_value

# Calculate metrics for SARIMA
y_true = eval_df["Undernourishment (%)"]
y_pred_sarima = eval_df["SARIMA"]
y_pred_naive = eval_df["Naive"]

# SARIMA metrics
mae_sarima = mean_absolute_error(y_true, y_pred_sarima)
rmse_sarima = mean_squared_error(y_true, y_pred_sarima, squared=False)
r2_sarima = r2_score(y_true, y_pred_sarima)

# Naive metrics
mae_naive = mean_absolute_error(y_true, y_pred_naive)
rmse_naive = mean_squared_error(y_true, y_pred_naive, squared=False)

# Print metrics
print("Nigeria - SARIMA Evaluation Metrics")
print(f"MAE: {mae_sarima:.4f}")
print(f"RMSE: {rmse_sarima:.4f}")
print(f"R²: {r2_sarima:.4f}")

# Compare with naive model
print("\nNaive model metrics:")
print(f"MAE: {mae_naive:.4f}")
print(f"RMSE: {rmse_naive:.4f}")

# Create comparison table
results = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": y_true,
    "SARIMA": y_pred_sarima,
    "Error": y_true - y_pred_sarima
})

print("\nYear-by-year comparison:")
print(results[["Year", "Actual", "SARIMA", "Error"]])

Nigeria - SARIMA Evaluation Metrics
MAE: 5.5973
RMSE: 6.6062
R²: -5.1898

Naive model metrics:
MAE: 3.4286
RMSE: 4.3366

Year-by-year comparison:
   Year  Actual    SARIMA      Error
0  2016    10.7  9.987876   0.712124
1  2017    11.3  9.298853   2.001147
2  2018    11.8  8.582275   3.217725
3  2019    13.6  7.972493   5.627507
4  2020    15.1  7.566367   7.533633
5  2021    17.0  7.410642   9.589358
6  2022    18.0  7.500138  10.499862




In [4]:
# India SARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Prepare India training data (2001-2015)
train = df[(df["Country"] == "India") & (df["Year"] >= 2001) & (df["Year"] <= 2015)].copy()
test = df[(df["Country"] == "India") & (df["Year"] >= 2016)].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].astype(float)

# Check training data
print("India training data summary:")
print(f"Number of years: {len(train)}")
print(f"Years: {min(train['Year'])} to {max(train['Year'])}")
print(f"Undernourishment range: {min(train['Undernourishment (%)']):.2f} to {max(train['Undernourishment (%)']):.2f}")

# Fit SARIMA model with seasonal optimization
model = auto_arima(
    train["Undernourishment (%)"],
    seasonal=True,      # Enable seasonality
    m=1,                # Annual data
    start_p=0, max_p=2, # Limit model complexity
    start_q=0, max_q=2,
    max_order=5,        # Limit total parameters
    trace=True,         # Show models tested
    suppress_warnings=True,
    stepwise=True,      # Faster search
    error_action='ignore'
)

# Print best model details
print(f"\nBest SARIMA model: {model.order}")
print(f"AIC: {model.aic()}")

# Forecast 10 years ahead (2016-2025)
forecast = model.predict(n_periods=10)

# Save forecasts to DataFrame
india_sarima_forecast = pd.DataFrame({
    "Year": list(range(2016, 2026)),
    "SARIMA": forecast
})

# Print forecast samples
print("\nForecast samples:")
print(f"2016 forecast: {india_sarima_forecast['SARIMA'].iloc[0]:.2f}")
print(f"2020 forecast: {india_sarima_forecast['SARIMA'].iloc[4]:.2f}")
print(f"2025 forecast: {india_sarima_forecast['SARIMA'].iloc[9]:.2f}")

India training data summary:
Number of years: 15
Years: 2001 to 2015
Undernourishment range: 12.20 to 22.00
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=46.409, Time=0.02 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=37.529, Time=0.08 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.25 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=46.448, Time=0.03 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=28.322, Time=0.11 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=30.322, Time=0.18 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=33.213, Time=0.09 sec
 ARIMA(2,1,0)(0,0,0)[0]             : AIC=29.116, Time=0.07 sec

Best model:  ARIMA(2,1,0)(0,0,0)[0] intercept
Total fit time: 0.863 seconds

Best SARIMA model: (2, 1, 0)
AIC: 28.322301645153203

Forecast samples:
2016 forecast: 11.64
2020 forecast: 9.02
2025 forecast: 6.47


  return get_prediction_index(
  return get_prediction_index(


In [5]:
# Evaluate on available actual values (2016-2022)
eval_df = df[(df["Country"] == "India") & df["Year"].between(2016, 2022)].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].astype(float)

# Add SARIMA predictions
eval_df = eval_df.merge(
    india_sarima_forecast[["Year", "SARIMA"]],
    on="Year",
    how="left"
)

# Calculate simple baseline (last known value)
last_value = train["Undernourishment (%)"].iloc[-1]
eval_df["Baseline"] = last_value

# Calculate metrics
y_true = eval_df["Undernourishment (%)"]
y_pred = eval_df["SARIMA"]
y_baseline = eval_df["Baseline"]

# SARIMA metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Baseline metrics
baseline_mae = mean_absolute_error(y_true, y_baseline)
baseline_rmse = mean_squared_error(y_true, y_baseline, squared=False)

# Print metrics
print("India - SARIMA Evaluation Metrics")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Print baseline comparison
print(f"\nBaseline (constant) model metrics:")
print(f"MAE: {baseline_mae:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Create comparison table
comparison = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": eval_df["Undernourishment (%)"],
    "SARIMA": eval_df["SARIMA"],
    "Error": eval_df["Undernourishment (%)"] - eval_df["SARIMA"]
})

print("\nYear-by-year comparison:")
print(comparison)

India - SARIMA Evaluation Metrics
MAE: 2.5421
RMSE: 3.4190
R²: -5.0256

Baseline (constant) model metrics:
MAE: 1.3000
RMSE: 1.3964

Year-by-year comparison:
   Year  Actual     SARIMA     Error
0  2016    11.5  11.638484 -0.138484
1  2017    10.5  10.950448 -0.450448
2  2018    10.3  10.230870  0.069130
3  2019    11.6   9.573485  2.026515
4  2020    13.1   9.021715  4.078285
5  2021    14.0   8.555272  5.444728
6  2022    13.7   8.112792  5.587208




In [6]:
# Brazil SARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Prepare training data (2001-2015)
train = df[(df["Country"] == "Brazil") & (df["Year"] >= 2001) & (df["Year"] <= 2015)].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].replace(r"<", "", regex=True).astype(float)

# Check data characteristics for Brazil (low undernourishment country)
print("Brazil training data summary:")
print(f"Number of years: {len(train)}")
print(f"Min value: {train['Undernourishment (%)'].min():.2f}")
print(f"Max value: {train['Undernourishment (%)'].max():.2f}")
print(f"Mean value: {train['Undernourishment (%)'].mean():.2f}")

# Fit SARIMA model with improved configuration for low-variance data
model = auto_arima(
    train["Undernourishment (%)"],
    seasonal=True,
    m=1,                # Annual data
    start_p=0, max_p=2, # Limit complexity
    start_q=0, max_q=2,
    d=None,             # Auto-detect differencing
    max_d=1,            # Limit differencing
    trace=True,
    suppress_warnings=True,
    stepwise=True,
    error_action='ignore'
)

# Print model details
print(f"\nBest SARIMA model: {model.order}")
print(f"AIC: {model.aic()}")

# Forecast 10 years (2016-2025)
forecast = model.predict(n_periods=10)

# Ensure forecasts are not negative (impossible for undernourishment %)
forecast = np.maximum(forecast, train["Undernourishment (%)"].min())

# Store in DataFrame
brazil_sarima_forecast = pd.DataFrame({
    "Year": list(range(2016, 2026)),
    "SARIMA": forecast
})

# Print forecast samples
print("\nForecast samples:")
print(f"2016: {brazil_sarima_forecast['SARIMA'].iloc[0]:.2f}%")
print(f"2020: {brazil_sarima_forecast['SARIMA'].iloc[4]:.2f}%")
print(f"2025: {brazil_sarima_forecast['SARIMA'].iloc[9]:.2f}%")

Brazil training data summary:
Number of years: 15
Min value: 2.50
Max value: 10.40
Mean value: 5.13
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=19.901, Time=0.05 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=8.215, Time=0.13 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=12.464, Time=0.11 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=32.046, Time=0.05 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=6.941, Time=0.17 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=8.849, Time=0.16 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=8.271, Time=0.14 sec
 ARIMA(2,1,0)(0,0,0)[0]             : AIC=7.470, Time=0.09 sec

Best model:  ARIMA(2,1,0)(0,0,0)[0] intercept
Total fit time: 0.902 seconds

Best SARIMA model: (2, 1, 0)
AIC: 6.941358064652137

Forecast samples:
2016: 2.50%
2020: 2.50%
2025: 2.50%


  return get_prediction_index(
  return get_prediction_index(


In [7]:
# Prepare test data (2016-2022)
eval_df = df[(df["Country"] == "Brazil") & df["Year"].between(2016, 2022)].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].replace(r"<", "", regex=True).astype(float)

# Add SARIMA predictions
eval_df = eval_df.merge(
    brazil_sarima_forecast[["Year", "SARIMA"]],
    on="Year",
    how="left"
)

# Add a simple baseline model (constant prediction)
baseline_value = train["Undernourishment (%)"].iloc[-1]  # Last observed value
eval_df["Baseline"] = baseline_value

# Calculate metrics
y_true = eval_df["Undernourishment (%)"]
y_pred = eval_df["SARIMA"]
y_baseline = eval_df["Baseline"]

# SARIMA metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Baseline metrics
baseline_mae = mean_absolute_error(y_true, y_baseline)
baseline_rmse = mean_squared_error(y_true, y_baseline, squared=False)

# Print SARIMA metrics
print("Brazil - SARIMA Evaluation Metrics")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Print baseline comparison
print(f"\nBaseline model metrics:")
print(f"MAE: {baseline_mae:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Calculate absolute errors
eval_df["Abs_Error"] = abs(eval_df["Undernourishment (%)"] - eval_df["SARIMA"])
eval_df["Baseline_Abs_Error"] = abs(eval_df["Undernourishment (%)"] - eval_df["Baseline"])

# Print comparison table
results = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": eval_df["Undernourishment (%)"],
    "SARIMA": eval_df["SARIMA"],
    "Error": eval_df["Abs_Error"]
})

print("\nYear-by-year comparison:")
print(results)

# Check for significant improvements
years_improved = sum(eval_df["Abs_Error"] < eval_df["Baseline_Abs_Error"])
print(f"\nSARIMA outperformed baseline in {years_improved} out of {len(eval_df)} years")

Brazil - SARIMA Evaluation Metrics
MAE: 0.5714
RMSE: 0.8992
R²: -0.6774

Baseline model metrics:
MAE: 0.5714
RMSE: 0.8992

Year-by-year comparison:
   Year  Actual  SARIMA  Error
0  2016     2.5     2.5    0.0
1  2017     2.5     2.5    0.0
2  2018     2.5     2.5    0.0
3  2019     2.5     2.5    0.0
4  2020     3.4     2.5    0.9
5  2021     4.2     2.5    1.7
6  2022     3.9     2.5    1.4

SARIMA outperformed baseline in 0 out of 7 years




In [8]:
!pip install prophet



In [9]:
# Prophet model for Nigeria
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Filter and format training data (2001-2015)
train = df[(df["Country"] == "Nigeria") & (df["Year"] >= 2001) & (df["Year"] <= 2015)].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].astype(float)

# Check training data
print("Nigeria training data for Prophet:")
print(f"Years: {min(train['Year'])} to {max(train['Year'])}")
print(f"Undernourishment range: {train['Undernourishment (%)'].min():.2f}% to {train['Undernourishment (%)'].max():.2f}%")

# Prepare data for Prophet (requires ds and y columns)
train_prophet = train.rename(columns={"Year": "ds", "Undernourishment (%)": "y"})
train_prophet["ds"] = pd.to_datetime(train_prophet["ds"], format="%Y")

# Configure and fit Prophet model
model = Prophet(
    yearly_seasonality=False,  # No yearly seasonality for annual data
    daily_seasonality=False,   # No daily seasonality
    changepoint_prior_scale=0.1,  # Allow moderate flexibility in trend changes
    changepoint_range=0.9      # Allow changepoints throughout most of the range
)

# Add model components
model.add_seasonality(
    name='custom',
    period=5,          # Look for 5-year cycles
    fourier_order=1    # Simple cycle
)

# Fit the model
model.fit(train_prophet)

# Generate future dataframe for forecasting
future = model.make_future_dataframe(periods=10, freq='Y')

# Make predictions
forecast = model.predict(future)

# Extract and format forecast for 2016-2025
forecast["Year"] = forecast["ds"].dt.year
predicted = forecast[forecast["Year"].between(2016, 2025)][["Year", "yhat", "yhat_lower", "yhat_upper"]]

# Ensure no negative predictions (undernourishment can't be negative)
predicted["yhat"] = predicted["yhat"].clip(lower=0)
predicted["yhat_lower"] = predicted["yhat_lower"].clip(lower=0)

# Rename for consistency with other models
nigeria_prophet_forecast = predicted.rename(columns={
    "yhat": "Prophet",
    "yhat_lower": "Lower_CI", 
    "yhat_upper": "Upper_CI"
})

# Display forecast statistics
print("\nForecast statistics:")
print(f"Min forecast: {nigeria_prophet_forecast['Prophet'].min():.2f}%")
print(f"Max forecast: {nigeria_prophet_forecast['Prophet'].max():.2f}%")
print(f"Average forecast: {nigeria_prophet_forecast['Prophet'].mean():.2f}%")

Nigeria training data for Prophet:
Years: 2001 to 2015
Undernourishment range: 6.50% to 10.90%


11:38:25 - cmdstanpy - INFO - Chain [1] start processing
11:38:25 - cmdstanpy - INFO - Chain [1] done processing



Forecast statistics:
Min forecast: 5.28%
Max forecast: 10.60%
Average forecast: 9.04%


  dates = pd.date_range(


In [10]:
# Prepare evaluation data (2016-2022)
eval_df = df[(df["Country"] == "Nigeria") & df["Year"].between(2016, 2022)].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].astype(float)

# Add Prophet predictions
eval_df = eval_df.merge(
    nigeria_prophet_forecast[["Year", "Prophet"]],
    on="Year", 
    how="left"
)

# Create simple baseline model (last observed value)
last_value = train["Undernourishment (%)"].iloc[-1]
eval_df["Baseline"] = last_value

# Calculate metrics
y_true = eval_df["Undernourishment (%)"]
y_pred = eval_df["Prophet"]
y_baseline = eval_df["Baseline"]

# Prophet metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Baseline metrics
baseline_mae = mean_absolute_error(y_true, y_baseline)
baseline_rmse = mean_squared_error(y_true, y_baseline, squared=False)

# Print Prophet metrics
print("Nigeria - Prophet Evaluation Metrics")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Print baseline comparison
print(f"\nBaseline model metrics:")
print(f"MAE: {baseline_mae:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Create comparison table
results = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": eval_df["Undernourishment (%)"],
    "Prophet": eval_df["Prophet"],
    "Error": eval_df["Undernourishment (%)"] - eval_df["Prophet"]
})

print("\nYear-by-year comparison:")
print(results)

Nigeria - Prophet Evaluation Metrics
MAE: 4.2607
RMSE: 5.5027
R²: -3.2946

Baseline model metrics:
MAE: 3.4286
RMSE: 4.3366

Year-by-year comparison:
   Year  Actual    Prophet     Error
0  2016    10.7  10.597133  0.102867
1  2017    11.3  10.466091  0.833909
2  2018    11.8  10.335050  1.464950
3  2019    13.6  10.204009  3.395991
4  2020    15.1   8.821921  6.278079
5  2021    17.0   8.690879  8.309121
6  2022    18.0   8.559838  9.440162




In [11]:
# India Prophet Model
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Prepare training data (2001-2015)
train = df[(df["Country"] == "India") & (df["Year"].between(2001, 2015))].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].astype(float)

# Examine training data
print("India training data:")
print(f"Number of years: {len(train)}")
print(f"Year range: {train['Year'].min()} to {train['Year'].max()}")
print(f"Undernourishment range: {train['Undernourishment (%)'].min():.2f}% to {train['Undernourishment (%)'].max():.2f}%")
print(f"Trend direction: {'Decreasing' if train['Undernourishment (%)'].iloc[0] > train['Undernourishment (%)'].iloc[-1] else 'Increasing'}")

# Format data for Prophet (requires ds and y columns)
train_prophet = train.rename(columns={"Year": "ds", "Undernourishment (%)": "y"})
train_prophet["ds"] = pd.to_datetime(train_prophet["ds"], format="%Y")

# Configure Prophet model with parameters suited to India's data
model = Prophet(
    yearly_seasonality=False,        # No yearly seasonality for annual data
    daily_seasonality=False,         # No daily seasonality
    changepoint_prior_scale=0.1      # Allow some flexibility for trend changes
)

# Fit the model
model.fit(train_prophet)

# Create future dates for forecasting
future = model.make_future_dataframe(periods=10, freq='Y')

# Generate forecast
forecast = model.predict(future)

# Extract forecast for 2016-2025
forecast["Year"] = forecast["ds"].dt.year
forecast_subset = forecast[forecast["Year"].between(2016, 2025)]

# Extract relevant columns
predicted = forecast_subset[["Year", "yhat"]]

# Ensure no negative values (undernourishment can't be negative)
predicted["yhat"] = predicted["yhat"].clip(lower=0)

# Rename columns for consistency
india_prophet_forecast = predicted.rename(columns={"yhat": "Prophet"})

# Display forecast summary
print("\nIndia Prophet forecast summary:")
print(f"2016 forecast: {india_prophet_forecast['Prophet'].values[0]:.2f}%")
print(f"2020 forecast: {india_prophet_forecast['Prophet'].values[4]:.2f}%")
print(f"2025 forecast: {india_prophet_forecast['Prophet'].values[-1]:.2f}%")

11:38:25 - cmdstanpy - INFO - Chain [1] start processing
11:38:25 - cmdstanpy - INFO - Chain [1] done processing


India training data:
Number of years: 15
Year range: 2001 to 2015
Undernourishment range: 12.20% to 22.00%
Trend direction: Decreasing


  dates = pd.date_range(



India Prophet forecast summary:
2016 forecast: 10.42%
2020 forecast: 7.64%
2025 forecast: 4.86%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted["yhat"] = predicted["yhat"].clip(lower=0)


In [12]:
# Prepare evaluation data (2016-2022 actuals)
eval_df = df[(df["Country"] == "India") & df["Year"].between(2016, 2022)].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].astype(float)

# Add Prophet forecasts
eval_df = eval_df.merge(
    india_prophet_forecast[["Year", "Prophet"]],
    on="Year",
    how="left"
)

# Add simple trend baseline (last value)
last_value = train["Undernourishment (%)"].iloc[-1]
eval_df["Baseline"] = last_value

# Calculate metrics
y_true = eval_df["Undernourishment (%)"]
y_pred = eval_df["Prophet"]
y_baseline = eval_df["Baseline"]

# Prophet metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Baseline metrics
baseline_mae = mean_absolute_error(y_true, y_baseline)
baseline_rmse = mean_squared_error(y_true, y_baseline, squared=False)

# Print Prophet metrics
print("India - Prophet Evaluation Metrics")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Print baseline comparison
print(f"\nBaseline model metrics:")
print(f"MAE: {baseline_mae:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Calculate prediction errors
eval_df["Abs_Error"] = abs(eval_df["Undernourishment (%)"] - eval_df["Prophet"])
eval_df["Pct_Error"] = 100 * eval_df["Abs_Error"] / eval_df["Undernourishment (%)"]

# Create comparison table
results = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": eval_df["Undernourishment (%)"],
    "Prophet": eval_df["Prophet"],
    "Error": eval_df["Abs_Error"],
    "Error_%": eval_df["Pct_Error"].round(1)
})

print("\nYear-by-year comparison:")
print(results[["Year", "Actual", "Prophet", "Error"]])

# Identify years with largest errors
max_error_year = eval_df.loc[eval_df["Abs_Error"].idxmax(), "Year"]
print(f"\nLargest error in year: {max_error_year}")



India - Prophet Evaluation Metrics
MAE: 3.7629
RMSE: 4.6150
R²: -9.9787

Baseline model metrics:
MAE: 1.3000
RMSE: 1.3964

Year-by-year comparison:
   Year  Actual    Prophet     Error
0  2016    11.5  10.424692  1.075308
1  2017    10.5   9.729109  0.770891
2  2018    10.3   9.033526  1.266474
3  2019    11.6   8.337943  3.262057
4  2020    13.1   7.640454  5.459546
5  2021    14.0   6.944871  7.055129
6  2022    13.7   6.249287  7.450713

Largest error in year: 2022




In [13]:
# Brazil Prophet Model
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Prepare and clean training data (2001-2015)
train = df[(df["Country"] == "Brazil") & (df["Year"].between(2001, 2015))].copy()
train["Undernourishment (%)"] = train["Undernourishment (%)"].replace(r"<", "", regex=True).astype(float)

# Examine training data (Brazil has very low undernourishment)
print("Brazil training data:")
print(f"Number of years: {len(train)}")
print(f"Year range: {train['Year'].min()} to {train['Year'].max()}")
print(f"Undernourishment range: {train['Undernourishment (%)'].min():.2f}% to {train['Undernourishment (%)'].max():.2f}%")
print(f"Average: {train['Undernourishment (%)'].mean():.2f}%")

# Format data for Prophet (requires ds and y columns)
train_prophet = train.rename(columns={"Year": "ds", "Undernourishment (%)": "y"})
train_prophet["ds"] = pd.to_datetime(train_prophet["ds"], format="%Y")

# Configure Prophet model for Brazil's specific pattern
# (low values with minimal variation)
model = Prophet(
    yearly_seasonality=False,      # No yearly seasonality for annual data
    daily_seasonality=False,       # No daily seasonality
    changepoint_prior_scale=0.05,  # Lower flexibility for stable series
    n_changepoints=5              # Limit number of changepoints for smoother forecast
)

# Fit the model
model.fit(train_prophet)

# Create future dates for forecasting
future = model.make_future_dataframe(periods=10, freq='Y')

# Generate forecast
forecast = model.predict(future)

# Extract forecast for 2016-2025
forecast["Year"] = forecast["ds"].dt.year
predicted = forecast[forecast["Year"].between(2016, 2025)][["Year", "yhat"]]

# Create a clean copy to avoid warnings
predicted = predicted.copy()

# Ensure no negative values and set reasonable minimum based on history
min_value = train["Undernourishment (%)"].min()
predicted["yhat"] = predicted["yhat"].clip(lower=min_value)

# Rename for consistency
brazil_prophet_forecast = predicted.rename(columns={"yhat": "Prophet"})

# Display forecast summary
print("\nBrazil Prophet forecast summary:")
print(f"2016 forecast: {brazil_prophet_forecast['Prophet'].values[0]:.2f}%")
print(f"2020 forecast: {brazil_prophet_forecast['Prophet'].values[4]:.2f}%")
print(f"2025 forecast: {brazil_prophet_forecast['Prophet'].values[-1]:.2f}%")

11:38:26 - cmdstanpy - INFO - Chain [1] start processing


Brazil training data:
Number of years: 15
Year range: 2001 to 2015
Undernourishment range: 2.50% to 10.40%
Average: 5.13%


11:38:26 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(



Brazil Prophet forecast summary:
2016 forecast: 2.50%
2020 forecast: 2.50%
2025 forecast: 2.50%


In [14]:
# Prepare evaluation data (2016-2022)
eval_df = df[(df["Country"] == "Brazil") & df["Year"].between(2016, 2022)].copy()
eval_df["Undernourishment (%)"] = eval_df["Undernourishment (%)"].replace(r"<", "", regex=True).astype(float)

# Add Prophet predictions
eval_df = eval_df.merge(
    brazil_prophet_forecast[["Year", "Prophet"]],
    on="Year",
    how="left"
)

# Add naive baseline prediction (last observed value)
last_value = train["Undernourishment (%)"].iloc[-1]
eval_df["Baseline"] = last_value

# Calculate metrics
y_true = eval_df["Undernourishment (%)"]
y_pred = eval_df["Prophet"]
y_baseline = eval_df["Baseline"]

# Prophet metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Baseline metrics
baseline_mae = mean_absolute_error(y_true, y_baseline)
baseline_rmse = mean_squared_error(y_true, y_baseline, squared=False)

# Print Prophet metrics
print("Brazil - Prophet Evaluation Metrics")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Print baseline comparison
print(f"\nBaseline model (last value) metrics:")
print(f"MAE: {baseline_mae:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Create comparison table
results = pd.DataFrame({
    "Year": eval_df["Year"],
    "Actual": eval_df["Undernourishment (%)"],
    "Prophet": eval_df["Prophet"],
    "Baseline": eval_df["Baseline"],
    "Error": eval_df["Undernourishment (%)"] - eval_df["Prophet"]
})

print("\nYear-by-year comparison:")
print(results[["Year", "Actual", "Prophet", "Error"]])

Brazil - Prophet Evaluation Metrics
MAE: 0.5714
RMSE: 0.8992
R²: -0.6774

Baseline model (last value) metrics:
MAE: 0.5714
RMSE: 0.8992

Year-by-year comparison:
   Year  Actual  Prophet  Error
0  2016     2.5      2.5    0.0
1  2017     2.5      2.5    0.0
2  2018     2.5      2.5    0.0
3  2019     2.5      2.5    0.0
4  2020     3.4      2.5    0.9
5  2021     4.2      2.5    1.7
6  2022     3.9      2.5    1.4




In [16]:
import pandas as pd
# Load partial file
df = pd.read_csv("all_model_predictions_partial.csv")

# Filter and split per country
nigeria = df[df["Country"] == "Nigeria"].copy()
india = df[df["Country"] == "India"].copy()
brazil = df[df["Country"] == "Brazil"].copy()

# Merge SARIMA forecasts
nigeria = pd.merge(nigeria, nigeria_sarima_forecast, on="Year", how="left")
india = pd.merge(india, india_sarima_forecast, on="Year", how="left")
brazil = pd.merge(brazil, brazil_sarima_forecast, on="Year", how="left")

# Merge Prophet forecasts
nigeria = pd.merge(nigeria, nigeria_prophet_forecast, on="Year", how="left")
india = pd.merge(india, india_prophet_forecast, on="Year", how="left")
brazil = pd.merge(brazil, brazil_prophet_forecast, on="Year", how="left")

# Combine everything
final_df = pd.concat([nigeria, india, brazil], ignore_index=True)

# Clean up column names to ensure consistency
for col in final_df.columns:
    if 'sarima' in col.lower():
        final_df.rename(columns={col: "SARIMA"}, inplace=True)
    elif 'prophet' in col.lower():
        final_df.rename(columns={col: "Prophet"}, inplace=True)

# Save final CSV
final_df.to_csv("all_model_predictions_final.csv", index=False)