<a href="https://colab.research.google.com/github/VictorSylva/kaggle/blob/main/Monthly_food_price_inflation_estimates_by_country.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap cell: run it first so the Kaggle data source used by this notebook
# is fetched. Once the data has been imported the cell may be deleted.
# Note: this environment is not Kaggle's Python environment, so libraries the
# notebook relies on may be missing here.
import kagglehub

dataset_handle = 'harshalhonde/monthly-food-price-inflation-estimates-by-country'
harshalhonde_monthly_food_price_inflation_estimates_by_country_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


# **IMPORT OUR LIBRARIES**

In [None]:
# Let's import the libraries that we'll use
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

In [None]:
# Load our data.
# On Kaggle the dataset is mounted under /kaggle/input. Anywhere else (e.g. Colab)
# that directory does not exist, so fall back to the location returned by the
# kagglehub download in the first cell.
CSV_NAME = 'WLD_RTFP_country_2023-07-31.csv'
kaggle_input_dir = Path('/kaggle/input/monthly-food-price-inflation-estimates-by-country')

if kaggle_input_dir.is_dir():
    data_dir = kaggle_input_dir
else:
    data_dir = Path(harshalhonde_monthly_food_price_inflation_estimates_by_country_path)

inflation = pd.read_csv(data_dir / CSV_NAME)

In [None]:
# Display the loaded DataFrame as the cell output to get a first look at its rows and columns

inflation

**As you can see, our data contains 8 columns, so we need to decide which of them we will use.**

In [None]:
# Count the missing values in every column of the raw data

inflation.isna().sum()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple

inflation.shape

In [None]:
# Print the column names, non-null counts and dtypes of the dataset

inflation.info()

In [None]:
# Correlation heatmap of the numeric columns, drawn with seaborn.
# Selecting columns by dtype (instead of dropping 'country', 'ISO3' and 'date'
# by name) keeps this cell re-runnable after 'date' has been moved into the
# index, and guarantees that only numeric data reaches .corr().
inflation_numeric = inflation.select_dtypes(include='number')

fig, ax = plt.subplots()
sns.heatmap(inflation_numeric.corr(), cmap='crest', linewidth=.5, annot=True, square=True, ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

# ANALYZING, VISUALIZATION AND DATA PREPROCESSING

In [None]:
# Start with the missing values: list how many each column has before cleaning and plotting

inflation.isna().sum()

In [None]:
# Fill the missing Open, Close, High and Low values with each column's most
# frequent value (its mode). This is a deliberately simple imputation; other
# strategies can be swapped in here.
for column in ['Open', 'Close', 'High', 'Low']:
    column_modes = inflation[column].mode()
    if column_modes.empty:
        # The column holds no non-null value, so there is no mode to fill with.
        continue
    # mode() can return several values; the first one is used as the fill value.
    inflation[column] = inflation[column].fillna(value=column_modes.iloc[0])

In [None]:
# A basic imputation is enough here because the dataset has few columns (more advanced methods are possible).
# Re-count the missing values per column to see what is still left to handle.
inflation.isna().sum()


In [None]:
# Convert the 'date' column from object dtype to datetime values

inflation = inflation.assign(date=pd.to_datetime(inflation['date']))

In [None]:
# Use the date as the index because we are working with time-series data.
# Checking the current index name and reassigning (rather than inplace=True)
# keeps this cell safe to re-run: calling set_index('date') a second time would
# raise KeyError because the column has already been moved into the index.
if inflation.index.name != 'date':
    inflation = inflation.set_index('date')

In [None]:
# The remaining missing Inflation values are removed rather than filled, since
# imputing the modelling target could make the model worse.
# Every row that still contains any missing value is dropped.

inflation = inflation.dropna(axis='index', how='any')

In [None]:
# The data is clean now, so plot the four value columns over time for the whole dataset.

fig, ax = plt.subplots(figsize=(10, 6))
for column, color in [('Close', 'blue'), ('High', 'red'), ('Low', 'green'), ('Open', 'purple')]:
    ax.plot(inflation[column], label=f'{column} Value', color=color)
# The figure shows all four series, so the title names them all rather than only 'Close'.
ax.set_title('Time Series Plot of Open, High, Low and Close Values')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Summary statistics of the cleaned dataset
inflation.describe()

In [None]:
# Distinct country names, used below to iterate country by country
unique_countries = pd.unique(inflation['country'])

In [None]:
# Draw the Open/High/Low/Close time series of every country, one figure per country
price_styles = [
    ('Open', 'red'),
    ('High', 'green'),
    ('Low', 'blue'),
    ('Close', 'black'),
]

for country in unique_countries:
    country_data = inflation.loc[inflation['country'] == country]

    fig, ax = plt.subplots(figsize=(10, 6))
    for column, color in price_styles:
        ax.plot(country_data[column], label=f'{column} Price', color=color)

    ax.set_title(f'Time Series Plot of Price Variations - {country}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid(True)
    plt.show()










In [None]:
# Draw the inflation time series of every country, one figure per country
for country in unique_countries:
    country_data = inflation.loc[inflation['country'] == country]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(country_data['Inflation'], label='Inflation', color='red')

    ax.set_title(f'Time Series Plot of Inflation Variations - {country}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Inflation')
    ax.legend()
    ax.grid(True)
    plt.show()

# STATIONARITY TEST

In [None]:
def adf_test(series):
    """Return the p-value of the Augmented Dickey-Fuller test for ``series``.

    Missing values are removed before the test is run, because ``adfuller``
    does not accept input containing NaNs. For input without missing values
    the result is the same as calling ``adfuller`` on it directly.

    Parameters
    ----------
    series : array-like
        Observations of a single time series, in chronological order.

    Returns
    -------
    float
        The p-value reported by ``adfuller`` (second element of its result).
    """
    observations = pd.Series(series).dropna()
    result = adfuller(observations)
    # adfuller returns a tuple whose second element is the p-value.
    return result[1]

In [None]:
# Run the ADF stationarity test on each country's inflation series and print its p-value
for country in unique_countries:
    is_current_country = inflation['country'] == country
    country_data = inflation.loc[is_current_country]
    p_value = adf_test(country_data.loc[:, 'Inflation'])

    print(f"Country: {country}, p-value: {p_value:.4f}")


Now we can see that the inflation time series of some countries are stationary while others are not (a p-value below 0.05 means the series can be treated as stationary).

# TRAIN ARIMA AND ARMA MODEL

In [None]:
# Choose one country and extract its inflation series for modelling

country_to_train = 'Iraq'
is_selected_country = inflation['country'] == country_to_train
country_data = inflation.loc[is_selected_country, 'Inflation']

In [None]:
# Inspect the inflation series selected for modelling
country_data

In [None]:
# plot_acf and plot_pacf each open a new figure unless an Axes is passed, so
# create the figure explicitly and give each plot its own Axes (a bare
# plt.figure() call would stay empty and its figsize would be ignored).
# The PACF can only be computed for lags below half the sample size, so the
# requested 30 lags are capped accordingly.
max_lags = min(30, len(country_data) // 2 - 1)

fig, (ax_acf, ax_pacf) = plt.subplots(1, 2, figsize=(12, 6))
plot_acf(country_data, lags=max_lags, ax=ax_acf, title=f'ACF for {country_to_train} Inflation')
plot_pacf(country_data, lags=max_lags, ax=ax_pacf, title=f'PACF for {country_to_train} Inflation')
fig.tight_layout()
plt.show()

In [None]:
# Chronological 80/20 split: the first 80% of the observations train the model,
# the remaining ones are held out for evaluation.
train_size = int(0.8 * len(country_data))
train_data = country_data.iloc[:train_size]
test_data = country_data.iloc[train_size:]

In [None]:
# Fit an ARIMA model with order (2, 1, 1) on the training part of the series
model = ARIMA(endog=train_data, order=(2, 1, 1))
model_fit = model.fit()

In [None]:
# Forecast as many steps ahead as there are observations in the test set
predictions = model_fit.forecast(steps=len(test_data))

In [None]:
# Compare the forecasts with the held-out observations on the test set's date axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test_data.index, test_data, label='Actual Inflation', color='blue')
ax.plot(test_data.index, predictions, label='Predicted Inflation', color='red')
ax.set_title(f'Predicted vs. Actual Inflation for {country_to_train}')
ax.set_xlabel('Date')
ax.set_ylabel('Inflation')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Mean squared error between the held-out observations and the forecasts
mse = mean_squared_error(y_true=test_data, y_pred=predictions)
print(f"Mean Squared Error: {mse:.4f}")

In [None]:
forecast_steps = 12  # Number of steps to forecast

# model_fit has only seen the training slice, so model_fit.forecast() would start
# at the beginning of the test period rather than after the last observation.
# Refit the same ARIMA order on the full series to forecast beyond the known data.
full_model_fit = ARIMA(country_data, order=model_fit.model.order).fit()
forecast = full_model_fit.forecast(steps=forecast_steps)

In [None]:
# Show the historical inflation series together with the forecast values
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(country_data.index, country_data, label='Historical Inflation', color='blue')
ax.plot(forecast.index, forecast, label='Forecasted Inflation', color='red')
ax.set_title(f'Forecasted Inflation for {country_to_train}')
ax.set_xlabel('Date')
ax.set_ylabel('Inflation')
ax.legend()
ax.grid(True)
plt.show()

# CONCLUSION

OUR ARIMA MODEL DID NOT PERFORM VERY WELL: WE GET A MEAN SQUARED ERROR OF ABOUT 9, BUT IF WE LOOK AT THE GRAPH THE PREDICTION IS NOT THAT GOOD, SO WE HAVE TO TRY DIFFERENT PARAMETERS OR FINE-TUNE THE MODEL.