In [None]:
import sys
import os

from pyspark.sql import SparkSession, DataFrame

from pathlib import Path

# Make the project root importable so that packages living next to the
# notebook directory (e.g. `data`) resolve regardless of how the code is run.
cwd_parent = str(Path().resolve().parent)
if cwd_parent not in sys.path:
    sys.path.insert(0, cwd_parent)

try:
    # `__file__` exists only when this code runs as a script or module.
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Notebook kernels do not define `__file__`; use the working directory.
    current_dir = str(Path().resolve())

parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Normalised form of the project root: forward slashes plus a trailing "/".
parent_dir = parent_dir.replace("\\", "/") + "/"


# Obtain the Spark session for the "ukrainian_prices" application.
spark = (
    SparkSession.builder
    .appName("ukrainian_prices")
    .getOrCreate()
)

In [None]:
import pandas as pd 
import numpy as np

from data.schema.ukraine_price_dataset import UKRAINE_PRICE_SCHEMA, UKRAINE_PRICE_COLUMNS

In [None]:
from bokeh.plotting import figure, show
from bokeh.models import Span
from bokeh.io import output_notebook
from bokeh.models import Legend
from bokeh.layouts import column
from bokeh.models import ColumnDataSource

from streamlit import bokeh_chart

# Configure Bokeh so that figures passed to `show()` render inline in notebook cells.
output_notebook()


def visualize_predictions(real_data, train_predictions, test_predictions, title='Predictions'):
    """Plot observed values next to two prediction series and hand the chart to Streamlit.

    Args:
        real_data: Frame with 'date' and 'price' columns (the observed series).
        train_predictions: Frame with 'date' and 'mean' columns. If it also has
            both 'lower' and 'upper' columns, a shaded band is drawn between them.
        test_predictions: Same layout as ``train_predictions``.
        title: Title placed on the figure.

    Returns:
        None. The figure is emitted through ``bokeh_chart``.
    """
    chart = figure(title=title, x_axis_label='Час', x_axis_type='datetime',
                   y_axis_label='Значення', width=800, height=400)

    # Observed series.
    chart.line(x=real_data['date'], y=real_data['price'], line_width=2,
               color='red', legend_label='Real Data')

    # Prediction series: (frame, colour, legend label), drawn train first, then test.
    series_specs = (
        (train_predictions, 'blue', 'Реальні дані'),
        (test_predictions, 'orange', 'Передбачення'),
    )
    for frame, colour, label in series_specs:
        chart.line(x=frame['date'], y=frame['mean'], line_width=2,
                   color=colour, legend_label=label)
        available = frame.columns.to_list()
        if 'upper' not in available or 'lower' not in available:
            continue
        # Both bounds are present: shade the area between them.
        chart.varea(x=frame['date'], y1=frame['lower'], y2=frame['upper'],
                    color=colour, alpha=0.2)

    # Dashed vertical lines at the last date of the train and test frames.
    separators = [
        Span(location=frame['date'].max(), dimension='height',
             line_color=line_colour, line_dash='dashed', line_width=2)
        for frame, line_colour in ((train_predictions, 'green'),
                                   (test_predictions, 'red'))
    ]
    for separator in separators:
        chart.add_layout(separator)

    chart.legend.location = "top_left"
    chart.legend.title = "Дані"

    bokeh_chart(chart, use_container_width=True)

In [None]:
import streamlit as st
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from sarimax import initialize_sarimax_model, get_trained_sarimax_predictions, get_future_sarimax_predicitons
from chronos_predictions import get_chronos_future_predicitons
from chat_openai_predictions import get_openai_future_predicitons
from bokeh.plotting import figure, show
from bokeh.models import Span
from bokeh.io import output_notebook

# Initialize Spark Session (application name "ukrainian_prices").
spark = (
    SparkSession.builder
    .appName("ukrainian_prices")
    .getOrCreate()
)

# Define constants
# Maps each data-set label offered in the first selectbox to the base name of
# its CSV file; the selected value is passed to load_data, which interpolates
# it into the file path.
DATA_LOADERS = {
    "Середні ціни споживчих товарів": "average_consuming_goods_prices",
    "Базові індекси споживчих цін": "base_index_consuming_prices",
    "Індекси споживчих цін": "index_consuming_prices"
}

# Load data functions


def load_data(data_type):
    """Read one processed price CSV through the module-level Spark session.

    Args:
        data_type: Base file name (without ``.csv``) located under
            ``data/processed/splited_data/``.

    Returns:
        Whatever the Spark CSV reader's ``load`` returns for that file, read
        with the header option enabled and ``UKRAINE_PRICE_SCHEMA`` applied.
    """
    # Relative path: it is resolved against the current working directory.
    source_path = f"data/processed/splited_data/{data_type}.csv"

    reader = spark.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.schema(UKRAINE_PRICE_SCHEMA)
    return reader.load(source_path)

# Filtering functions


def filter_data(df, column, value):
    """Return the rows of ``df`` whose ``column`` equals ``value``.

    Args:
        df: Object exposing a ``filter`` method (a Spark DataFrame in this file).
        column: Name of the column to compare.
        value: Value the column must equal.
    """
    matches_value = col(column) == value
    return df.filter(matches_value)


def select_training_data_from_df(df):
    """Project a Spark frame onto a date-sorted pandas frame of ('date', 'price').

    The period column is parsed with the pattern ``yyyy-'M'MM`` and exposed as
    ``date``; the observation-value column is exposed as ``price``.

    Args:
        df: Spark DataFrame containing the period and observation-value columns
            named by ``UKRAINE_PRICE_COLUMNS``.

    Returns:
        The result of ``toPandas()`` sorted by ``date``.
    """
    date_expr = to_timestamp(
        col(UKRAINE_PRICE_COLUMNS.PERIOD), "yyyy-'M'MM").alias("date")
    price_expr = col(UKRAINE_PRICE_COLUMNS.OBSERVATION_VALUE).alias("price")

    history = df.select(date_expr, price_expr).toPandas()
    return history.sort_values("date")


# Page setup: select the wide layout. This is the first `st.` call in the
# script, ahead of every other Streamlit element.
st.set_page_config(layout="wide")  # Streamlit UI
# Inject a raw <style> block; unsafe_allow_html=True is what lets the HTML
# through instead of being shown as text.
# NOTE(review): nothing visible in this file assigns the `.streamlit-wide`
# class to an element - confirm this rule actually has an effect.
st.markdown(
    """
    <style>
    .streamlit-wide {
        max-width: 70%;
        margin: 0 auto;
    }
    </style>
    """,
    unsafe_allow_html=True
)
# Page heading.
st.title("Прогнозування споживчих цін України")

# Chronos checkpoints offered in the model selector and routed to the Chronos branch.
CHRONOS_MODELS = ["amazon/chronos-t5-large",
                  "amazon/chronos-t5-small", "amazon/chronos-t5-tiny"]
# Forecast length requested from every model.
FORECAST_HORIZON = 36


def _history_as_predictions(history):
    """Return a copy of the observed frame with its columns renamed to ('date', 'mean')."""
    frame = history.copy()
    frame.columns = ['date', 'mean']
    return frame


# The flow below is a cascade of selectboxes. Each one starts unselected
# (index=None), so every later step only runs once the previous choice is made.

# Step 1: Select data type
data_type = st.selectbox("Оберіть тип даних:", list(
    DATA_LOADERS.keys()), index=None)
if data_type:
    st.success(f"Ви обрали: {data_type}")

    # Step 2: Load and select product
    df = load_data(DATA_LOADERS[data_type])
    products = df.select(
        UKRAINE_PRICE_COLUMNS.TYPE_OF_GOODS_AND_SERVICES).distinct().toPandas()
    product_name = st.selectbox(
        "Оберіть товар:", products[UKRAINE_PRICE_COLUMNS.TYPE_OF_GOODS_AND_SERVICES], index=None)
    if product_name:
        st.success(f"Ви обрали: {product_name}")
        df = filter_data(
            df, UKRAINE_PRICE_COLUMNS.TYPE_OF_GOODS_AND_SERVICES, product_name)

        # Step 3: Select base period
        base_periods = df.select(
            UKRAINE_PRICE_COLUMNS.BASE_PERIOD).distinct().toPandas()
        base_period = st.selectbox(
            "Оберіть базисний період:", base_periods[UKRAINE_PRICE_COLUMNS.BASE_PERIOD], index=None)
        if base_period:
            st.success(f"Ви обрали: {base_period}")
            df = filter_data(
                df, UKRAINE_PRICE_COLUMNS.BASE_PERIOD, base_period)

            # Step 4: Select region
            regions = df.select(
                UKRAINE_PRICE_COLUMNS.TERRITORIAL_BREAKDOWN).distinct().toPandas()
            region = st.selectbox(
                "Оберіть регіон:", regions[UKRAINE_PRICE_COLUMNS.TERRITORIAL_BREAKDOWN], index=None)
            if region:
                st.success(f"Ви обрали: {region}")
                df = filter_data(
                    df, UKRAINE_PRICE_COLUMNS.TERRITORIAL_BREAKDOWN, region)

                # Step 5: Train model
                data = select_training_data_from_df(df)
                model_type = st.selectbox(
                    "Оберіть модель:", ["SARIMAX", 'OPENAI', *CHRONOS_MODELS], index=None)

                # The options are mutually exclusive, so at most one branch runs.
                if model_type == "SARIMAX":
                    sarimax = initialize_sarimax_model(data)
                    # Start of the in-sample window: 2 years and 6 months after
                    # the first observation. `months` (plural) ADDS six months;
                    # the singular `month=6` would instead force the calendar
                    # month to June.
                    fit_start = data['date'].min() + \
                        pd.DateOffset(years=2, months=6)
                    train_predictions = get_trained_sarimax_predictions(
                        data, sarimax, range=(fit_start, data['date'].max())
                    )
                    test_predictions = get_future_sarimax_predicitons(
                        data, sarimax, steps=FORECAST_HORIZON)

                    # Step 6: Visualize
                    visualize_predictions(
                        data, train_predictions, test_predictions)
                elif model_type in CHRONOS_MODELS:
                    test_predictions = get_chronos_future_predicitons(
                        data, model_name=model_type, months=FORECAST_HORIZON)
                    # No in-sample predictions here: the observed series is
                    # plotted in the "train" slot.
                    train_predictions = _history_as_predictions(data)

                    # Step 6: Visualize
                    visualize_predictions(
                        data, train_predictions, test_predictions)
                elif model_type == "OPENAI":
                    result_df, explanation = get_openai_future_predicitons(
                        data, data_type, region, product_name, FORECAST_HORIZON)

                    # As for Chronos, the observed series fills the "train" slot.
                    train_predictions = _history_as_predictions(data)

                    test_predictions = result_df

                    # Show the chart and the returned explanation together
                    # inside one assistant chat message.
                    with st.chat_message(name='assistant'):
                        visualize_predictions(
                            data, train_predictions, test_predictions)
                        st.write(explanation)