In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import duckdb
import csv
import os
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
models_path = '/content/drive/MyDrive/ADSDB/colabs-part2/Model Generation/models.csv'        # Path where models are saved.
input_path = '/content/drive/MyDrive/ADSDB/colabs-part2/Model Execution/input.txt'
output_path = '/content/drive/MyDrive/ADSDB/colabs-part2/Model Execution/report.txt'

# Save and Load functions

In [4]:
def save_model(p, d, q, mae, mape, rmse, train_range, predict_range, path, mode):
  model_name = 'Arima' + '-' + str(mode) + '(' + str(p) + ',' + str(d) + ',' + str(q) + ') ' + str(train_range) + str(predict_range)

  if not os.path.exists(path):
        with open(path, mode='w', newline='') as new_csv_file:
            fieldnames = ["model_name", "p", "d", "q", "mae", "mape", "rmse", "train_range", "predict_range", "mode"]
            writer = csv.DictWriter(new_csv_file, fieldnames=fieldnames)
            writer.writeheader()

  # Check if the entry already exists in the CSV file
  entry_exists = False
  with open(path, mode='r', newline='') as csv_file:
      reader = csv.DictReader(csv_file)
      for row in reader:
          if row["model_name"] == model_name:
              entry_exists = True
              break

   # If the entry doesn't exist, add it to the CSV file
  if not entry_exists:
    with open(path, mode='a', newline='') as csv_file:
      fieldnames = ["model_name", "p", "d", "q", "mae", "mape", "rmse", "train_range", "predict_range", "mode"]
      writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
      # Check if the file is empty, and write the header if it is
      if csv_file.tell() == 0:
        writer.writeheader()
        # Write model information to the CSV file
      writer.writerow({
          "model_name": model_name,
          #"seed": seed,
          "p": p,
          "d": d,
          "q": q,
          "mae": mae,
          "mape": mape,
          "rmse": rmse,
          "train_range": train_range,
          "predict_range": predict_range,
          "mode": mode
          })
    print(f"Model and information saved to {path}")
  else:
    print(f"Model information already exists in {path}, not adding a duplicate entry.")


def load_model(path, model_name):
  try:
      with open(path, mode='r') as csv_file:
          reader = csv.DictReader(csv_file)

          for row in reader:
              if row["model_name"] == model_name:
                  return {
                      "p": int(row["p"]),
                      "d": int(row["d"]),
                      "q": int(row["q"]),
                      #"seed": np.uint32(int(row["seed"])),
                      "mae": float(row["mae"]),
                      "mape": float(row["mape"]),
                      "rmse": float(row["rmse"]),
                      "train_range": str(row["train_range"]),
                      "predict_range": str(row["predict_range"]),
                      "mode": row["mode"]
                  }

      print(f"Model with name '{model_name}' not found in {path}")
      return None
  except FileNotFoundError:
        print(f"File {path} not found.")
        return None

# Retrieve information

In [5]:
with open(input_path, 'r') as file:
    file_content = file.read()

In [6]:
import re

match = re.search(r'starting = (\d{4}Q[1-4])\nending = (\d{4}Q[1-4])', file_content)

if match:
    starting_quarter = match.group(1)
    ending_quarter = match.group(2)
    print(f"Starting Quarter: {starting_quarter}")
    print(f"Ending Quarter: {ending_quarter}")
else:
    print("Pattern not found in the file.")

Starting Quarter: 2022Q1
Ending Quarter: 2022Q4


In [7]:
conn_input = duckdb.connect(database='/content/drive/MyDrive/ADSDB/colabs-part2/Data Preparation/prepared_data.db', read_only=True)
df = conn_input.execute("SELECT * FROM prepared_data").df()

# Load Model

In [8]:
#load model
model_name = 'Arima-auto(1,0,4) 2008Q1-2021Q42022Q1-2022Q4'
load = load_model(models_path,model_name)
p,d,q = load["p"], load["d"], load["q"]
print(p,d,q)


1 0 4


# Execute

In [9]:
def calculate_quarters_between(start_date, end_date):
    quarters = {'Q1': 1, 'Q2': 2, 'Q3': 3, 'Q4': 4}

    start_year, start_quarter = int(start_date[:4]), start_date[4:]
    end_year, end_quarter = int(end_date[:4]), end_date[4:]

    quarters_between = (end_year - start_year) * 4 + quarters[end_quarter] - quarters[start_quarter] + 1

    return quarters_between

# Example usage
#start_date = '2022Q1'
#end_date = '2022Q4'

#result = calculate_quarters_between(start_date, end_date)
#print(f"The number of quarters between {start_date} and {end_date} is: {result}")

In [10]:
n_predictions = calculate_quarters_between(starting_quarter, ending_quarter)
print(n_predictions)

4


In [11]:
def is_before_2022Q4(date_string):
    # Extract year and quarter from the input string
    year = int(date_string[:4])
    quarter = date_string[-1]

    # Map quarter strings to corresponding numerical values
    quarter_mapping = {'Q1': 1, 'Q2': 2, 'Q3': 3, 'Q4': 4}

    # Convert the quarter string to a numerical value
    quarter_numeric = quarter_mapping.get(quarter, -1)

    # Check if the date is before 2022Q4
    return (year < 2022) or (year == 2022 and quarter_numeric < 4)

from datetime import datetime

def calculate_quarter_difference(input_quarter):
    # Convert the input quarter to a datetime object
    input_date = datetime.strptime(input_quarter, '%YQ%m')

    # Define the reference date for 2022Q4
    reference_date = datetime(2022, 12, 31)

    # Calculate the difference in quarters
    quarter_difference = (reference_date.year - input_date.year) * 4 + (reference_date.month - input_date.month) // 3

    return quarter_difference + 1

n = calculate_quarter_difference(starting_quarter)
print(n)

if (is_before_2022Q4(starting_quarter)):
    df = df.iloc[n:]
else:
    print("after")


4


In [12]:
df.head()

Unnamed: 0,Quarter,house_price_index
4,2021-10-01,135.291
5,2021-07-01,133.652
6,2021-04-01,130.937
7,2021-01-01,127.831
8,2020-10-01,127.179


In [13]:
if 'Quarter' in df.index.names:
    print("'Quarter' is already the index.")
else:
    df.set_index('Quarter', inplace=True)

In [14]:
model = ARIMA(df, order=(p, d, q))
model_fit = model.fit()
forecast = model_fit.forecast(n_predictions)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(


In [15]:
print(forecast)

56    151.700491
57    151.890638
58    151.053561
59    149.821118
Name: predicted_mean, dtype: float64


In [22]:
predicted_range = starting_quarter + '-' + ending_quarter

# Visualize

In [None]:
#total_set.plot()

# Report

In [None]:
# Write the report to a file

# Create the report text
report_text = f"Model Used: {model_name}\n"
report_text += f"Predicted Range: {predicted_range}\n"
report_text += f"Predicted Values: {forecast}\n"

with open(output_path, 'w') as file:
  file.write(report_text)
