In [None]:
import os
from google.cloud import bigquery
from get_bq_data import get_bq_data
from model_trainer_2 import get_predictions
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Set the service-account key path before the client is built: bigquery.Client()
# is constructed without explicit credentials. Because this runs after
# load_dotenv(), it overrides any value for this key that came from .env.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'service_keys.json'})

# Shared BigQuery client used by the data-loading cell below.
client = bigquery.Client()

In [2]:
def aggregate_data():
    """Fetch the ``county_market`` dataset through ``get_bq_data``.

    Uses the module-level BigQuery ``client``. No aggregation or upload is
    performed here; the result of ``get_bq_data`` is returned unchanged.

    Returns:
        Whatever ``get_bq_data(client, "county_market")`` returns (the notebook
        treats it as a pandas DataFrame).
    """
    return get_bq_data(client, "county_market")

In [4]:
# Load the county-level data and put it in chronological order, then by state
# and county, with a fresh 0..n-1 index. Later cells take the last row per
# county, which relies on this ordering.
df = (
    aggregate_data()
    .sort_values(by=['year', 'month', 'state_num', 'county_num'])
    .reset_index(drop=True)
)
df

Unnamed: 0,year,month,county_num,state_num,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,...,median_square_feet_yy,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy
0,2017,7,1,1,186500,0.166354,0.0005,227,16.461538,-0.2305,...,-0.0062,210063,0.131945,-0.0152,291,21.384615,-0.1961,0.2731,169.687500,0.0392
1,2017,7,2,1,317900,3.607246,0.0258,2606,371.285700,-0.1088,...,0.0248,432296,1.974603,0.0658,2698,384.428560,-0.0773,0.0361,4.308824,0.0000
2,2017,7,3,1,179900,0.149521,-0.0380,141,-0.790801,-0.0140,...,-0.0692,215424,0.086151,-0.0624,153,-0.773333,-0.0613,0.0851,55.733334,-0.0408
3,2017,7,4,1,132450,0.068145,-0.1773,44,4.500000,-0.3803,...,-0.1440,155139,-0.007828,-0.2019,49,4.444445,-0.3378,0.1136,-0.091200,0.0714
4,2017,7,5,1,162500,-0.439655,0.0841,214,-0.842299,-0.2161,...,0.0376,210885,-0.429459,0.1254,255,-0.857223,-0.1748,0.1729,-0.449188,0.0447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295906,2025,9,2985,51,345000,-0.028200,0.1129,130,0.126100,0.4885,...,0.0031,381188,-0.033000,0.0494,130,0.126100,0.4885,0.2760,0.116700,0.1316
295907,2025,9,2986,51,3478750,0.050200,0.0091,265,-0.056900,0.1991,...,0.0133,6682801,0.036100,0.0373,318,-0.006300,0.1648,0.2094,0.070600,-0.0417
295908,2025,9,2987,51,410000,0.018800,-0.0108,70,-0.054100,0.4583,...,-0.0847,481127,-0.022000,-0.0605,72,-0.027000,0.5000,0.0286,0.070600,0.0077
295909,2025,9,2988,51,238175,-0.114600,-0.1394,30,0.000000,0.0169,...,-0.1729,275345,-0.045000,-0.2328,33,-0.015200,-0.0441,0.1667,0.000000,-0.0367


In [None]:
from preprocessing_4 import preprocess_data_4

# Build the frame that will receive the forecasts. preprocess_data_4 is handed
# a copy of `df`, so it works on its own frame rather than on `df` itself.
# NOTE(review): the meaning of the first argument (25) is not visible in this
# notebook — confirm against preprocessing_4 and consider naming it.
target_df = preprocess_data_4(25, df.copy())
# Separate copy that train_model fills in, leaving target_df untouched.
prediction_df = target_df.copy()

In [None]:
def train_model(df, prediction_df):
    """Fill ``prediction_df`` with forecasts for each target feature.

    For every target feature, ``get_predictions(df, feature, 24)`` is called
    and its result is stored in the column of the same name. Once all
    forecasts are in place, those columns are cast to ``int``.

    Args:
        df: Historical data passed straight through to ``get_predictions``.
        prediction_df: Frame that receives the forecast columns. It is
            modified in place.

    Returns:
        The same ``prediction_df`` object that was passed in.

    Note:
        NOTE(review): the third argument to ``get_predictions`` (24) is not
        explained in this notebook — confirm its meaning in model_trainer_2.
    """
    features = ("median_listing_price", "median_days_on_market")

    # Pass 1: produce and store the forecast for every target.
    for name in features:
        prediction_df[name] = get_predictions(df, name, 24)

    # Pass 2: integer cast, limited to targets that exist as columns (each one
    # was just assigned above).
    castable = [name for name in features if name in prediction_df.columns]
    for name in castable:
        prediction_df[name] = prediction_df[name].astype(int)

    return prediction_df

In [None]:
# Generate the forecasts. train_model returns the very object it was given, so
# `preds` and `prediction_df` are two names for the same frame.
preds = train_model(df, prediction_df)

In [None]:
# Snapshot of the raw forecasts, taken before the post-processing cell below
# modifies `preds` in place.
preds_2 = preds.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [None]:
# Most recent observed row for every county. The sort is done here explicitly
# so that "last row per county" means "latest month" regardless of how `df`
# happens to be ordered when this cell runs.
latest_df = (
    df.sort_values(by=['year', 'month', 'state_num', 'county_num'])
    .groupby("county_num")
    .tail(1)
    .reset_index(drop=True)
)

# Forecast horizon in months, counted from the 2025-09 reference month.
# NOTE(review): 2025/9 matches the last month shown in the loaded data; it
# must be updated by hand when the data is refreshed.
# These two statements modify the frame in place (the same object as
# `prediction_df`), so this cell can run only once per train_model() call.
preds['year_month'] = (preds['year'] - 2025) * 12 + preds['month'] - 9
preds.drop(columns=['year', 'month'], inplace=True)

cols_order = ['year_month', 'county_num', 'state_num', 'median_listing_price', 'median_days_on_market']

# Keep only the 3-, 6- and 12-month horizons. The explicit .copy() makes the
# result an independent frame, so the assignments below do not write into a
# slice of the original.
preds = preds.loc[preds['year_month'].isin([3, 6, 12]), cols_order].copy()

# The price column was cast to int in train_model; convert it to float before
# scaling so fractional results are not written into an integer column.
preds['median_listing_price'] = preds['median_listing_price'].astype(float)

# Horizon-specific uplift applied to the forecast price.
# NOTE(review): the 1.15 (12-month) and 1.1 (6-month) factors are unexplained
# in this notebook — confirm their origin.
for horizon, factor in {12: 1.15, 6: 1.1}.items():
    at_horizon = preds['year_month'] == horizon
    preds.loc[at_horizon, 'median_listing_price'] *= factor

# Percentage change of each forecast relative to the same county's latest
# observed price. The lookup is keyed by county_num: `latest_df` holds one row
# per county, and looking up by state alone would compare every county in a
# state against whichever county happens to come first. Counties with no
# latest observation get NaN.
current_price_by_county = latest_df.set_index('county_num')['median_listing_price']
current_price = preds['county_num'].map(current_price_by_county)
preds['appreciation'] = (
    (preds['median_listing_price'] - current_price) / current_price
) * 100

def calculate_volatility(df, window=12):
    """Per-state volatility of month-over-month listing-price changes.

    The input holds one row per county per month, so each state's rows are
    first collapsed to a single value per (year, month): the median of
    ``median_listing_price_mm`` across that state's counties. Volatility is
    the population standard deviation (``np.std``) of the most recent
    ``window`` monthly values, multiplied by 100.

    Taking the last rows of the un-collapsed data would pick up many counties
    from the final month or two rather than ``window`` distinct months.

    Args:
        df: Frame with ``state_num``, ``year``, ``month`` and
            ``median_listing_price_mm`` columns.
        window: Number of most recent months to include (default 12).

    Returns:
        dict mapping each value of ``state_num`` to its volatility, or to
        ``np.nan`` when the state has fewer than ``window`` distinct months.

    Note:
        NOTE(review): the median is used as the state-level monthly figure
        because it is robust to outlying counties; confirm that it is
        preferred over the mean.
    """
    volatility = {}
    for state in df['state_num'].unique():
        state_rows = df[df['state_num'] == state]
        # One value per calendar month; sort_index() puts the (year, month)
        # keys in chronological order.
        monthly_mm = (
            state_rows.groupby(['year', 'month'])['median_listing_price_mm']
            .median()
            .sort_index()
        )
        if len(monthly_mm) >= window:
            volatility[state] = np.std(monthly_mm.tail(window).to_numpy()) * 100
        else:
            volatility[state] = np.nan  # Not enough months to calculate volatility
    return volatility

# Volatility lookup keyed by state_num.
volatility_data = calculate_volatility(df)

# Attach each row's state-level volatility with a single vectorised lookup.
# States missing from the mapping come out as NaN.
preds['volatility'] = preds['state_num'].map(volatility_data)

def normalize_days_on_market(preds):
    """Return a copy of ``preds`` with a ``liquidity`` score column added.

    The score is the reciprocal of ``median_days_on_market`` (with 1 used
    where the value is 0), rescaled by a ``MinMaxScaler`` with default
    settings and multiplied by 100.

    Args:
        preds: Frame with ``year_month`` and ``median_days_on_market`` columns.
            It is not modified.

    Returns:
        A new frame equal to ``preds`` plus the ``liquidity`` column.

    Note:
        NOTE(review): the reciprocal is computed element by element and the
        scaler is fitted on all rows together, so the grouping by
        ``year_month`` does not change the values. If per-horizon scaling was
        intended, the scaler would have to be fitted per group.
    """
    def _reciprocal(days):
        # 1 / days, with 1 substituted wherever days == 0.
        return np.where(days != 0, 1 / days, 1)

    inverse_dom = (
        preds.groupby('year_month')['median_days_on_market']
        .transform(_reciprocal)
    )

    result = preds.copy()
    result['liquidity'] = MinMaxScaler().fit_transform(inverse_dom.to_frame()) * 100
    return result

# Rebind `preds` to the returned copy, which carries the new 'liquidity' column.
preds = normalize_days_on_market(preds)