In [None]:
import os
from google.cloud import bigquery
from get_bq_data import get_bq_data
from model_trainer_2 import get_predictions
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Only fall back to the bundled key file when no credentials path has been
# configured already (shell environment or .env). An unconditional assignment
# here would silently override whatever the environment provided.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'service_keys.json')

# BigQuery client shared by the cells below (used by aggregate_data).
client = bigquery.Client()

In [None]:
def aggregate_data():
    """Fetch the ``county_market`` dataset through ``get_bq_data``.

    Uses the module-level BigQuery ``client``. No aggregation or upload is
    performed here; the helper's result is returned unchanged.

    Returns:
        Whatever ``get_bq_data`` returns for ``"county_market"`` (the rest of
        the notebook treats it as a pandas DataFrame).
    """
    table_name = "county_market"
    return get_bq_data(client, table_name)

In [None]:
# Load the history and put it in chronological order (then by state/county)
# with a clean 0..n-1 index; later cells rely on this ordering.
df = (
    aggregate_data()
    .sort_values(by=['year', 'month', 'state_num', 'county_num'])
    .reset_index(drop=True)
)
df

In [None]:
from preprocessing_4 import preprocess_data_4

# Build the frame that will receive the forecasts. A copy of `df` is handed to
# the helper so the sorted history above is not shared with it.
# NOTE(review): the meaning of the first argument (25) is defined inside
# preprocessing_4 and is not visible here — presumably a horizon/row count; confirm.
target_df = preprocess_data_4(25, df.copy())
# Separate copy that is later passed to train_model to be filled with predictions.
prediction_df = target_df.copy()

In [None]:
def train_model(df, prediction_df, features=None, horizon=24, predict_fn=None):
    """Fill a copy of ``prediction_df`` with integer forecasts for each feature.

    For every feature the forecaster is called as
    ``predict_fn(df, feature, horizon)`` and its result is stored in the column
    of the same name, then cast to ``int`` (truncating toward zero).

    The input ``prediction_df`` is no longer modified; callers must use the
    returned frame.

    Args:
        df: Historical data handed to the forecaster unchanged.
        prediction_df: Frame whose rows line up with the forecaster's output.
        features: Column names to forecast. Defaults to
            ``["median_listing_price", "median_days_on_market"]``.
        horizon: Third positional argument passed to the forecaster (24 by
            default). NOTE(review): presumably the number of periods to
            predict — confirm against ``get_predictions``.
        predict_fn: Callable ``(df, feature, horizon) -> values``. Defaults to
            the imported ``get_predictions``.

    Returns:
        A new DataFrame: a copy of ``prediction_df`` with one integer column
        per forecast feature.

    Raises:
        Whatever ``astype(int)`` raises when a forecast cannot be represented
        as an integer (e.g. it contains NaN or infinity).
    """
    if features is None:
        features = ["median_listing_price", "median_days_on_market"]
    if predict_fn is None:
        predict_fn = get_predictions

    # Work on a copy so the caller's frame is left untouched.
    result = prediction_df.copy()

    for feature in features:
        result[feature] = predict_fn(df, feature, horizon)
        # The column was assigned on the line above, so no existence check is needed.
        result[feature] = result[feature].astype(int)

    return result

In [None]:
# Run the forecasts for the price and days-on-market features; the result is
# the prediction frame with those columns filled in.
preds = train_model(df, prediction_df)

In [None]:
# Keep an untouched snapshot of the raw forecasts; the following cells
# reshape and overwrite `preds`.
preds_2 = preds.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [None]:
# --- Forecast horizons and price appreciation --------------------------------
# NOTE(review): hard-coded reference month used to turn (year, month) into
# "months ahead" — presumably the last observed month; update on data refresh.
BASE_YEAR, BASE_MONTH = 2025, 9
HORIZON_MONTHS = [3, 6, 12]
# NOTE(review): manual multipliers applied to the forecast prices at the 12-
# and 6-month horizons; their rationale is not visible in this notebook.
PRICE_MULTIPLIERS = {12: 1.15, 6: 1.1}

# Last row per county; `df` was sorted chronologically when it was loaded.
latest_df = df.groupby("county_num").tail(1).reset_index(drop=True)

# Start from the untouched forecast snapshot so this cell can be re-run
# without failing on the already-dropped year/month columns.
preds = preds_2.assign(
    year_month=(preds_2['year'] - BASE_YEAR) * 12 + preds_2['month'] - BASE_MONTH
)
cols_order = ['year_month', 'county_num', 'state_num', 'median_listing_price', 'median_days_on_market']
# Explicit copy: the frame is modified below and must not be a view of a slice.
preds = preds.loc[preds['year_month'].isin(HORIZON_MONTHS), cols_order].copy()

# Prices arrive as integers; cast first so the fractional multipliers are not
# written into an int column (deprecated upcasting in recent pandas).
preds['median_listing_price'] = preds['median_listing_price'].astype(float)
for months_ahead, multiplier in PRICE_MULTIPLIERS.items():
    at_horizon = preds['year_month'] == months_ahead
    preds.loc[at_horizon, 'median_listing_price'] *= multiplier

# Percentage change from each county's latest observed price to the forecast.
# Counties missing from `latest_df` get NaN instead of raising IndexError.
current_price = preds['county_num'].map(
    latest_df.set_index('county_num')['median_listing_price']
)
preds['appreciation'] = ((preds['median_listing_price'] - current_price) / current_price) * 100

def calculate_volatility(df, window=12, column='median_listing_price_mm'):
    """Compute a per-county volatility figure from the most recent observations.

    Rows are ordered by ``year`` then ``month``; for every county the
    population standard deviation (``np.std``, ddof=0) of the last ``window``
    values of ``column`` is multiplied by 100.

    Args:
        df: Frame with ``county_num``, ``year``, ``month`` and ``column``.
        window: Number of trailing observations per county (default 12).
        column: Column whose dispersion is measured. NOTE(review): the default
            is presumably a month-over-month price change — confirm upstream.

    Returns:
        dict mapping each county number to its volatility, or ``np.nan`` when
        the county has fewer than ``window`` rows. Rows whose ``county_num`` is
        missing are skipped.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Sort once and split with groupby rather than re-filtering and re-sorting
    # the whole frame for every county.
    ordered = df.sort_values(by=['year', 'month'])

    volatility = {}
    for county, values in ordered.groupby('county_num', sort=False)[column]:
        if len(values) >= window:
            volatility[county] = np.std(values.to_numpy()[-window:]) * 100
        else:
            volatility[county] = np.nan  # Not enough data to calculate volatility
    return volatility

# Per-county volatility computed from the historical frame.
volatility_data = calculate_volatility(df)

# Attach each county's volatility to its forecast rows in one vectorized
# lookup; counties absent from the mapping receive NaN.
preds['volatility'] = preds['county_num'].map(volatility_data).astype(float)

def normalize_days_on_market(preds):
    """Add a ``liquidity`` column (0-100) derived from ``median_days_on_market``.

    Liquidity is the reciprocal of days on market (a value of 0 days is
    treated as reciprocal 1), min-max scaled with ``MinMaxScaler`` and
    multiplied by 100.

    NOTE(review): the scaler is fitted on all rows at once, so the scale is
    shared across every ``year_month`` horizon. The earlier per-horizon
    ``groupby`` only wrapped an elementwise reciprocal and had no effect on the
    scaling, so it has been removed. Confirm whether per-horizon scaling was
    intended.

    Args:
        preds: Frame containing ``median_days_on_market``. It is not modified.

    Returns:
        A copy of ``preds`` with the extra ``liquidity`` column. An empty input
        yields an empty ``liquidity`` column instead of an error from the
        scaler.
    """
    result = preds.copy()
    days = result['median_days_on_market']

    # MinMaxScaler rejects zero-sample input, so handle the empty frame here.
    if days.empty:
        result['liquidity'] = np.array([], dtype=float)
        return result

    # Fewer days on market -> larger reciprocal -> higher liquidity.
    inverse_days = np.where(days != 0, 1 / days, 1)

    scaler = MinMaxScaler()
    # The scaler expects and returns an (n, 1) array; flatten it for the column.
    scaled = scaler.fit_transform(inverse_days.reshape(-1, 1))
    result['liquidity'] = scaled.ravel() * 100
    return result

# Weights of the composite score; volatility counts against a county.
APPRECIATION_WEIGHT, LIQUIDITY_WEIGHT, VOLATILITY_WEIGHT = 0.4, 0.3, 0.3


def _scale_to_0_100(values):
    """Min-max scale a Series to 0-100; a group with no spread maps to 0."""
    low = values.min()
    span = values.max() - low
    if span == 0:
        # A single row or identical scores would otherwise produce 0/0 -> NaN.
        return values * 0.0
    return (values - low) / span * 100


preds = normalize_days_on_market(preds)
# The raw forecast columns are no longer needed once the derived metrics exist.
preds = preds.drop(columns=['median_listing_price', 'median_days_on_market'])

# NOTE(review): "IOI" is presumably an investment-opportunity index — confirm naming.
preds['IOI'] = (
    APPRECIATION_WEIGHT * preds['appreciation']
    + LIQUIDITY_WEIGHT * preds['liquidity']
    - VOLATILITY_WEIGHT * preds['volatility']
)
# Rescale within each horizon so scores are comparable among counties at that horizon.
preds['IOI'] = preds.groupby('year_month')['IOI'].transform(_scale_to_0_100)
preds

In [None]:
# Write the final per-county table to a CSV in the working directory,
# omitting the DataFrame index.
preds.to_csv("county_investment_insights.csv", index=False)