# Welcome to the Lab 🥼🧪

## Rental Mix Analysis

Why does rental housing mix matter? The types of units trading vary by market and can provide insight into where rental prices are going. This analysis looks at the rental mix of a market. The [Parcl Labs Rental Price Feed](https://www.parcllabs.com/articles/parcl-labs-rental-price-feed-white-paper) tracks the rental price per square foot of units trading in a market. Understanding the mix, and how it varies over time, can provide insight into where prices could go.

**Note** This notebook will work with any of the 70k+ markets supported by the Parcl Labs API.

As a reminder, you can get your Parcl Labs API key [here](https://dashboard.parcllabs.com/signup) to follow along. 

To run this immediately, you can use Google Colab. Remember, you must set your `PARCL_LABS_API_KEY` as a secret. See this [guide](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75) for more information.

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ParclLabs/parcllabs-examples/blob/main/python/traders/rental_mix_analysis.ipynb)

In [None]:
# Environment setup
import os
import sys
import subprocess
from datetime import datetime

# Colab setup (when launched from the "Open in Colab" badge above)
if "google.colab" in sys.modules:
    from google.colab import userdata
    %pip install parcllabs plotly kaleido numpy
    !git clone https://github.com/ParclLabs/parcllabs-examples.git
    # put the cloned repo on the import path (presumably so `charting.utils`, imported below, resolves)
    sys.path.append('/content/parcllabs-examples/python/')
    # the API key is read from the Colab secret store
    api_key = userdata.get('PARCL_LABS_API_KEY')
else:
    # local run: read the API key from the environment; os.getenv returns None if it is not set
    api_key = os.getenv('PARCL_LABS_API_KEY')
    cur_dir = os.getcwd()
    # add the parent of the working directory to the import path
    chart_dir = os.path.join(cur_dir, '..')
    sys.path.append(chart_dir)

In [None]:
import parcllabs
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from parcllabs import ParclLabsClient
from charting.utils import create_labs_logo_dict, format_metro_names

print(f"Parcl Labs Version: {parcllabs.__version__}")

In [None]:
# init client
client = ParclLabsClient(api_key=api_key)

In [None]:
# Search for up to 100 markets, sorted by the PRICEFEED_MARKET flag in descending order
markets = client.search_markets.retrieve(
    as_dataframe=True,
    # sort_by='PARCL_EXCHANGE_MARKET',
    sort_by='PRICEFEED_MARKET',
    sort_order='DESC',
    params={
        'limit': 100
    }
)

# Keep only the markets flagged as price feed markets. `.copy()` makes `markets` an independent
# frame, so assigning columns to it later does not trigger pandas' SettingWithCopyWarning.
markets = markets.loc[markets['pricefeed_market'] == 1].copy()

In [None]:
def clean_names(nme):
    """Return a short display label for a market name.

    Names that have a known abbreviation are replaced by it; any other
    value is returned unchanged.
    """
    replace = {
        'Washington City': 'D.C.',
        'United States Of America': 'USA',
        'New York City': 'NYC',
        'Kings County': 'Brooklyn, NY',
    }
    # dict.get does the lookup and the "leave as is" fallback in one step
    return replace.get(nme, nme)
    
# replace well-known market names with their short labels, then display the table
markets['name'] = markets['name'].apply(clean_names)
markets

In [None]:
# Earliest date requested for the series pulled in this notebook.
START_DATE = '2020-01-01'

# Rental price feed for every selected market. These are daily series, so the page size is
# raised to 1000 and pagination is followed automatically to get all the data.
# WARNING: ~6k credits can be used in one parcl price feed. Change START_DATE to a more
# recent date to reduce the number of credits used.
rentals = client.rental_price_feed.retrieve_many(
    parcl_ids=markets['parcl_id'].tolist(),
    start_date=START_DATE,
    params={'limit': 1000},
    auto_paginate=True,
    as_dataframe=True,
)

rentals.head()

In [None]:
# attach the display name to every rental row; the inner join keeps only rows whose parcl_id is in `markets`
rentals = rentals.merge(markets[['name', 'parcl_id']], on='parcl_id', how='inner')

In [None]:
# Every series below is requested for the same markets, start date and page size,
# so the id list is computed once and the shared arguments live in one helper.
market_ids = rentals['parcl_id'].unique().tolist()


def fetch_market_series(service, **extra):
    """Call `service.retrieve_many` with the notebook's shared request arguments.

    `extra` carries per-series arguments such as `property_type`. A new `params`
    dict is built on every call so no request can see another request's dict.
    """
    return service.retrieve_many(
        parcl_ids=market_ids,
        as_dataframe=True,
        start_date=START_DATE,
        params={'limit': 300},
        **extra,
    )


# rolling counts of new rental listings, one frame per property type
rolling_counts = client.rental_market_metrics_new_listings_for_rent_rolling_counts
supply_sfh = fetch_market_series(rolling_counts, property_type='SINGLE_FAMILY')
supply_all = fetch_market_series(rolling_counts, property_type='ALL_PROPERTIES')
supply_condos = fetch_market_series(rolling_counts, property_type='CONDO')
supply_townhomes = fetch_market_series(rolling_counts, property_type='TOWNHOUSE')

# investor activity: housing event prices and counts
prices = fetch_market_series(client.investor_metrics_housing_event_prices)
investors = fetch_market_series(client.investor_metrics_housing_event_counts)

# gross rental yield across all property types
yields = fetch_market_series(client.rental_market_metrics_gross_yield, property_type='ALL_PROPERTIES')

# market-wide housing event prices
housing_prices = fetch_market_series(client.market_metrics_housing_event_prices)


In [None]:
# Investor housing stock ownership series for the same markets as `rentals`.
ownership_market_ids = rentals['parcl_id'].unique().tolist()
investor_ownership = client.investor_metrics_housing_stock_ownership.retrieve_many(
    start_date=START_DATE,
    parcl_ids=ownership_market_ids,
    params={'limit': 300},
    as_dataframe=True,
)

In [None]:
# Monthly mean of the 30-day rolling new-rental-listing count per market.
supply_all['date'] = pd.to_datetime(supply_all['date'])
supply_all['month_start'] = supply_all['date'].dt.to_period('M').dt.to_timestamp()
supply_agg = supply_all.groupby(['parcl_id', 'month_start'])['rolling_30_day'].mean().reset_index()
supply_agg = supply_agg.rename(columns={'month_start': 'date'})

# `concentration` was used here without ever being defined, so this cell stopped with a NameError.
# Pull it from the rental units concentration series, which is expected to carry `rental_units`.
# NOTE(review): confirm the service attribute name and returned columns against the installed
# parcllabs version.
concentration = client.rental_market_metrics_rental_units_concentration.retrieve_many(
    parcl_ids=rentals['parcl_id'].unique().tolist(),
    as_dataframe=True,
    start_date=START_DATE,
    property_type='ALL_PROPERTIES',
    params={
        'limit': 300
    }
)
concentration['date'] = pd.to_datetime(concentration['date'])

# inner join: keep only (market, month) pairs present in both frames
supply_ml = supply_agg.merge(concentration[['date', 'rental_units', 'parcl_id']], on=['date', 'parcl_id'], how='inner')
supply_ml.head()

In [None]:
# ratio of new rental listings (monthly mean of the 30-day rolling count) to rental units
# NOTE(review): stored as `vacancy`, but it is a listings-to-units ratio — confirm the name is intended
supply_ml['vacancy'] = supply_ml['rolling_30_day'] / supply_ml['rental_units']
# inspect a single market (5826765 is the id paired with the name 'USA' later in this notebook)
supply_ml.loc[supply_ml['parcl_id'] == 5826765]

In [None]:
# Give each property type's 30-day rolling count its own column name before combining.
supply_all = supply_all.rename(columns={'rolling_30_day': 'rolling_30_day_all'})
supply_sfh = supply_sfh.rename(columns={'rolling_30_day': 'rolling_30_day_sfh'})
supply_condos = supply_condos.rename(columns={'rolling_30_day': 'rolling_30_day_condos'})
supply_townhomes = supply_townhomes.rename(columns={'rolling_30_day': 'rolling_30_day_townhomes'})

# Earlier in the notebook only `supply_all['date']` is converted to datetime. Coerce the key
# column on every frame so the merges below never join a datetime column against another dtype.
for frame in (supply_all, supply_sfh, supply_condos, supply_townhomes):
    frame['date'] = pd.to_datetime(frame['date'])

# Inner joins on (parcl_id, date): a row survives only if all four series have that key.
supply = supply_all.merge(supply_sfh[['parcl_id', 'date', 'rolling_30_day_sfh']], on=['parcl_id', 'date'], how='inner')
supply = supply.merge(supply_condos[['parcl_id', 'date', 'rolling_30_day_condos']], on=['parcl_id', 'date'], how='inner')
supply = supply.merge(supply_townhomes[['parcl_id', 'date', 'rolling_30_day_townhomes']], on=['parcl_id', 'date'], how='inner')
supply = supply[['date', 'parcl_id', 'rolling_30_day_all', 'rolling_30_day_sfh', 'rolling_30_day_condos', 'rolling_30_day_townhomes']]

In [None]:
# whatever remains of the all-properties count after single family, condos and townhomes is "other"
supply['rolling_30_day_other'] = supply['rolling_30_day_all'] - supply['rolling_30_day_sfh'] - supply['rolling_30_day_condos'] - supply['rolling_30_day_townhomes']
# clipping is disabled, so the remainder is negative wherever the three components sum to more than the total
# supply['rolling_30_day_other'] = supply['rolling_30_day_other'].clip(lower=0)
supply.head()

In [None]:
# share column -> the count column it is computed from
share_sources = {
    'pct_single_family': 'rolling_30_day_sfh',
    'pct_condo': 'rolling_30_day_condos',
    'pct_townhouse': 'rolling_30_day_townhomes',
    'pct_other': 'rolling_30_day_other',
}

# per-date shares of the all-properties count
for share_col, count_col in share_sources.items():
    supply[share_col] = supply[count_col] / supply['rolling_30_day_all']

# per-market totals over every date, then the same shares computed on those totals
count_cols = ['rolling_30_day_all', *share_sources.values()]
pie = supply.groupby('parcl_id')[count_cols].sum().reset_index()
for share_col, count_col in share_sources.items():
    pie[share_col] = pie[count_col] / pie['rolling_30_day_all']

In [None]:
pie.head()

In [None]:
# add the market name to the per-market totals, then format it with the state included
df = pie.merge(markets[['name', 'parcl_id']], on='parcl_id', how='inner')
df['name'] = df['name'].map(lambda metro: format_metro_names(metro, include_state=True))


In [None]:
import plotly.graph_objects as go

def build_pie_chart(data, market_name: str, save_path: str = None):
    """Draw a pie chart of one market's rental supply mix.

    Args:
        data: frame whose first row supplies `pct_single_family`, `pct_condo`,
            `pct_townhouse` and `pct_other`.
        market_name: name shown in the chart title.
        save_path: when given, the figure is also written to this path.

    The figure is always displayed with `fig.show()`; nothing is returned.
    """
    # slice label -> column holding its share; only the first row of `data` is read
    slice_columns = {
        'Single Family': 'pct_single_family',
        'Condo': 'pct_condo',
        'Townhouse': 'pct_townhouse',
        'Other': 'pct_other',
    }
    labels = list(slice_columns)
    values = [data[column].values[0] for column in slice_columns.values()]

    pie_trace = go.Pie(
        labels=labels,
        values=values,
        textinfo='label+percent',
        textposition='outside',
        pull=[0.05] * len(labels),
        marker={
            'colors': ['#636EFA', '#EF553B', '#00CC96', '#AB63FA'],
            'line': {'color': '#FFFFFF', 'width': 2},
        },
        insidetextorientation='radial',
    )
    fig = go.Figure(data=[pie_trace])

    # Parcl Labs logo anchored to the bottom-right corner
    logo = create_labs_logo_dict(
        src='labs',
        x=1,
        y=0,
        sizex=0.2,
        sizey=0.2,
        xanchor="right",
        yanchor="bottom",
    )
    fig.add_layout_image(logo)

    # title, subtitle annotation and dark theme
    subtitle = {
        'text': 'Average Rental Market Supply Breakdown Since `20',
        'x': 0.5,
        'y': 1.12,
        'showarrow': False,
        'font': {'size': 18, 'color': 'white'},
        'xanchor': 'center',
    }
    fig.update_layout(
        margin={'l': 10, 'r': 10, 't': 150, 'b': 10},
        title={
            'text': f'Rental Market Supply Mix: {market_name}',
            'font': {'size': 28},
            'x': 0.5,
            'y': 0.95,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        annotations=[subtitle],
        showlegend=False,
        plot_bgcolor='#222222',
        paper_bgcolor='#222222',
        font={'color': 'white'},
    )

    # hover content and slice label size
    fig.update_traces(hoverinfo='label+percent+value', textfont_size=14)

    # fixed square canvas
    fig.update_layout(autosize=False, width=800, height=800)

    if save_path:
        fig.write_image(save_path)

    fig.show()


In [None]:
# Select one market by its formatted name and chart its supply mix.
# NOTE(review): the lookup is an exact string match against the output of format_metro_names;
# if nothing matches, `pie_chart` is empty and build_pie_chart fails on `.values[0]` — confirm the name.
name = 'Denver City, DENVER CITY'
pie_chart = df.loc[df['name'] == name]
build_pie_chart(
        data=pie_chart,
        market_name='Denver City',
        # save_path=f'../graphics/housing_mix_analysis/{name}_housing_stock_mix.png'
    )

In [None]:
# Define the function to build the line chart
def build_line_chart(
    data: pd.DataFrame,
    title: str = None,
    x_axis_title: str = None,
    y_axis_title: str = None,
    save_path: str = None,
    value_name: str = None
):
    """Plot one line per property type, with values shown as percentages.

    Args:
        data: frame with `date`, `property_type` and the column named by `value_name`.
        title: chart title.
        x_axis_title: x-axis label.
        y_axis_title: y-axis label.
        save_path: when given, the figure is also written to this path.
        value_name: column to plot; its values are multiplied by 100.

    Reads the module-level `labs_logo_dict` for the logo. The figure is always
    displayed with `fig.show()`; nothing is returned.
    """
    HEIGHT = 900
    WIDTH = 1600
    white = '#FFFFFF'
    background = '#222222'
    axis_line = 'rgba(255, 255, 255, 0.7)'
    covid_start = "2020-03-01"

    # line colour per property type; types not listed here are drawn in white
    colors = {
        'Single Family': '#1f77b4',  # Blue
        'All Other Units': '#ff7f0e',  # Orange
        'Townhouse': '#2ca02c'       # Green
    }

    fig = go.Figure()

    # one trace per property type, in order of first appearance
    for kind in data['property_type'].unique():
        rows = data[data['property_type'] == kind]
        trace = go.Scatter(
            x=rows['date'],
            y=rows[value_name]*100,
            mode='lines+markers',
            name=kind,
            line={'width': 3, 'color': colors.get(kind, white)},
            marker={'size': 8},
        )
        fig.add_trace(trace)

    # dotted red vertical line spanning the full plot height at the COVID start date
    covid_line = {
        'type': "line",
        'x0': covid_start,
        'y0': 0,
        'x1': covid_start,
        'y1': 1,
        'xref': 'x',
        'yref': 'paper',
        'line': {'color': "Red", 'width': 2, 'dash': "dot"},
    }
    fig.add_shape(covid_line)

    # label for that line, placed just to its right at the top of the plot
    covid_label = {
        'x': covid_start,
        'y': 1,
        'xref': 'x',
        'yref': 'paper',
        'xanchor': 'left',
        'xshift': 10,
        'text': "COVID Starts",
        'showarrow': False,
        'font': {'size': 14, 'color': "Red"},
        'align': "center",
    }
    fig.add_annotation(covid_label)

    # logo image
    fig.add_layout_image(labs_logo_dict)

    x_axis = {
        'title_text': x_axis_title,
        'showgrid': True,
        'gridwidth': 0.5,
        'gridcolor': 'rgba(255, 255, 255, 0.1)',
        'tickangle': -45,
        'tickfont': {'size': 14},
        'linecolor': axis_line,
        'linewidth': 1,
    }
    y_axis = {
        'title_text': y_axis_title,
        'showgrid': True,
        'gridwidth': 0.5,
        'gridcolor': 'rgba(255, 255, 255, 0.2)',
        'tickfont': {'size': 14},
        'tickprefix': '',
        'ticksuffix': '%',  # values are percentages
        'zeroline': False,
        'linecolor': axis_line,
        'linewidth': 1,
    }
    # legend in the top right corner
    legend = {
        'x': 0.95,
        'y': 0.95,
        'xanchor': 'right',
        'yanchor': 'top',
        'font': {'size': 14, 'color': white},
        'bgcolor': 'rgba(0, 0, 0, 0.5)',
    }

    fig.update_layout(
        margin={'l': 100, 'r': 100, 't': 150, 'b': 100},
        height=HEIGHT,
        width=WIDTH,
        title={
            'text': title,
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 28, 'color': white},
        },
        plot_bgcolor=background,
        paper_bgcolor=background,
        font={'color': white},
        xaxis=x_axis,
        yaxis=y_axis,
        hovermode='x unified',
        hoverlabel={
            'bgcolor': '#333333',
            'font_size': 14,
            'font_family': "Rockwell",
        },
        legend=legend,
    )

    if save_path:
        fig.write_image(save_path, width=WIDTH, height=HEIGHT)

    fig.show()


# load assets: module-level logo settings that build_line_chart reads as a global.
# It is defined after the function, which works because the name is only looked up when the function runs.
labs_logo_dict = create_labs_logo_dict(
    src='labs',
    y=1.03,
    sizex=0.15,
    sizey=0.15,
    x=0.47
)

In [None]:
# everything that is not single family
supply['pct_all_other'] = 1 - supply['pct_single_family']

# Prepare data for the line chart: one long frame with a row per (market, date, property type).
# Each piece is renamed first and gets `property_type` through `.assign`, so the new column is
# added to a fresh frame instead of a slice of `supply` (which raises SettingWithCopyWarning).
supply_other = (
    supply[['parcl_id', 'date', 'pct_all_other']]
    .rename(columns={'pct_all_other': 'pct_supply'})
    .assign(property_type='All Other Units')
)
# NOTE: this rebinds `supply_sfh`, which until now held the single family rolling counts.
supply_sfh = (
    supply[['parcl_id', 'date', 'pct_single_family']]
    .rename(columns={'pct_single_family': 'pct_supply'})
    .assign(property_type='Single Family')
)
ratios = pd.concat([supply_other, supply_sfh])
ratios.head()

In [None]:
# Chart the supply mix over time for a single market.
pid = 5826765
name = 'USA'
selected_market = ratios['parcl_id'] == pid
build_line_chart(
    data=ratios.loc[selected_market],
    value_name='pct_supply',
    title=f'{name} Rental Supply Mix: Single Family vs. All Other Units',
    x_axis_title='',
    y_axis_title='Percentage of Rental Supply',
    # save_path=f'../graphics/housing_mix_analysis/{name}_home_sales_mix.png'
)

In [None]:


investors.head()

In [None]:
yields.head()

In [None]:
# Define the function to build the dual-axis chart with a stacked bar
def demand_vs_pf_chart(
    market_name: str,
    data: pd.DataFrame,
    sales_data: pd.DataFrame,
    price_series: str = 'price_feed',
    single_family_series: str = 'sales',
    all_other_units_series: str = 'all_other_sales',
    save_path: str = None,
    title: str = None,
    yaxis1_title: str = None,
    yaxis2_title: str = None,
    bar1_name: str = 'Acquisitions',
    bar2_name: str = 'Dispositions',
    yaxis1_tickprefix: str = '$',
):
    """Draw a dual-axis chart: a line on the left axis over two stacked bar series on the right axis.

    Args:
        market_name: not used inside the function; kept so existing calls keep working.
        data: frame with `date` and the column named by `price_series` (the line).
        sales_data: frame with `date` and the two bar columns.
        price_series: column of `data` plotted as the line.
        single_family_series: column of `sales_data` plotted as the first bar series.
        all_other_units_series: column of `sales_data` plotted as the second bar series.
        save_path: when given, the figure is also written to this path.
        title: chart title.
        yaxis1_title: left axis title; also used as the line's legend name.
        yaxis2_title: right axis title.
        bar1_name: legend name of the first bar series.
        bar2_name: legend name of the second bar series.
        yaxis1_tickprefix: prefix for left axis ticks; pass '' for series that are not dollar amounts.

    The figure is always displayed with `fig.show()`; nothing is returned.
    """
    HEIGHT = 900
    WIDTH = 1600

    fig = go.Figure()

    # first bar series, on the secondary axis
    fig.add_trace(go.Bar(
        x=sales_data['date'],
        y=sales_data[single_family_series],
        marker=dict(color='#636EFA', opacity=0.8),
        name=bar1_name,
        yaxis='y2'
    ))

    # second bar series, on the secondary axis
    fig.add_trace(go.Bar(
        x=sales_data['date'],
        y=sales_data[all_other_units_series],
        marker=dict(color='#EF553B', opacity=0.8),
        name=bar2_name,
        yaxis='y2'
    ))

    # line on the primary axis
    fig.add_trace(go.Scatter(
        x=data['date'],
        y=data[price_series],
        mode='lines+markers',
        line=dict(width=4, color='#00CC96'),  # Solid green color for price series
        marker=dict(size=8),
        name=yaxis1_title
    ))

    # Reverse the trace order so the line comes first.
    # NOTE(review): the bars sit on the overlaying y2 axis, so plotly may still draw them above
    # the line; the reversal also flips legend and bar stacking order — confirm visually.
    fig.data = fig.data[::-1]

    # Parcl Labs logo, positioned like the one build_line_chart uses. This replaces a
    # placeholder URL that pointed at no real image. It is built here so the function
    # does not depend on a module-level logo dict being defined first.
    fig.add_layout_image(
        create_labs_logo_dict(
            src='labs',
            y=1.03,
            sizex=0.15,
            sizey=0.15,
            x=0.47
        )
    )

    fig.update_layout(
        margin=dict(l=100, r=100, t=150, b=100),
        height=HEIGHT,
        width=WIDTH,
        title={
            'text': title,
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=28, color='#FFFFFF'),
        },
        plot_bgcolor='#1e1e1e',
        paper_bgcolor='#1e1e1e',
        font=dict(color='#FFFFFF'),
        xaxis=dict(
            title_text='',
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(255, 255, 255, 0.2)',
            tickangle=-45,
            tickfont=dict(size=14),
            linecolor='rgba(255, 255, 255, 0.7)',
            linewidth=2
        ),
        yaxis=dict(
            title_text=yaxis1_title,
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(255, 255, 255, 0.3)',
            tickfont=dict(size=14),
            tickprefix=yaxis1_tickprefix,
            zeroline=False,
            linecolor='rgba(255, 255, 255, 0.7)',
            linewidth=2
        ),
        yaxis2=dict(
            title_text=yaxis2_title,
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(255, 255, 255, 0.3)',
            tickfont=dict(size=14),
            zeroline=False,
            linecolor='rgba(255, 255, 255, 0.7)',
            linewidth=2,
            overlaying='y',
            side='right',
            ticksuffix=' units'
        ),
        hovermode='x unified',
        hoverlabel=dict(
            bgcolor='#333333',
            font_size=14,
            font_family="Rockwell"
        ),
        legend=dict(
            x=0.95,  # Position legend in the top right corner
            y=0.95,
            xanchor='right',
            yanchor='top',
            font=dict(size=14, color='#FFFFFF'),
            bgcolor='rgba(0, 0, 0, 0.5)'
        ),
        barmode='stack'  # Stack the bars
    )

    if save_path:
        fig.write_image(save_path, width=WIDTH, height=HEIGHT)
    fig.show()

pid = 5387853
name = markets.loc[markets['parcl_id'] == pid, 'name'].values[0]
supply['rolling_30_day_all_other'] = supply['rolling_30_day_all'] - supply['rolling_30_day_sfh']
# `pf` is prepared here but not passed to the chart below
pf = rentals.loc[(rentals['parcl_id'] == pid) & (rentals['date'] >= '2020-01-01')]
# counts_data = supply.loc[(supply['parcl_id'] == pid) & (supply['date'] >= '2023-01-01')]
# counts_data.head()
counts_data = investors.loc[(investors['parcl_id'] == pid) & (investors['date'] >= '2020-01-01')]

# The line is the gross yield series and the bars are investor acquisitions and dispositions,
# so the titles now say that. The previous titles described rental price per square foot and
# rental supply, neither of which is plotted here.
# NOTE(review): demand_vs_pf_chart prefixes primary-axis ticks with '$', which does not suit a yield series.
demand_vs_pf_chart(
    name,
    yields.loc[yields['parcl_id'] == pid],
    # yields.loc[yields['parcl_id'] == pid],
    counts_data,
    price_series='pct_gross_yield',
    single_family_series='acquisitions',
    all_other_units_series='dispositions',
    yaxis1_title='Gross Rental Yield',
    yaxis2_title='Investor Acquisitions and Dispositions',
    title=f'{name} Gross Rental Yield vs. Investor Acquisitions and Dispositions',
    # save_path=f'../graphics/housing_mix_analysis/{name}_purchases_by_pf.png'
)

In [None]:
prices.head()

In [None]:
# give the ownership columns investor-specific names, then list the resulting columns
investor_ownership = investor_ownership.rename(columns={'count': 'investor_owner_count', 'pct_ownership': 'investor_pct_ownership'})
investor_ownership.columns.tolist()

In [None]:
prices.head()

In [None]:
# net investor activity: acquisitions minus dispositions
investors['net'] = investors['acquisitions'] - investors['dispositions']
# prefix the market-wide price columns with `mkt_`, presumably so they do not clash with the
# same-named investor price columns after the merges below
housing_prices = housing_prices.rename(columns={'price_median_sales': 'mkt_price_median_sales', 'price_median_new_listings_for_sale': 'mkt_price_median_new_listings_for_sale'})
# chain of merges on (parcl_id, date) with pandas' default join, which keeps only keys present in both sides
tmp = yields.merge(investors, on=['parcl_id', 'date'])
tmp = tmp.merge(prices, on=['parcl_id', 'date'])
tmp = tmp.merge(investor_ownership, on=['parcl_id', 'date'])
tmp = tmp.merge(housing_prices[['mkt_price_median_sales', 'mkt_price_median_new_listings_for_sale', 'date', 'parcl_id']], on=['parcl_id', 'date'])
# tmp = tmp.merge(supply, on=['parcl_id', 'date'])
# tmp[['pct_gross_yield', 'net']].corr()

# Ensure 'date' column is in datetime format
rentals['date'] = pd.to_datetime(rentals['date'])

# Truncate date to the first of the month
rentals['month_start'] = rentals['date'].dt.to_period('M').dt.to_timestamp()
# (mid-cell expression: its result is not displayed)
rentals.head()

# monthly mean of the rental price feed per market, keyed by the month's first day
agg = rentals.groupby(['parcl_id', 'month_start'])['rental_price_feed'].mean().reset_index()
agg = agg.rename(columns={'month_start': 'date'})
# convert `tmp['date']` to datetime so it matches `agg['date']` in the merge
tmp['date'] = pd.to_datetime(tmp['date'])
tmp = tmp.merge(agg, on=['parcl_id', 'date'], how='inner')
tmp = tmp.sort_values('date')
# per market, the rental price feed value from 11 rows earlier in date order
# NOTE(review): a positive shift brings in past values; if the aim is to relate today's features
# to future rents, the shift would need to be negative — confirm intent.
tmp['pf_shift'] = tmp.groupby('parcl_id')['rental_price_feed'].shift(11)
# drops every row with a missing value in any column, including each market's first 11 rows (no pf_shift)
tmp = tmp.dropna()
tmp.head()

In [None]:
# Pairwise correlations between the yield, investor, price and ownership columns.
corr_columns = [
    'pct_gross_yield',
    'net',
    'pf_shift',
    'rental_price_feed',
    'acquisitions',
    'dispositions',
    'new_rental_listings',
    'new_listings_for_sale',
    'price_median_acquisitions',
    'price_median_dispositions',
    'price_median_new_listings_for_sale',
    'price_median_new_rental_listings',
    'price_per_square_foot_median_acquisitions',
    'price_per_square_foot_median_dispositions',
    'price_per_square_foot_median_new_listings_for_sale',
    'price_per_square_foot_median_new_rental_listings',
    'mkt_price_median_sales',
    'mkt_price_median_new_listings_for_sale',
    'investor_owner_count',
    'investor_pct_ownership',
]
tmp[corr_columns].corr()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns

# `tmp` is the merged monthly frame built earlier; add calendar features for the model
tmp['date'] = pd.to_datetime(tmp['date'])
tmp['year'] = tmp['date'].dt.year
tmp['month'] = tmp['date'].dt.month
# NOTE(review): parcl_id is made categorical but is not among the model features below
tmp['parcl_id'] = tmp['parcl_id'].astype('category')

# Define numerical features
numerical_features = [
    'pct_gross_yield', 
    'acquisitions', 
    'net',
    'new_listings_for_sale',
    'price_median_acquisitions', 
    'price_median_dispositions',
    'price_per_square_foot_median_new_listings_for_sale',
    # 'price_median_new_rental_listings',
    'mkt_price_median_sales',#  'mkt_price_median_new_listings_for_sale'
    'investor_pct_ownership'
]
categorical_features = ['year', 'month']

# Exploratory Data Analysis (EDA): correlations between the features and the target
plt.figure(figsize=(12, 6))
sns.heatmap(tmp[numerical_features+['pf_shift']].corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

# Checking multicollinearity using VIF
X_vif = tmp[numerical_features].dropna()

# Apply RobustScaler to handle outliers and extreme values
scaler = RobustScaler()
X_vif_scaled = scaler.fit_transform(X_vif)

# variance_inflation_factor does not add an intercept to its auxiliary regressions, so a
# constant column is included explicitly. Without it the regressions run through the origin
# and the reported VIFs are not the usual centred ones.
X_vif_design = np.column_stack([np.ones(len(X_vif_scaled)), X_vif_scaled])

vif_data = pd.DataFrame()
vif_data["feature"] = X_vif.columns
# column 0 of the design matrix is the constant, so feature i sits at index i + 1
vif_data["VIF"] = [variance_inflation_factor(X_vif_design, i + 1) for i in range(len(X_vif.columns))]
print(vif_data)

# Creating train-test split; the target is the shifted rental price feed built earlier
X = tmp[numerical_features + categorical_features]
y = tmp['pf_shift']

# shuffle=False keeps row order, so the last 10% of rows form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

# Preprocessing pipeline: robust-scale the numeric columns, one-hot encode year and month
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

# Ridge Regression Model
ridge_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])
ridge_model.fit(X_train, y_train)

# Predict
y_ridge_pred = ridge_model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_ridge_pred)
mse = mean_squared_error(y_test, y_ridge_pred)
ridge_r2 = r2_score(y_test, y_ridge_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {ridge_r2}')

# Cross-validation with time-ordered splits over the full data set
cv_scores = cross_val_score(ridge_model, X, y, cv=TimeSeriesSplit(n_splits=5), scoring='r2')
print(f'Cross-validated R2 scores: {cv_scores}')
print(f'Mean cross-validated R2 score: {np.mean(cv_scores)}')

# Feature importance: signed ridge coefficients, labelled in the preprocessor's output order
feature_names = numerical_features + list(ridge_model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features))
ridge_coefs = ridge_model.named_steps['regressor'].coef_

feature_importance = pd.Series(ridge_coefs, index=feature_names).sort_values(ascending=False)
plt.figure(figsize=(12, 8))
feature_importance.plot(kind='bar')
plt.title('Feature Importance - Ridge Regression')
plt.show()

# Residual analysis
residuals = y_test - y_ridge_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_ridge_pred, residuals)
plt.axhline(0, color='r', linestyle='--')
plt.title('Residuals vs Predicted')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.show()


In [None]:
X_train

In [None]:
X_train

In [None]:
import plotly.graph_objects as go
import pandas as pd





def create_housing_mix_tweet(
    name: str,
    pie_chart: pd.DataFrame
):
    """Format the housing mix shares from the first row of `pie_chart` as tweet text.

    Each share is printed as a percentage with two decimals under a
    "{name} Current Housing Mix" heading.
    """
    # only the first row is read
    share_columns = ('pct_single_family', 'pct_condo', 'pct_other', 'pct_townhouse')
    shares = {column: pie_chart[column].values[0] for column in share_columns}

    return f"""
    {name} Current Housing Mix

    Single Family: {shares['pct_single_family']:.2%}
    Condo: {shares['pct_condo']:.2%}
    Townhouse: {shares['pct_townhouse']:.2%}
    Other: {shares['pct_other']:.2%}
    """

def _pct_change_from_end(values, periods_back):
    """Return the fractional change from the value `periods_back` rows before the last to the last.

    Returns NaN when there are too few rows to look that far back.
    """
    if periods_back < 0 or len(values) <= periods_back:
        return float('nan')
    baseline = values[-1 - periods_back]
    return (values[-1] - baseline) / baseline


def create_sfh_sales_mix_tweet(
    name: str,
    line_chart_data: pd.DataFrame
):
    """Summarise the single family share of sales as tweet text.

    Only 'Single Family' rows of `line_chart_data` are used, sorted by `date`.
    The `pct_sales` column is read, and each row is treated as one month.

    Look-backs are counted in whole periods: the 6-month change compares the
    latest row with the row six rows earlier (the previous `[-6]` index was only
    five rows back, and `[-12]` only eleven). Changes that need more history
    than is available are reported as NaN instead of raising IndexError.
    """
    line_chart_data = line_chart_data.loc[line_chart_data['property_type'] == 'Single Family'].sort_values('date')
    pct_sales = line_chart_data['pct_sales'].values

    current_sfh_sale_ratio = pct_sales[-1] if len(pct_sales) else float('nan')
    pct_change_sfh_sale_ratio_6_mo = _pct_change_from_end(pct_sales, 6)
    pct_change_sfh_sale_ratio_yoy = _pct_change_from_end(pct_sales, 12)
    # change against the first available row
    pct_change_sfh_sale_ratio_19 = _pct_change_from_end(pct_sales, len(pct_sales) - 1)

    sfh_sales_tweet = f"""
    {name} SFH vs. All Other Unit Sales

    Current SFH Sale %: {current_sfh_sale_ratio:.2%}
    % Change (6 mo): {pct_change_sfh_sale_ratio_6_mo:.2%}
    % Change (YoY): {pct_change_sfh_sale_ratio_yoy:.2%}
    % Change (Since `19): {pct_change_sfh_sale_ratio_19:.2%}
    """

    return sfh_sales_tweet

def create_sales_vol_pf_tweet(
        name: str,
        pf: pd.DataFrame, 
        counts_data: pd.DataFrame
):
    """Summarise price feed and sales changes over several look-back windows as tweet text.

    `pf` must have `date` and `price_feed` and is treated as one row per day.
    `counts_data` must have `date` and `sales` and is treated as one row per month.

    Look-backs are counted in whole periods (30, 180 and 365 rows for the daily
    series; 1, 6 and 12 rows for the monthly one). Changes that need more history
    than is available are reported as NaN instead of raising IndexError.
    """
    # daily series
    pf = pf.sort_values('date')
    price_feed = pf['price_feed'].values
    pf_pct_change_30_day = _pct_change_from_end(price_feed, 30)
    pf_pct_change_6_mo = _pct_change_from_end(price_feed, 180)
    pf_pct_change_yoy = _pct_change_from_end(price_feed, 365)
    pf_pct_change_since_2019 = _pct_change_from_end(price_feed, len(price_feed) - 1)

    # monthly series
    demand_sfh = counts_data.sort_values('date')
    sales = demand_sfh['sales'].values
    demand_sfh_pct_change_30_day = _pct_change_from_end(sales, 1)
    demand_sfh_pct_change_6_mo = _pct_change_from_end(sales, 6)
    demand_sfh_pct_change_yoy = _pct_change_from_end(sales, 12)
    demand_sfh_pct_change_since_2019 = _pct_change_from_end(sales, len(sales) - 1)

    sales_volume_pf_tweet = f"""
    {name} Sales Volume vs. PPSQF

    % Change (30 Day)
    PPSQF: {pf_pct_change_30_day:.2%}
    SFH Sales: {demand_sfh_pct_change_30_day:.2%}

    % Change (6 Mo)
    PPSQF: {pf_pct_change_6_mo:.2%}
    SFH Sales: {demand_sfh_pct_change_6_mo:.2%}

    % Change (YoY)
    PPSQF: {pf_pct_change_yoy:.2%}
    SFH Sales: {demand_sfh_pct_change_yoy:.2%}

    % Change (Since `19)
    PPSQF: {pf_pct_change_since_2019:.2%}
    SFH Sales: {demand_sfh_pct_change_since_2019:.2%}
    """
    return sales_volume_pf_tweet


In [None]:
# Build the three tweet texts and the price-vs-sales chart for one market.
# NOTE(review): `feeds` and `chart` are not defined anywhere in the visible part of this notebook,
# and `df` as built above has no `date` column — this cell looks carried over from a sales-mix
# notebook and will fail as written; confirm where these frames should come from.
pid = 5374321

pf = feeds.loc[feeds['parcl_id'] == pid]
pie_chart = df.loc[(df['parcl_id']==pid) & (df['date']=='2024-04-01')]
line_chart_data = chart.loc[chart['parcl_id'] == pid]
counts_data = line_chart_data.copy(deep=True)
# sales outside the row's own count
counts_data['all_other_sales'] = counts_data['all_sales'] - counts_data['sales']
counts_data = counts_data.loc[counts_data['property_type'] == 'Single Family']
name = line_chart_data['name'].values[0]
housing_mix_tweet = create_housing_mix_tweet(name=name, pie_chart=pie_chart)
sfh_sales_tweet = create_sfh_sales_mix_tweet(name=name, line_chart_data=line_chart_data)
sales_volume_pf_tweet = create_sales_vol_pf_tweet(name=name, pf=pf, counts_data=counts_data)

# Example call to the function
demand_vs_pf_chart(
    name, 
    pf, 
    counts_data,
    price_series='price_feed',
    single_family_series='sales',
    all_other_units_series='all_other_sales',
    yaxis1_title='Price per Square Foot ($)',
    yaxis2_title='Total Number of Purchases',
    title=f'{name} Price per Square Foot and Total Number of Sales by Property Type',
    save_path=f'../graphics/housing_mix_analysis/{name}_purchases_by_pf.png'
)

In [None]:
# Example usage: for every market in `chart`, print the three tweet texts and save the three charts.
# Assuming you have a DataFrame named 'chart' with columns 'parcl_id', 'property_type', 'date', 'pct_sales', and 'name'
# NOTE(review): `feeds` and `chart` are not defined anywhere in the visible part of this notebook,
# and `df` as built above has no `date` column — confirm where these frames should come from.
for pid in chart['parcl_id'].unique():
    pf = feeds.loc[feeds['parcl_id'] == pid]
    pie_chart = df.loc[(df['parcl_id']==pid) & (df['date']=='2024-04-01')]
    line_chart_data = chart.loc[chart['parcl_id'] == pid]
    counts_data = line_chart_data.copy(deep=True)
    # sales outside the row's own count
    counts_data['all_other_sales'] = counts_data['all_sales'] - counts_data['sales']
    counts_data = counts_data.loc[counts_data['property_type'] == 'Single Family']
    name = line_chart_data['name'].values[0]
    housing_mix_tweet = create_housing_mix_tweet(name=name, pie_chart=pie_chart)
    sfh_sales_tweet = create_sfh_sales_mix_tweet(name=name, line_chart_data=line_chart_data)
    sales_volume_pf_tweet = create_sales_vol_pf_tweet(name=name, pf=pf, counts_data=counts_data)
    
    

    # each tweet text is printed just before the chart it accompanies
    print(housing_mix_tweet)
    build_pie_chart(
        data=pie_chart,
        market_name=name,
        save_path=f'../graphics/housing_mix_analysis/{name}_housing_stock_mix.png'
    )
    print(sfh_sales_tweet)
    build_line_chart(
        data=chart.loc[chart['parcl_id']==pid],
        title=f'{name} Sales Mix: Single Family vs. All Other Units',
        x_axis_title='',
        y_axis_title='Percentage of Total Sales',
        value_name='pct_sales',
        save_path=f'../graphics/housing_mix_analysis/{name}_home_sales_mix.png'
)
    print(sales_volume_pf_tweet)
    # Example call to the function
    demand_vs_pf_chart(
        name, 
        pf, 
        counts_data,
        price_series='price_feed',
        single_family_series='sales',
        all_other_units_series='all_other_sales',
        yaxis1_title='Price per Square Foot ($)',
        yaxis2_title='Total Number of Purchases',
        title=f'{name} Price per Square Foot and Total Number of Sales by Property Type',
        save_path=f'../graphics/housing_mix_analysis/{name}_purchases_by_pf.png'
    )
    