In [3]:
# ------ START OF COMMON FILE ANALYSIS CODE -----
# Dependencies
import pandas as pd
from pathlib import Path
import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np

In [5]:
# Set filepath for file to analyze
# NOTE: Edit this for each new file being run
zillow_csv_path = Path('Zillow_Data/Metro_zori_uc_sfrcondomfr_sm_month.csv')
zillow_metric_name = 'Rent Index'

In [7]:
# Create a data frame containing the data from the CSV
zillow_df = pd.read_csv(zillow_csv_path)
zillow_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,...,2023-09-30,2023-10-31,2023-11-30,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30
0,102001,0,United States,country,,1253.449061,1260.313718,1269.579976,1279.021194,1288.644933,...,2005.728434,2004.533106,2000.353386,1997.393964,1999.895957,2007.546945,2018.360685,2030.249059,2042.463752,2053.69292
1,394913,1,"New York, NY",msa,NY,2419.800397,2435.544929,2454.626643,2476.503269,2492.789838,...,3381.793473,3365.593176,3341.65675,3322.333169,3325.32804,3343.128943,3376.059213,3406.35257,3442.299156,3472.415449
2,753899,2,"Los Angeles, CA",msa,CA,1848.680007,1860.783762,1877.225401,1890.842498,1906.330946,...,2929.966694,2927.456029,2913.00062,2903.956011,2906.885602,2918.73467,2935.569918,2945.721034,2961.320732,2975.213293
3,394463,3,"Chicago, IL",msa,IL,1460.223748,1467.029721,1477.360961,1487.142749,1498.090186,...,2035.47526,2028.844172,2021.458598,2023.773775,2032.74636,2046.254882,2059.188684,2080.211095,2099.845487,2118.487003
4,394514,4,"Dallas, TX",msa,TX,1138.255989,1144.206978,1152.548664,1164.799816,1174.934505,...,1829.253585,1821.775857,1815.14872,1806.477735,1801.778337,1797.711117,1801.383652,1806.746317,1815.277434,1822.28815


In [9]:
# Rename labels
zillow_df.rename(columns={'SizeRank': 'Size Rank', 'RegionName': 'Metro Area', 'StateName': 'State' }, inplace=True)
# Remove the RegionType column
zillow_df = zillow_df.drop('RegionType', axis = 1)
# Insert a new blank column to store the Size Segment
zillow_df['Size Segment'] = pd.NA
# Re-order columns so that Size Segment is next to Size Rank
columns = zillow_df.columns.tolist()
columns.remove('Size Segment')
columns.insert(2, 'Size Segment')
zillow_df = zillow_df[columns]
zillow_df.head()

Unnamed: 0,RegionID,Size Rank,Size Segment,Metro Area,State,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,...,2023-09-30,2023-10-31,2023-11-30,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30
0,102001,0,,United States,,1253.449061,1260.313718,1269.579976,1279.021194,1288.644933,...,2005.728434,2004.533106,2000.353386,1997.393964,1999.895957,2007.546945,2018.360685,2030.249059,2042.463752,2053.69292
1,394913,1,,"New York, NY",NY,2419.800397,2435.544929,2454.626643,2476.503269,2492.789838,...,3381.793473,3365.593176,3341.65675,3322.333169,3325.32804,3343.128943,3376.059213,3406.35257,3442.299156,3472.415449
2,753899,2,,"Los Angeles, CA",CA,1848.680007,1860.783762,1877.225401,1890.842498,1906.330946,...,2929.966694,2927.456029,2913.00062,2903.956011,2906.885602,2918.73467,2935.569918,2945.721034,2961.320732,2975.213293
3,394463,3,,"Chicago, IL",IL,1460.223748,1467.029721,1477.360961,1487.142749,1498.090186,...,2035.47526,2028.844172,2021.458598,2023.773775,2032.74636,2046.254882,2059.188684,2080.211095,2099.845487,2118.487003
4,394514,4,,"Dallas, TX",TX,1138.255989,1144.206978,1152.548664,1164.799816,1174.934505,...,1829.253585,1821.775857,1815.14872,1806.477735,1801.778337,1797.711117,1801.383652,1806.746317,1815.277434,1822.28815


In [11]:
# Establish Size Segments as bins
size_segment_bins = [-1, 0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
size_segment_labels = ["National Average", "Top 100", "101 - 200", "201 - 300", "301 - 400", "401 - 500", "501 - 600", "601 - 700", "701 - 800", "801 - 900", "901 - 1000"]

In [13]:
# Insert segment values into Size Segment column
zillow_df['Size Segment'] = pd.cut(zillow_df["Size Rank"], bins=size_segment_bins, labels=size_segment_labels)
zillow_df.head()

Unnamed: 0,RegionID,Size Rank,Size Segment,Metro Area,State,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,...,2023-09-30,2023-10-31,2023-11-30,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30
0,102001,0,National Average,United States,,1253.449061,1260.313718,1269.579976,1279.021194,1288.644933,...,2005.728434,2004.533106,2000.353386,1997.393964,1999.895957,2007.546945,2018.360685,2030.249059,2042.463752,2053.69292
1,394913,1,Top 100,"New York, NY",NY,2419.800397,2435.544929,2454.626643,2476.503269,2492.789838,...,3381.793473,3365.593176,3341.65675,3322.333169,3325.32804,3343.128943,3376.059213,3406.35257,3442.299156,3472.415449
2,753899,2,Top 100,"Los Angeles, CA",CA,1848.680007,1860.783762,1877.225401,1890.842498,1906.330946,...,2929.966694,2927.456029,2913.00062,2903.956011,2906.885602,2918.73467,2935.569918,2945.721034,2961.320732,2975.213293
3,394463,3,Top 100,"Chicago, IL",IL,1460.223748,1467.029721,1477.360961,1487.142749,1498.090186,...,2035.47526,2028.844172,2021.458598,2023.773775,2032.74636,2046.254882,2059.188684,2080.211095,2099.845487,2118.487003
4,394514,4,Top 100,"Dallas, TX",TX,1138.255989,1144.206978,1152.548664,1164.799816,1174.934505,...,1829.253585,1821.775857,1815.14872,1806.477735,1801.778337,1797.711117,1801.383652,1806.746317,1815.277434,1822.28815


In [91]:
# Reshape the DataFrame from wide to long format
zillow_df_long = zillow_df.melt(id_vars=['RegionID', 'Size Rank', 'Size Segment', 'Metro Area', 'State'], 
                                var_name='Date', 
                                value_name=zillow_metric_name)
# Convert Date column to datetime format
zillow_df_long['Date'] = pd.to_datetime(zillow_df_long['Date'], format='%Y-%m-%d')

# Set Date as index
zillow_df_long.set_index('Date', inplace=True)

# Ensure data is sorted by Date
zillow_df_long.sort_index(inplace=True)


# Debugging: Print the first few rows to verify the reshaping and date conversion
print("Data after reshaping and date conversion:")
print(zillow_df_long.head())

# Ensure there are no NaN values in the metric column before calculating percent change
zillow_df_long[zillow_metric_name] = zillow_df_long[zillow_metric_name].ffill()

# Calculate month-to-month percent change for the chosen metric
zillow_df_long[f'{zillow_metric_name} Percent Change'] = zillow_df_long.groupby(['RegionID', 'Size Rank', 'Size Segment', 'Metro Area', 'State'], observed=True)[zillow_metric_name].pct_change() * 100 
# Create a unique DataFrame named after the metric being analyzed
zillow_unique_df_name = zillow_metric_name.replace(" ", "_").lower() + "_df"
globals()[zillow_unique_df_name] = zillow_df_long.copy()
print(f'Created new dataframe: {zillow_unique_df_name}')

# Display the updated DataFrame
print(zillow_df_long.head(2000))
      


Data after reshaping and date conversion:
            RegionID  Size Rank      Size Segment       Metro Area State  \
Date                                                                       
2015-01-31    102001          0  National Average    United States   NaN   
2015-01-31    394913          1           Top 100     New York, NY    NY   
2015-01-31    753899          2           Top 100  Los Angeles, CA    CA   
2015-01-31    394463          3           Top 100      Chicago, IL    IL   
2015-01-31    394514          4           Top 100       Dallas, TX    TX   

             Rent Index  
Date                     
2015-01-31  1253.449061  
2015-01-31  2419.800397  
2015-01-31  1848.680007  
2015-01-31  1460.223748  
2015-01-31  1138.255989  
Created new dataframe: rent_index_df
            RegionID  Size Rank      Size Segment       Metro Area State  \
Date                                                                       
2015-01-31    102001          0  National Average    U

In [127]:

# Reshape the DataFrame from wide to long format
zillow_df_long = zillow_df.melt(id_vars=['RegionID', 'Size Rank', 'Size Segment', 'Metro Area', 'State'], 
                                var_name='Date', 
                                value_name=zillow_metric_name)

# Convert Date column to datetime format
zillow_df_long['Date'] = pd.to_datetime(zillow_df_long['Date'], format='%Y-%m-%d')

# Set Date as index
zillow_df_long.set_index('Date', inplace=True)

# Ensure data is sorted by Date
zillow_df_long.sort_index(inplace=True)

# Debugging: Print the first few rows to verify the reshaping and date conversion
print("Data after reshaping and date conversion:")
print(zillow_df_long.head())

# Ensure there are no NaN values in the metric column before calculating percent change
zillow_df_long[zillow_metric_name] = zillow_df_long[zillow_metric_name].ffill()

# Extract Year and Month from the Date
zillow_df_long['Year'] = zillow_df_long.index.year
zillow_df_long['Month'] = zillow_df_long.index.month

# Ensure the Date column is set as index again after adding Year and Month
zillow_df_long.set_index(['RegionID', 'Size Rank', 'Size Segment', 'Metro Area', 'State', 'Year'], inplace=True)

# Create a DataFrame to calculate yearly percent change
def calculate_yearly_percent_change(df):
    # Get the rent index values for January and December
    january_values = df[df['Month'] == 1].groupby('Year')[zillow_metric_name].first()
    december_values = df[df['Month'] == 12].groupby('Year')[zillow_metric_name].last()
    
    # Merge January and December values into a single DataFrame
    year_df = pd.DataFrame({
        'Rent Index_Jan': january_values,
        'Rent Index_Dec': december_values
    }).dropna()
    
    # Calculate the percent change
    year_df['Yearly Percent Change'] = ((year_df['Rent Index_Dec'] - year_df['Rent Index_Jan']) / year_df['Rent Index_Jan']) * 100
    
    # Reset index to include Year
    year_df.reset_index(inplace=True)
    return year_df

# Apply the function to each region and metric combination
yearly_percent_changes = zillow_df_long.groupby(['RegionID', 'Size Rank', 'Size Segment', 'Metro Area', 'State'], observed=True).apply(calculate_yearly_percent_change).reset_index(drop=True)

# Create a unique DataFrame named after the metric being analyzed
zillow_unique_df_name = zillow_metric_name.replace(" ", "_").lower() + "_yearly_percent_change_df"
globals()[zillow_unique_df_name] = yearly_percent_changes

print(f'Created new dataframe: {zillow_unique_df_name}')

# Display the updated DataFrame
print(yearly_percent_changes.head())


Data after reshaping and date conversion:
            RegionID  Size Rank      Size Segment       Metro Area State  \
Date                                                                       
2015-01-31    102001          0  National Average    United States   NaN   
2015-01-31    394913          1           Top 100     New York, NY    NY   
2015-01-31    753899          2           Top 100  Los Angeles, CA    CA   
2015-01-31    394463          3           Top 100      Chicago, IL    IL   
2015-01-31    394514          4           Top 100       Dallas, TX    TX   

             Rent Index  
Date                     
2015-01-31  1253.449061  
2015-01-31  2419.800397  
2015-01-31  1848.680007  
2015-01-31  1460.223748  
2015-01-31  1138.255989  
Created new dataframe: rent_index_yearly_percent_change_df
   Year  Rent Index_Jan  Rent Index_Dec  Yearly Percent Change
0  2015      782.469314      793.784167               1.446044
1  2016      790.516377      807.796239               2.18

In [118]:
# Create a unique data frame which is named after the metric currently being analyzed:
# Set the df name as a string equal to the zillow metric name, with spaces replaced with underscores and in all lowercase
zillow_unique_df_name = zillow_metric_name.replace(" ", "_").lower() + "_df"
globals()[zillow_unique_df_name] = zillow_df.copy()
print(f'Created new dataframe: {zillow_unique_df_name}')


Created new dataframe: rent_index_df
