In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas as gpd
from linearmodels import PanelOLS
from linearmodels import RandomEffects

# Start from here!

In [4]:
#### read data
pgim_all = pd.read_csv('pgim_erent_all.csv')
pgim_all['y_dt'] = pd.to_datetime(pgim_all['y_dt'])

In [5]:
# read cpi data
cpi = pd.read_csv('cpi.csv')
cpi.drop(columns=['Series ID','Period','Year'],inplace=True)
cpi['y_dt']= pd.to_datetime(cpi['Label'])
cpi.drop(columns=['Label'],inplace=True)
cpi.rename(columns={'Value':'CPI'},inplace=True)

In [6]:
cpi

Unnamed: 0,CPI,y_dt
0,169.300,2000-01-01
1,170.000,2000-02-01
2,171.000,2000-03-01
3,170.900,2000-04-01
4,171.200,2000-05-01
...,...,...
274,298.598,2022-11-01
275,298.990,2022-12-01
276,300.536,2023-01-01
277,301.648,2023-02-01


In [7]:
pgim_all = pd.merge(pgim_all,cpi, on = ['y_dt'], how = 'left')

In [8]:
base_quarter = '2000-10-01'

# Calculate CPI index for each quarter
pgim_all['cpi_index'] = pgim_all['CPI'] / pgim_all.loc[pgim_all['y_dt'] == base_quarter, 'CPI'].values[0]

# Calculate deflated rent price for each quarter
pgim_all['deflated e_rent'] = pgim_all['e_rent'] / pgim_all['cpi_index']

In [9]:
pgim_all

Unnamed: 0,Zip Code,y_dt,e_rent,CPI,cpi_index,deflated e_rent
0,30312,2000-10-01,526.0,173.900,1.000000,526.000000
1,30312,2001-10-01,608.0,177.600,1.021277,595.333333
2,30312,2002-01-01,613.0,177.700,1.021852,599.891390
3,30312,2002-04-01,619.0,179.300,1.031052,600.357501
4,30312,2002-07-01,653.0,180.000,1.035078,630.870556
...,...,...,...,...,...,...
36139,94925,2021-10-01,4188.0,276.522,1.590121,2633.762232
36140,94925,2022-01-01,4267.0,282.599,1.625066,2625.739298
36141,94925,2022-04-01,4417.0,288.611,1.659638,2661.424201
36142,94925,2022-07-01,4305.0,294.628,1.694238,2540.965217


In [10]:
pgim_all.to_csv('pgim_erent_all_deflated.csv',index=False)

In [11]:
#we only want data from 2011-2019, but for change we want to perserve values after 18 months for 2019
mask = (pgim_all['y_dt'].dt.year >= 2011) & (pgim_all['y_dt'].dt.year <= 2021)
df_filtered = pgim_all.loc[mask]

In [12]:
rent_df = df_filtered.reset_index(drop = True)

In [13]:

rent_df['y_dt'] = pd.to_datetime(rent_df['y_dt'])

# Get base year values for each zip code
base_year_rent = rent_df.loc[rent_df['y_dt'] == '2012-01-01', ['Zip Code', 'e_rent']].set_index('Zip Code')

# Create a dictionary for easier lookup
base_rent_dict = base_year_rent['e_rent'].to_dict()

# Apply base year values to each row
rent_df['base rent'] = rent_df['Zip Code'].map(base_rent_dict)

# Get future date values by shifting the data by 6 quarters (18 months)
rent_df['future date'] = rent_df.groupby('Zip Code')['y_dt'].shift(-6)
rent_df['future value'] = rent_df.groupby('Zip Code')['e_rent'].shift(-6)

# Calculate rent change
rent_df['rent_change'] = (rent_df['future value'] - rent_df['e_rent']) / rent_df['base rent']

# Filter the output to keep only the desired columns
output_columns = [
    'Zip Code', 'y_dt', 'e_rent', 'future date', 'future value', 'base rent', 'rent_change'
]
result = rent_df[output_columns]

# Display the result
result


Unnamed: 0,Zip Code,y_dt,e_rent,future date,future value,base rent,rent_change
0,30312,2011-01-01,1030.0,2012-07-01,1122.0,1092.0,0.084249
1,30312,2011-04-01,1055.0,2012-10-01,1109.0,1092.0,0.049451
2,30312,2011-07-01,1074.0,2013-01-01,1111.0,1092.0,0.033883
3,30312,2011-10-01,1079.0,2013-04-01,1149.0,1092.0,0.064103
4,30312,2012-01-01,1092.0,2013-07-01,1167.0,1092.0,0.068681
...,...,...,...,...,...,...,...
18869,94925,2020-10-01,3869.0,NaT,,1838.0,
18870,94925,2021-01-01,3988.0,NaT,,1838.0,
18871,94925,2021-04-01,4114.0,NaT,,1838.0,
18872,94925,2021-07-01,4063.0,NaT,,1838.0,


In [14]:
new_erent = result[['Zip Code','y_dt','rent_change']]

In [15]:
new_erent.to_csv('pgim_processed_change.csv')

# Population + Housing data from ACS

In [16]:
acs_all = pd.read_csv('pca_interpolated_ratechange.csv')

#acs_all = pd.read_csv('pca_interpolated_ratechange_usa.csv')
acs_all['Zip Code'].nunique()

1928

In [17]:
acs_all[acs_all['Zip Code'] == 75201.0]

Unnamed: 0.1,Unnamed: 0,y_dt,Zip Code,Quarterly PC1,Quarterly PC2,Quarterly PC3,Quarterly PC4,Quarterly PC5,Quarterly PC6,Quarterly PC7,Quarterly PC8,Quarterly PC9,Quarterly PC10
5376,683,2010-01-01,75201.0,0.520444,-0.152602,-0.056947,-0.000138,0.001439,0.000755,-0.017889,-0.000434,-0.003053,0.002532
5377,2611,2010-04-01,75201.0,0.520444,-0.152602,-0.056947,-0.000138,0.001439,0.000755,-0.017889,-0.000434,-0.003053,0.002532
5378,4539,2010-07-01,75201.0,0.520444,-0.152602,-0.056947,-0.000138,0.001439,0.000755,-0.017889,-0.000434,-0.003053,0.002532
5379,6467,2010-10-01,75201.0,0.520444,-0.152602,-0.056947,-0.000138,0.001439,0.000755,-0.017889,-0.000434,-0.003053,0.002532
5380,8395,2011-01-01,75201.0,0.520444,-0.152602,-0.056947,-0.000138,0.001439,0.000755,-0.017889,-0.000434,-0.003053,0.002532
5381,10323,2011-04-01,75201.0,0.495446,-0.152594,-0.053068,-0.000668,0.001356,0.000657,-0.020044,-0.001039,-0.002265,0.002603
5382,12251,2011-07-01,75201.0,0.470447,-0.152586,-0.049188,-0.001197,0.001272,0.000559,-0.022199,-0.001644,-0.001477,0.002674
5383,14179,2011-10-01,75201.0,0.445448,-0.152578,-0.045309,-0.001727,0.001189,0.00046,-0.024353,-0.002248,-0.000689,0.002745
5384,16107,2012-01-01,75201.0,0.420449,-0.15257,-0.04143,-0.002257,0.001105,0.000362,-0.026508,-0.002853,0.0001,0.002815
5385,18035,2012-04-01,75201.0,0.395443,-0.152508,-0.037462,-0.001244,0.000608,0.001066,-0.020199,-0.00484,0.000289,0.002062


In [18]:
# Convert 'y_dt' column to datetime
acs_all['y_dt'] = pd.to_datetime(acs_all['y_dt'])

# Get prior date values by shifting the data by 6 quarters (18 months)
acs_all_shifted = acs_all.groupby(['Zip Code']).shift(6).reset_index()
acs_all_shifted = pd.concat([acs_all[['Zip Code', 'y_dt']], acs_all_shifted], axis=1)

acs_all_shifted = acs_all_shifted.iloc[:, :2].join(acs_all_shifted.iloc[:, 5:])


In [19]:
# acs_all_shifted = acs_all.groupby(['Zip Code']).shift(6).reset_index()
# acs_all_shifted = pd.concat([acs_all[['Zip Code', 'y_dt']], acs_all_shifted], axis=1)
acs_processed = acs_all_shifted
acs_processed

Unnamed: 0,Zip Code,y_dt,Quarterly PC1,Quarterly PC2,Quarterly PC3,Quarterly PC4,Quarterly PC5,Quarterly PC6,Quarterly PC7,Quarterly PC8,Quarterly PC9,Quarterly PC10
0,73949.0,2010-01-01,,,,,,,,,,
1,73949.0,2010-04-01,,,,,,,,,,
2,73949.0,2010-07-01,,,,,,,,,,
3,73949.0,2010-10-01,,,,,,,,,,
4,73949.0,2011-01-01,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
92539,88430.0,2020-10-01,-0.304281,0.761781,-0.014195,-0.000447,0.005826,-0.014791,-0.000728,-0.001162,-0.000259,-0.002840
92540,88430.0,2021-01-01,-0.329199,0.761749,-0.014481,-0.006778,0.005446,-0.018725,-0.000274,-0.001006,-0.000019,-0.002781
92541,88430.0,2021-04-01,-0.354118,0.761717,-0.014767,-0.013108,0.005066,-0.022659,0.000181,-0.000851,0.000220,-0.002723
92542,88430.0,2021-07-01,-0.379036,0.761686,-0.015053,-0.019438,0.004686,-0.026593,0.000635,-0.000696,0.000460,-0.002664


In [24]:
# # Merge the original and shifted dataframe on Zip Code and y_dt columns
# acs_processed = pd.merge(acs_all[['Zip Code', 'y_dt', 'Quarterly PC1', 'Quarterly PC2', 'Quarterly PC3', 'Quarterly PC4', 'Quarterly PC5', 'Quarterly PC6', 'Quarterly PC7', 'Quarterly PC8', 'Quarterly PC9', 'Quarterly PC10']], acs_all_shifted, left_on=['Zip Code', 'y_dt'], right_on=['Zip Code', 'y_dt'])

# # Calculate changes
# for i in range(1, 11):
#     acs_processed[f'Quarterly PC{i} Change (%)'] = (acs_processed[f'Quarterly PC{i}'] - acs_processed[f'prior Quarterly PC{i}']) / acs_processed[f'prior Quarterly PC{i}']

# # Filter the output to keep only the change columns, y_dt, and Zip Code
# output_columns = [
#     'y_dt', 'Zip Code', 'Quarterly PC1 Change (%)', 'Quarterly PC2 Change (%)', 'Quarterly PC3 Change (%)', 'Quarterly PC4 Change (%)', 'Quarterly PC5 Change (%)', 'Quarterly PC6 Change (%)', 'Quarterly PC7 Change (%)', 'Quarterly PC8 Change (%)', 'Quarterly PC9 Change (%)', 'Quarterly PC10 Change (%)'
# ]


# acs_processed = acs_processed[output_columns]

# # Display the result
# acs_processed


In [20]:
acs_processed.isnull().sum()

Zip Code              0
y_dt                  0
Quarterly PC1     11568
Quarterly PC2     11568
Quarterly PC3     11568
Quarterly PC4     11568
Quarterly PC5     11568
Quarterly PC6     11568
Quarterly PC7     11568
Quarterly PC8     11568
Quarterly PC9     11568
Quarterly PC10    11568
dtype: int64

In [21]:
new_erent['Zip Code'].nunique()

464

In [22]:
acs_processed['Zip Code'].nunique()

1928

In [23]:
acs_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92544 entries, 0 to 92543
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Zip Code        92544 non-null  float64       
 1   y_dt            92544 non-null  datetime64[ns]
 2   Quarterly PC1   80976 non-null  float64       
 3   Quarterly PC2   80976 non-null  float64       
 4   Quarterly PC3   80976 non-null  float64       
 5   Quarterly PC4   80976 non-null  float64       
 6   Quarterly PC5   80976 non-null  float64       
 7   Quarterly PC6   80976 non-null  float64       
 8   Quarterly PC7   80976 non-null  float64       
 9   Quarterly PC8   80976 non-null  float64       
 10  Quarterly PC9   80976 non-null  float64       
 11  Quarterly PC10  80976 non-null  float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 8.5 MB


# Merge_data

In [24]:
merge_data = pd.merge(acs_processed,new_erent, on = ['y_dt','Zip Code'], how = 'inner')
merge_data = merge_data.dropna()

In [25]:
merge_data

Unnamed: 0,Zip Code,y_dt,Quarterly PC1,Quarterly PC2,Quarterly PC3,Quarterly PC4,Quarterly PC5,Quarterly PC6,Quarterly PC7,Quarterly PC8,Quarterly PC9,Quarterly PC10,rent_change
2,75001.0,2011-07-01,0.520106,-0.165987,-0.038863,0.001585,0.001468,0.001336,0.003275,-0.015373,-0.006313,0.008917,0.084926
3,75001.0,2011-10-01,0.520106,-0.165987,-0.038863,0.001585,0.001468,0.001336,0.003275,-0.015373,-0.006313,0.008917,0.089172
4,75001.0,2012-01-01,0.520106,-0.165987,-0.038863,0.001585,0.001468,0.001336,0.003275,-0.015373,-0.006313,0.008917,0.132696
5,75001.0,2012-04-01,0.520106,-0.165987,-0.038863,0.001585,0.001468,0.001336,0.003275,-0.015373,-0.006313,0.008917,0.072187
6,75001.0,2012-07-01,0.520106,-0.165987,-0.038863,0.001585,0.001468,0.001336,0.003275,-0.015373,-0.006313,0.008917,0.025478
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5358,76266.0,2019-04-01,-0.153958,-0.078294,-0.004069,-0.012662,-0.007738,-0.024104,0.001284,-0.000713,-0.009697,-0.020957,-0.038259
5359,76266.0,2019-07-01,-0.178773,-0.078373,-0.004216,-0.016680,-0.010453,-0.032217,0.001556,-0.001341,-0.006805,-0.028304,0.005277
5360,76266.0,2019-10-01,-0.203969,-0.078284,-0.004160,-0.012686,-0.007962,-0.021868,0.001476,-0.000827,-0.006706,-0.024057,0.083113
5361,76266.0,2020-01-01,-0.229166,-0.078195,-0.004105,-0.008693,-0.005470,-0.011518,0.001395,-0.000314,-0.006607,-0.019810,0.174142


In [26]:
features= ['Quarterly PC1', 'Quarterly PC2']

In [27]:
X = merge_data[features]
y = merge_data['rent_change']

In [28]:
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            rent_change   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     7.965
Date:                Sat, 24 Feb 2024   Prob (F-statistic):           0.000353
Time:                        14:18:25   Log-Likelihood:                 4576.7
No. Observations:                4172   AIC:                            -9147.
Df Residuals:                    4169   BIC:                            -9128.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.1023      0.009     11.378

In [29]:
merge_data.rename(columns={'y_dt':'year'},inplace=True)
year = pd.Categorical(merge_data.year)
merge_data = merge_data.set_index(['Zip Code', 'year'])
merge_data['year'] = year

In [30]:
exog_vars = features
exog = sm.add_constant(merge_data[exog_vars])
mod_ran= RandomEffects( merge_data['rent_change'], exog).fit()
print(mod_ran)

                        RandomEffects Estimation Summary                        
Dep. Variable:            rent_change   R-squared:                        0.0043
Estimator:              RandomEffects   R-squared (Between):              0.0106
No. Observations:                4172   R-squared (Within):               0.0033
Date:                Sat, Feb 24 2024   R-squared (Overall):              0.0038
Time:                        14:18:36   Log-likelihood                    4747.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      8.9986
Entities:                         117   P-value                           0.0001
Avg Obs:                       35.658   Distribution:                  F(2,4169)
Min Obs:                       18.000                                           
Max Obs:                       36.000   F-statistic (robust):             7.2743
                            

In [31]:
mod1= PanelOLS(merge_data['rent_change'], exog,entity_effects =False, time_effects = False).fit()
mod2= PanelOLS(merge_data['rent_change'], exog,entity_effects =False, time_effects = True).fit()
mod3= PanelOLS(merge_data['rent_change'], exog,entity_effects =True, time_effects = False).fit()
mod4= PanelOLS(merge_data['rent_change'], exog,entity_effects =True, time_effects = True).fit()

print(mod4)

                          PanelOLS Estimation Summary                           
Dep. Variable:            rent_change   R-squared:                        0.0062
Estimator:                   PanelOLS   R-squared (Between):             -209.45
No. Observations:                4172   R-squared (Within):              -4.3574
Date:                Sat, Feb 24 2024   R-squared (Overall):             -25.586
Time:                        14:18:40   Log-likelihood                    5343.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      12.599
Entities:                         117   P-value                           0.0000
Avg Obs:                       35.658   Distribution:                  F(2,4018)
Min Obs:                       18.000                                           
Max Obs:                       36.000   F-statistic (robust):             12.599
                            

In [32]:
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import compare

In [33]:
print(compare({'Pooled OLS': mod1,'Random Effect': mod_ran, 'Fixed Effect 1': mod2, 'Fixed Effect 2': mod3, 'Fixed Effect 3': mod4}, stars = True))

                                             Model Comparison                                            
                             Pooled OLS     Random Effect  Fixed Effect 1  Fixed Effect 2  Fixed Effect 3
---------------------------------------------------------------------------------------------------------
Dep. Variable               rent_change       rent_change     rent_change     rent_change     rent_change
Estimator                      PanelOLS     RandomEffects        PanelOLS        PanelOLS        PanelOLS
No. Observations                   4172              4172            4172            4172            4172
Cov. Est.                    Unadjusted        Unadjusted      Unadjusted      Unadjusted      Unadjusted
R-squared                        0.0038            0.0043          0.0007          0.0078          0.0062
R-Squared (Within)               0.0033            0.0033         -5.0842          0.0078         -4.3574
R-Squared (Between)              0.0107       

In [34]:
##### do hausman test to compare re vs fe
import numpy.linalg as la
from scipy import stats

def hausman(fe, re):
    b = fe.params
    B = re.params
    v_b = fe.cov
    v_B = re.cov
    df = b[np.abs(b) < 1e8].size
    chi2 = np.dot((b - B).T, la.inv(v_b -v_B).dot(b - B))
    pval = stats.chi2.sf(chi2, df)
    return chi2, df, pval


hausman_results = hausman(mod_ran, mod4) 
print('chi-Squared: ' + str(hausman_results[0]))
print('degrees of freedom: ' + str(hausman_results[1]))
print('p-Value: '+ str(hausman_results[2]))


chi-Squared: -25.46586840341226
degrees of freedom: 3
p-Value: 1.0


# Machine Learning Models

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [34]:
# zip_codes = ['75052','75217','75228','75211','75243','75040','75043','75150','75227','75115','75149','75061','75007','75216','75080','75104','75062','75006','75060','75044','75154','75220','75019','75050','75063','75051','75042','75231','75206','75224','75081','75248','75038','75229','75232','75214','75041','75208','75238','75234','75204','75089','75181','75241','75088','75212','75254','75180','75082','75048','75230','75159','75205','75146','75240','75219','75218','75039','75225','75253','75116','75134','75237','75137','75235','75236','75249','75001','75233','75201','75203','75215','75209','75223','75244','75054','75210','75182','75125','75295','75346','75388','75207','75396','75141','75226','75369','75202','75336','75353','75172','75364','75386','75251','75247','75382','75045','75246','75363','75270','75339','75354','75239','75261','75398','75390','75037','75245','75258','75286','75310','75323','75387','75334','75340','75343','75344','75123','75138','75357','75380','75011','75015','75014','75017','75016','75030','75047','75046','75049','75053','75085','75083','75099','75106','75185','75187','75221','75222','75242','75250','75260','75263','75262','75265','75264','75267','75266','75275','75283','75277','75285','75284','75303','75313','75312','75320','75315','75326','75342','75356','75355','75358','75360','75359','75367','75372','75371','75374','75373','75378','75376','75381','75389','75391','75393','75392','75395','75394','75397','75059','75002','75035','75098','75071','75070','75080','75287','75033','75025','75074','75034','75023','75072','75093','75013','75024','75069','75075','75189','75078','75252','75082','75048','75009','75409','75407','75094','75442','75454','75173','75166','75424','75097','75026','75086','75121','75164','75301','75370','75379','75485','75067','75068','75056','75007','75287','75033','75034','75028','76227','76210','76262','76226','75077','75078','76092','75010','75036','76209','76208','75022','76201','76052','76177','76205','76266','76247','75057','76207','75065','76249','76258','76272','75008','76259','76299','75027','75029','76202','76204','76203','76206','75126','75159','75160','75142','75143','75182','75114','75161','75158','75147','75157','75118','75098','75087','75189','75032','75088','75132']


In [42]:
# zip_codes = list(map(float,zip_codes))

Preparing data

In [36]:
modelling_data = merge_data

In [37]:
modelling_data = modelling_data.drop(['Quarterly PC3','Quarterly PC4', 'Quarterly PC5', 'Quarterly PC6', 'Quarterly PC7', 'Quarterly PC8', 'Quarterly PC9', 'Quarterly PC10'], axis=1)


In [38]:
modelling_data.drop(['year'],axis=1,inplace=True)

In [39]:
modelling_data.reset_index(inplace=True)

In [40]:
modelling_data

Unnamed: 0,Zip Code,year,Quarterly PC1,Quarterly PC2,rent_change
0,75001.0,2011-07-01,0.520106,-0.165987,0.084926
1,75001.0,2011-10-01,0.520106,-0.165987,0.089172
2,75001.0,2012-01-01,0.520106,-0.165987,0.132696
3,75001.0,2012-04-01,0.520106,-0.165987,0.072187
4,75001.0,2012-07-01,0.520106,-0.165987,0.025478
...,...,...,...,...,...
4167,76266.0,2019-04-01,-0.153958,-0.078294,-0.038259
4168,76266.0,2019-07-01,-0.178773,-0.078373,0.005277
4169,76266.0,2019-10-01,-0.203969,-0.078284,0.083113
4170,76266.0,2020-01-01,-0.229166,-0.078195,0.174142


In [41]:
# Convert the 'year' column to datetime
modelling_data['year'] = pd.to_datetime(modelling_data['year'])

# Perform the train-test split
unique_zip_codes = modelling_data['Zip Code'].unique()
train_zip_codes, test_zip_codes = train_test_split(unique_zip_codes, test_size=0.2, random_state=42)

train_data = modelling_data[modelling_data['Zip Code'].isin(train_zip_codes)]
test_data = modelling_data[modelling_data['Zip Code'].isin(test_zip_codes)]

train_data.to_csv('ml-model-training-data.csv')
test_data.to_csv('ml-model-testing-data.csv')

# Split the features and target variables
X_train = train_data[['Zip Code', 'year', 'Quarterly PC1', 'Quarterly PC2']]
y_train = train_data['rent_change']

X_test = test_data[['Zip Code', 'year', 'Quarterly PC1', 'Quarterly PC2']]
y_test = test_data['rent_change']


In [42]:
X_test

Unnamed: 0,Zip Code,year,Quarterly PC1,Quarterly PC2
0,75001.0,2011-07-01,0.520106,-0.165987
1,75001.0,2011-10-01,0.520106,-0.165987
2,75001.0,2012-01-01,0.520106,-0.165987
3,75001.0,2012-04-01,0.520106,-0.165987
4,75001.0,2012-07-01,0.520106,-0.165987
...,...,...,...,...
4131,76262.0,2019-04-01,-0.154263,-0.078590
4132,76262.0,2019-07-01,-0.179146,-0.078678
4133,76262.0,2019-10-01,-0.204622,-0.078603
4134,76262.0,2020-01-01,-0.230097,-0.078528


Check if any Zip Codes in X_train match Zip Codes in X_test


In [43]:

# Get the unique zip codes in X_train and X_test
train_zip_codes = set(X_train['Zip Code'])
test_zip_codes = set(X_test['Zip Code'])

# Check if any zip code in X_train is also in X_test
common_zip_codes = train_zip_codes.intersection(test_zip_codes)

# Print the common zip codes
if common_zip_codes:
    print("Common zip codes found:")
    for zip_code in common_zip_codes:
        print(zip_code)
else:
    print("No common zip codes found.")


No common zip codes found.


In [43]:
# # Convert the year column to a datetime object
# modelling_data['year'] = pd.to_datetime(modelling_data['year'])

# # Define the cutoff date for the test set
# cutoff_date = pd.to_datetime('2020-04-01')

# # Split the data into training and test sets
# X_train = modelling_data[modelling_data['year'] < cutoff_date].drop('rent_change', axis=1)
# y_train = modelling_data[modelling_data['year'] < cutoff_date]['rent_change']
# X_test = modelling_data[modelling_data['year'] >= cutoff_date].drop('rent_change', axis=1)
# y_test = modelling_data[modelling_data['year'] >= cutoff_date][['rent_change','Zip Code']]

In [44]:
# Convert the year column to a datetime object
X_train['year'] = pd.to_datetime(X_train['year'])

# Create new columns for year and quarter
X_train['year_num'] = X_train['year'].dt.year
X_train['quarter_num'] = X_train['year'].dt.quarter

# Drop the original year column
X_train = X_train.drop('year', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['year'] = pd.to_datetime(X_train['year'])


In [45]:
# Convert the year column to a datetime object
X_test['year'] = pd.to_datetime(X_test['year'])

# Create new columns for year and quarter
X_test['year_num'] = X_test['year'].dt.year
X_test['quarter_num'] = X_test['year'].dt.quarter

# Drop the original year column
X_test = X_test.drop('year', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['year'] = pd.to_datetime(X_test['year'])


# Deep Learning - LSTM

In [46]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

In [47]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [48]:
from keras.models import Sequential
model = Sequential()

In [49]:
# Split the data into train and test sets

# Reshape the input data from (3932, 4) to (3932, 1, 4)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Define the model
model = Sequential()
model.add(LSTM(units=128, return_sequences=True, input_shape=(1, X_train_scaled.shape[2])))
# model.add(Dropout(0.2))
model.add(LSTM(units=128, return_sequences=True))
# model.add(Dropout(0.1))
model.add(LSTM(units=128, return_sequences=False))
# model.add(Dropout(0.1))
# model.add(LSTM(units=50))
# model.add(Dropout(0.1))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='rmsprop', loss='mse')

# Train the model
model.fit(X_train_scaled, y_train, epochs=400, batch_size=32)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model performance using R-squared
#r2 = r2_score(y_test, y_pred)[:,0,0]
#print("R-squared on test data:", r2)


Epoch 1/400


  super().__init__(**kwargs)


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0075
Epoch 2/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0068
Epoch 3/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0071
Epoch 4/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0067
Epoch 5/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0072
Epoch 6/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0068
Epoch 7/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0067
Epoch 8/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0066
Epoch 9/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0064
Epoch 10/400
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss

In [53]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model performance using R-squared
r2 = r2_score(y_test, y_pred)

print("R-squared on test data:", r2)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step
R-squared on test data: 0.1940837778696154


In [54]:
# Make predictions on the training set
y_pred_train = model.predict(X_train_scaled)

# Evaluate the model performance using R-squared on training data
r2_train = r2_score(y_train, y_pred_train)
print("R-squared on training data:", r2_train)

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step
R-squared on training data: 0.19065389316632497


In [55]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create a pipeline with a standard scaler and an SVM regressor
svr = make_pipeline(StandardScaler(), SVR(C=1, kernel='rbf'))

# Train the model on the training data
svr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svr.predict(X_test)

# Evaluate the model performance using r2 score
r2 = r2_score(y_test, y_pred)
print("SVM R2 score:", r2)


SVM R2 score: 0.14028904755165184
