In [None]:
import pandas as pd
import numpy as np

# Show every column and every row when a frame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the raw table and discard the descriptive columns that are not used
# below. Rows containing missing values are kept on purpose: the next cell
# fills them in.
df = pd.read_csv("housedata.csv").drop(
    columns=['SizeRank', 'RegionType', 'RegionName', 'StateName', 'Metro', 'CountyName']
)

# Column labels and row count, consumed by the imputation loop.
cols = df.columns.tolist()
rows = len(df)

In [None]:
def _impute_row(prices):
  """Fill the NaN entries of one row of prices, scanning left to right.

  A missing entry takes the minimum of the non-missing values to its right.
  When nothing is observed to its right, it takes the maximum of the values
  to its left (which includes entries filled earlier in the same scan).
  When the row has no observed value at all, entries are left as NaN instead
  of being retried forever.

  Args:
    prices: 1-D sequence of numeric values, NaN marking the gaps.

  Returns:
    (filled, changed): a float ndarray copy with the gaps filled, and the list
    of integer positions that received a value.
  """
  filled = np.array(prices, dtype=float)
  changed = []
  for pos in np.flatnonzero(np.isnan(filled)):
    later = filled[pos + 1:]
    earlier = filled[:pos]
    # Test for observed values explicitly: np.nanmin/np.nanmax return NaN
    # (with a warning) on all-NaN input and raise only on empty input.
    if (~np.isnan(later)).any():
      filled[pos] = np.nanmin(later)
    elif (~np.isnan(earlier)).any():
      filled[pos] = np.nanmax(earlier)
    else:
      continue
    changed.append(int(pos))
  return filled, changed


# Identifier columns are never imputed and never feed the min/max.
id_cols = ['RegionID', 'State', 'City']
price_cols = [col for col in cols if col not in id_cols]
col_position = {col: idx for idx, col in enumerate(cols)}

# `row` is used as an index label; this relies on the default 0..n-1 index
# that read_csv produces.
for row in range(rows):
  filled, changed = _impute_row(df.loc[row, price_cols].to_numpy(dtype=float))
  if changed:
    df.loc[row, price_cols] = filled
  for pos in changed:
    idx = col_position[price_cols[pos]]
    print(f'Replaced NA at row {row} and col {idx} with value {filled[pos]}')
  if np.isnan(filled).any():
    print(f'Row {row} has no observed prices; left as NA')

In [None]:
# Preview the first rows of df after the missing-value fill above.
df.head()

In [None]:
# Reshape wide -> long: the first three columns are kept as identifiers and
# every remaining column becomes one (Date, HousePrice) row.
all_columns = df.columns.tolist()
unpivoted_df = pd.melt(
    df,
    id_vars=all_columns[:3],
    value_vars=all_columns[3:],
    var_name='Date',
    value_name='HousePrice',
)

In [None]:
# Preview the long-format table (one row per identifier set and Date).
unpivoted_df.head()

In [None]:
# Persist the long-format table. index=False keeps the meaningless 0..n-1 row
# index out of the file, so it does not come back as an extra unnamed column
# when the CSV is read again.
unpivoted_df.to_csv("meted_df.csv", index=False)

In [None]:
import pandas as pd
import numpy as np

# Reload the long-format table with Date as a parsed datetime index.
unpivoted_df = pd.read_csv(
    "meted_df.csv",
    index_col='Date',
    parse_dates=True,
)
unpivoted_df.head()

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose 

# Identifier of the region whose price history is modelled.
# NOTE(review): the name says "zip code" but the filter below matches on the
# RegionID column -- confirm both refer to the same identifier in this dataset.
interested_zip_code = 91733

# Rows for that region, plus a one-column price frame that the stationarity
# checks further down operate on.
region_mask = unpivoted_df['RegionID'].eq(interested_zip_code)
arima_df = unpivoted_df.loc[region_mask]
arima_df_houseprices = arima_df[["HousePrice"]]

from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMAX(0, 1, 1)x(2, 1, 1, 12) fitted on the complete series for the region.
model = SARIMAX(
    arima_df['HousePrice'],
    order=(0, 1, 1),
    seasonal_order=(2, 1, 1, 12),
)

result = model.fit()
result.summary()

In [None]:
# Hold out the final 12 observations as the test window.
split_at = len(arima_df) - 12
arima_df_train = arima_df.iloc[:split_at]
arima_df_test = arima_df.iloc[split_at:]

from statsmodels.tsa.statespace.sarimax import SARIMAX

# Same SARIMAX(0, 1, 1)x(2, 1, 1, 12) specification, fitted on the training
# rows only.
model_test = SARIMAX(
    arima_df_train['HousePrice'],
    order=(0, 1, 1),
    seasonal_order=(2, 1, 1, 12),
)

result_test = model_test.fit()
result_test.summary()

In [None]:
# Integer positions of the first and last held-out observation, counted from
# the start of the training series.
start = len(arima_df_train)
end = start + len(arima_df_test) - 1

# Predictions covering the held-out window.
# NOTE(review): `typ` is carried over unchanged; confirm that
# SARIMAXResults.predict actually honours it.
predictions_test = result_test.predict(start, end, typ='levels')
predictions_test = predictions_test.rename("Predictions")

# Predicted vs. actual prices on the same axes.
ax = predictions_test.plot(legend=True)
arima_df_test['HousePrice'].plot(ax=ax, legend=True)

In [None]:
# Number of steps to forecast past the last observation.
months_lookforward = 12

# Positions immediately after the end of the full series.
start = len(arima_df)
end = start + months_lookforward - 1

# Out-of-sample forecast from the model fitted on the full series.
predictions = result.predict(start, end)

In [None]:
# Observed prices and the forecast drawn on shared axes.
ax = arima_df['HousePrice'].plot(figsize=(12, 5), legend=True)
predictions.plot(ax=ax, legend=True)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw price series.
# The outcome is stored under its own name: binding it to `result` would
# overwrite the fitted SARIMAX results held in `result` and break any re-run
# of the forecasting cell.
adf_result = adfuller(arima_df_houseprices['HousePrice'])
print('ADF Statistic:', adf_result[0])
print('p-value:', adf_result[1])

In [None]:
# Preview the single-column HousePrice frame for the selected RegionID.
arima_df_houseprices.head()

In [None]:
# Show the dtype of each column in the price frame.
arima_df_houseprices.dtypes

In [None]:
# A p-value above 0.05 on the raw series means non-stationarity cannot be
# rejected, so take first differences (the leading NaN row is dropped).
arima_df_diff = arima_df_houseprices.diff().dropna()

# Repeat the ADF test on the differenced series. A dedicated name is used so
# the fitted SARIMAX results held in `result` are not overwritten.
adf_result_diff = adfuller(arima_df_diff['HousePrice'])
print('ADF Statistic:', adf_result_diff[0])
print('p-value:', adf_result_diff[1])

In [None]:
import matplotlib.pyplot as plt

# Line chart of the first-differenced price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(arima_df_diff)
ax.set_title('Differenced Monthly House Prices')
ax.set_xlabel('Date')
ax.set_ylabel('House Price')
plt.show()