#### CLEANING DATA

In [None]:
import numpy as np # Data cleaning
import matplotlib.pyplot as plt #data plotting
import pandas as pd # Data ingest
from sklearn.preprocessing import MinMaxScaler # Data preprocessing

In [None]:
# Load the raw table from VAN.csv.
# `date_parser` expects a callable and is only consulted together with
# `parse_dates`; passing `True` had no effect and the argument is deprecated
# since pandas 2.0, so it is dropped. No date parsing is requested here.
data = pd.read_csv('VAN.csv')
data.tail()

In [None]:
# Training split: keep the rows whose REF_DATE compares below '2020-10'.
data_training = data.loc[data['REF_DATE'].lt('2020-10')]
data_training

In [None]:
# Test split: an independent copy of the rows whose REF_DATE compares below '2020-09'.
# NOTE(review): every row selected here also satisfies the training filter
# (REF_DATE < '2020-10'), so the model is evaluated on data it was trained on.
# A held-out split would presumably select rows at or after the training
# cutoff — confirm against the date range actually present in VAN.csv.
data_test = data.loc[data['REF_DATE'].lt('2020-09')].copy()
data_test

In [None]:
# Remove the REF_DATE and GEO columns so only the remaining column(s) feed the model
# (presumably just VALUE — verify against the CSV header).
training_data = data_training.drop(columns=['REF_DATE', 'GEO'])
training_data.head()  # columns that go into the final prediction

In [None]:
# Fit a min-max scaler on the training columns, then replace the DataFrame
# with its scaled NumPy representation.
scaler = MinMaxScaler()
scaler.fit(training_data)
training_data = scaler.transform(training_data)
training_data

In [None]:
# Empty accumulators for the training windows and their targets.
X_train, y_train = [], []
len(training_data)  # row count, to compare against VAN.csv

In [None]:
# Sliding window over the scaled training rows: each sample is the 100 rows
# immediately before position `stop`, and its target is column 0 of row `stop`.
for stop in range(100, training_data.shape[0]):
    start = stop - 100
    X_train.append(training_data[start:stop])
    y_train.append(training_data[stop, 0])

In [None]:
# Convert the accumulated Python lists into NumPy arrays for Keras.
X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
# Show the shapes of the training inputs and targets as a sanity check.
X_train.shape, y_train.shape

#### BUILDING LSTM

In [None]:
from tensorflow.keras import Sequential # Groups linear data into layers
from tensorflow.keras.layers import Dense, LSTM, Dropout # Neural network layers, linear regression and overfitting stepout

In [None]:
# Stacked-LSTM regression network (an LSTM model, not a linear regression):
# four LSTM layers, each followed by 20% dropout, ending in a single-unit
# Dense layer that outputs the predicted (scaled) value.
regressior = Sequential()

# The input shape (timesteps, features) is taken from X_train rather than
# hard-coding one feature, so the model still builds if more columns are kept.
regressior.add(LSTM(units=60, activation='relu', return_sequences=True,
                    input_shape=X_train.shape[1:]))
regressior.add(Dropout(0.2))

regressior.add(LSTM(units=60, activation='relu', return_sequences=True))
regressior.add(Dropout(0.2))

regressior.add(LSTM(units=80, activation='relu', return_sequences=True))
regressior.add(Dropout(0.2))

# Last recurrent layer returns only its final output (no return_sequences),
# which is what the Dense head consumes.
regressior.add(LSTM(units=120, activation='relu'))
regressior.add(Dropout(0.2))

regressior.add(Dense(units=1))

In [None]:
# Print the layer-by-layer summary of the model.
regressior.summary()

In [None]:
# Configure training: mean-squared-error loss minimised with the Adam optimizer.
regressior.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Train on the windowed training set for 50 epochs in mini-batches of 32.
regressior.fit(x=X_train, y=y_train, batch_size=32, epochs=50)

#### PREPARE TEST DATASET

In [None]:
# Preview the first rows of the test split.
data_test.head()

In [None]:
# Inspect the last 100 training rows; they seed the first test windows.
data_training.tail(100)

In [None]:
# Keep the final 100 training rows: the first test window needs 100 rows of history.
past_100_months = data_training.iloc[-100:]

In [None]:
# Prepend the last 100 training rows to the test rows so every test row has a
# full 100-row history window. `DataFrame.append` was removed in pandas 2.0,
# so `pd.concat` is used; `ignore_index=True` renumbers the rows from 0.
df = pd.concat([past_100_months, data_test], ignore_index=True)
# Drop the same non-feature columns that were dropped from the training data.
df = df.drop(['REF_DATE', 'GEO'], axis=1)
df.head()

In [None]:
# Scale the test frame with the already-fitted scaler (transform only, no
# re-fit), so test values use the same scaling as the training data.
inputs = scaler.transform(df)
inputs

In [None]:
# Build the test samples the same way as the training ones: each sample is
# the 100 scaled rows before position `stop`; the target is column 0 of row `stop`.
X_test, y_test = [], []

for stop in range(100, inputs.shape[0]):
    start = stop - 100
    X_test.append(inputs[start:stop])
    y_test.append(inputs[stop, 0])

In [None]:
# Convert the test windows and targets to NumPy arrays, then show their shapes.
X_test = np.array(X_test)
y_test = np.array(y_test)
X_test.shape, y_test.shape

In [None]:
# Run the trained model over every test window.
y_pred = regressior.predict(X_test)

In [None]:
y_pred  # still in the scaler's units; inverse scaling is needed to get index values

In [None]:
# Per-column multiplicative factor learned by the fitted MinMaxScaler.
scaler.scale_

In [None]:
# Reciprocal of the fitted scale factor for column 0 (the target column).
# Read from the scaler instead of hand-copying a printed value, so it stays
# correct if the data or the split changes.
scale = 1 / scaler.scale_[0]
scale

In [None]:
# Undo the min-max scaling for column 0 (the target column).
# MinMaxScaler computes X_scaled = X * scale_ + min_, so the inverse is
# (X_scaled - min_) / scale_. Multiplying by 1/scale_ alone leaves the values
# shifted by the training minimum instead of restoring the real index.
y_pred = (y_pred - scaler.min_[0]) / scaler.scale_[0]
y_test = (y_test - scaler.min_[0]) / scaler.scale_[0]

In [None]:
# Display the rescaled ground-truth targets.
y_test

#### MY ATTEMPT AT VISUALIZING THE DATA

In [None]:
# Plot the actual and predicted series on one wide axes for visual comparison.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(y_test, color='red', label="Real VAN Housing Index")
ax.plot(y_pred, color='blue', label="Predicted VAN Housing Index")
ax.set_title('Housing index prediction (Scaled Down)')
ax.set_xlabel('Time (months)')
ax.set_ylabel('Index Value')
ax.legend()
plt.show()