1. IMPORTING ALL THE NECESSARY LIBRARIES

In [1]:
import seaborn as sns

In [2]:
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [6]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, confusion_matrix, roc_curve, roc_auc_score, f1_score, mean_squared_error, r2_score

In [7]:
from matplotlib.colors import ListedColormap

In [8]:
import pandas as pd
# Readable labels applied to the columns of the reshaped economic-indicators table.
new_column_names = [
    'Date',
    'GDP',
    'Inflation',
    'Unemployment',
    'Population',
    'Net migration',
    'Forest area',
]

# Load the three raw CSV inputs; DATE is parsed while reading for the two files that have it.
df_HPI = pd.read_csv("CSUSHPISA.csv", parse_dates=['DATE'])
df_eco = pd.read_csv("Economic_indicators.csv", header=None)
df_gov = pd.read_csv("Gov_Spending.csv", parse_dates=['DATE'])

# Economic_indicators.csv is read without a header and flipped so its rows become columns.
# The first four transposed rows are discarded and the index is renumbered from 0
# before the readable column labels are attached.
df_eco = df_eco.transpose().iloc[4:].reset_index(drop=True)
df_eco.columns = new_column_names

FileNotFoundError: ignored

In [None]:
# Keep a copy of the 'Date' values in a separate 'Year' column.
df_eco['Year'] = df_eco.loc[:, 'Date']


In [None]:
# Preview the first 20 rows of the economic-indicators frame.
df_eco.head(20)

In [None]:
# Preview the last 20 rows of the frame loaded from CSUSHPISA.csv.
df_HPI.tail(20)

In [None]:
# Preview the first 20 rows of the frame loaded from Gov_Spending.csv.
df_gov.head(20)

In [None]:
# Make sure DATE is a proper datetime column; values that cannot be parsed become NaT.
# The earlier version re-formatted the dates as '%Y-%d-%m' strings, which writes the day
# before the month. When such strings are parsed again later in the notebook, a date like
# 1 February reads back as 2 January and no longer matches the DATE values of the other
# datasets, so the column is now kept as datetime instead of being turned into text.
df_gov['DATE'] = pd.to_datetime(df_gov['DATE'], errors='coerce')

In [None]:
# Inspect the last 20 rows of df_gov after the DATE conversion.
df_gov.tail(20)

In [None]:
# Drop the rows at positions 229 through 240 so that df_gov has the same number of rows
# as the other data.
df_gov = df_gov.drop([df_gov.index[position] for position in range(229, 241)])

In [None]:
# Inspect the last 20 rows again to confirm the trailing rows are gone.
df_gov.tail(20)

In [None]:
# Show the dtype of every df_gov column.
df_gov.dtypes

In [None]:
# Give the spending column a shorter, more readable name.
df_gov = df_gov.rename(
    {'TLRESCONS (Millions of Dollars,\nSeasonally Adjusted Annual Rate)': 'Spent_on_supplies'},
    axis='columns',
)

In [None]:
# Show the df_gov dtypes after the rename.
df_gov.dtypes

In [None]:
# Show the dtype of every df_eco column.
df_eco.dtypes

In [None]:
# Preview the first rows of df_eco.
df_eco.head()

In [None]:
# Convert 'Date' column to datetime format, then store it back as formatted text.
# NOTE(review): the '%Y-%d-%m' pattern writes the day before the month — confirm this is intended.
# NOTE(review): unit='D' with origin='unix' interprets numeric input as a count of days from
# the Unix epoch; verify that this matches what 'Date' actually holds.
df_eco['Date'] = pd.to_datetime(df_eco['Date'], errors='coerce', origin='unix', unit='D').dt.strftime('%Y-%d-%m')
# Show the last 30 rows to inspect the converted values.
df_eco.tail(30)

In [None]:
# Remove the 'Date' column (in place) because it is difficult to parse correctly.
df_eco.drop(columns=['Date'], inplace=True)
df_eco.head()

In [None]:
#Adding date values manually corresponding to the date values in the original dataset

In [None]:
# Build one date string per year (1 January of 2002 through 2021) to stand in for the
# dropped 'Date' column. ISO '%Y-%m-%d' is used so month and day cannot be confused when
# the strings are parsed; the former '%Y-%d-%m' pattern only produced correct-looking
# values because both fields happen to be 1.
date_values = [pd.Timestamp(year, 1, 1).strftime('%Y-%m-%d') for year in range(2002, 2022)]

# Print the first few date values
print(date_values[:10])

# Add a new 'DATE' column to the DataFrame. Assigning a list requires exactly one value
# per row, so df_eco must have 20 rows at this point.
df_eco['DATE'] = date_values

# Print the last 20 rows of the updated DataFrame
print(df_eco.tail(20))


In [None]:
# Parse the DATE strings into datetime values. An explicit format replaces
# infer_datetime_format, which pandas deprecated in 2.0 and which now only triggers a warning.
df_eco["DATE"] = pd.to_datetime(df_eco["DATE"], format='%Y-%m-%d')

In [None]:
#Converting to datetime format

In [None]:
# Check the column dtypes now that DATE has been parsed.
df_eco.dtypes

In [None]:
# Preview the first rows of df_eco.
df_eco.head()

In [None]:
# Names of the economic indicator columns.
economic_indicators = [
    'GDP',
    'Inflation',
    'Unemployment',
    'Population',
    'Net migration',
    'Forest area',
]
# Make DATE the index (in place) in preparation for resampling the data to match the
# varying frequencies in the other two datasets.
df_eco.set_index(keys='DATE', inplace=True)

In [None]:
# Resample to month-start ("MS") frequency, forward-filling each value into the new
# monthly rows, and tag every column name with "_ffill".
df_eco = (
    df_eco
    .resample("MS")
    .ffill()
    .add_suffix("_ffill")
)


In [None]:
# Preview the first rows of the resampled frame.
df_eco.head()

In [None]:
# Preview the last rows of the resampled frame.
df_eco.tail()

In [None]:
# Summarise the resampled frame's index, columns, dtypes and non-null counts.
df_eco.info()

In [None]:
# Move DATE out of the index and back into a regular column (in place).
df_eco.reset_index(inplace=True)

In [None]:
# Display the DATE column. The result of this extra reset_index() is not assigned,
# so df_eco itself is not modified by this line.
df_eco.reset_index()['DATE']


In [None]:
# Ensure both frames hold DATE as datetime values before they are merged on that column.
df_gov = df_gov.assign(DATE=pd.to_datetime(df_gov['DATE']))
df_HPI = df_HPI.assign(DATE=pd.to_datetime(df_HPI['DATE']))

In [None]:
# Outer-join the two frames on DATE so that dates present in either one are kept.
df = df_HPI.merge(df_gov, how="outer", on='DATE')

In [None]:
# Preview the first 20 rows of the merged frame.
df.head(20)

In [None]:
# Join the economic indicators onto the merged frame; the (default) inner join keeps
# only the dates that appear in both.
df = df_eco.merge(df, how="inner", on="DATE")

In [None]:
# Preview the first 20 rows of the combined frame.
df.head(20)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Replace missing CSUSHPISA values with the column mean.
df = df.fillna({"CSUSHPISA": df["CSUSHPISA"].mean()})

In [None]:
# Re-count the missing values per column after filling CSUSHPISA.
df.isnull().sum()

In [None]:
# Replace missing "Net migration_ffill" values with the column mean.
df = df.fillna({"Net migration_ffill": df["Net migration_ffill"].mean()})

In [None]:
# Replace missing "Forest area_ffill" values with the column mean.
df = df.fillna({"Forest area_ffill": df["Forest area_ffill"].mean()})

In [None]:
# Count the missing values that remain in each column.
df.isnull().sum()

In [None]:
# Preview the first 20 rows after imputation.
df.head(20)

In [None]:
# Box plot of the GDP column.
# NOTE(review): the series is passed positionally; which parameter that binds to depends on
# the installed seaborn version — confirm the plot orientation is as intended.
sns.boxplot(df['GDP_ffill'])

In [None]:
# Box plot of the inflation column.
sns.boxplot(df['Inflation_ffill'])

In [None]:
# Box plot of the unemployment column.
sns.boxplot(df["Unemployment_ffill"])

In [None]:
# Box plot of the Spent_on_supplies column.
sns.boxplot(df["Spent_on_supplies"])

In [None]:
# Box plot of the CSUSHPISA column.
sns.boxplot(df["CSUSHPISA"])

In [None]:
# Bar chart of CSUSHPISA for each value of Year_ffill.
plt.figure(figsize=(12, 6))
sns.barplot(x='Year_ffill', y='CSUSHPISA', data=df)
plt.gca().set(xlabel="Year", ylabel='Housing Prices Index')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar chart of CSUSHPISA for each value of Net migration_ffill.
plt.figure(figsize=(12, 6))
sns.barplot(x='Net migration_ffill', y='CSUSHPISA', data=df)
plt.gca().set(xlabel="Net Migration of People", ylabel='Housing Prices Index')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar chart of Spent_on_supplies for each value of Year_ffill.
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='Year_ffill', y='Spent_on_supplies')
# The x axis shows Year_ffill; it was previously mislabelled as the housing price index.
plt.xlabel('Year')
plt.ylabel("Money Spent by gov on house building supplies")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar chart of Spent_on_supplies for each value of CSUSHPISA.
plt.figure(figsize=(12, 6))
sns.barplot(x='CSUSHPISA', y='Spent_on_supplies', data=df)
plt.gca().set(
    xlabel='Housing Prices Index',
    ylabel="Money Spent by gov on house building supplies",
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only. Recent pandas versions no longer
# drop non-numeric columns automatically, so calling corr() on the whole frame (which still
# contains the datetime DATE column here) fails; selecting the numeric columns first gives
# the same result on old and new pandas versions.
correlation_matrix = df.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of All Columns')
plt.show()


In [None]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(data = df)

In [None]:
# Remove the Year_ffill column from df (in place).
df.drop(columns=['Year_ffill'], inplace=True)

In [None]:
# Display the frame after dropping Year_ffill.
df

In [None]:
# Derive calendar year and month columns from DATE.
df = df.assign(
    year=df['DATE'].dt.year,
    month=df['DATE'].dt.month,
)

In [None]:
# Replace the DATE column with the result of pd.to_numeric.
# NOTE(review): presumably done so DATE can be passed to the models as a plain number — confirm.
# The year/month columns are derived from DATE via .dt beforehand, so this cell has to run
# after that one.
df['DATE'] = pd.to_numeric(df['DATE'])


In [None]:
# BASE MODEL BUILDING: ordinary least-squares linear regression.

# The target is CSUSHPISA; every remaining column of df is used as a feature.
y = df['CSUSHPISA']
X = df.drop(columns=['CSUSHPISA'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# fit() returns the fitted estimator, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Scatter of predicted against actual values.
plt.scatter(x=y_test, y=y_pred)
plt.title('Actual vs. Predicted HPI')
plt.xlabel('Actual HPI')
plt.ylabel('Predicted HPI')
plt.show()

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with default settings, trained on the existing train/test split.
# fit() returns the fitted estimator, so construction and training are chained.
model = Lasso().fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Scatter of predicted against actual values.
plt.scatter(x=y_test, y=y_pred)
plt.title('Actual vs. Predicted HPI')
plt.xlabel('Actual HPI')
plt.ylabel('Predicted HPI')
plt.show()

In [None]:

# Random forest regressor trained on the existing train/test split.
# The forest is built with randomness, so the seed is fixed (same value as the train/test
# split) to make the reported scores reproducible from run to run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the predicted vs. actual values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual HPI')
plt.ylabel('Predicted HPI')
plt.title('Actual vs. Predicted HPI')
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees trained on the existing train/test split.
# The seed is fixed (same value as the train/test split) so that repeated runs of the
# notebook report the same scores.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the predicted vs. actual values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual HPI')
plt.ylabel('Predicted HPI')
plt.title('Actual vs. Predicted HPI')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings, trained on the existing split.
# fit() returns the fitted estimator, so construction and training are chained.
model = KNeighborsRegressor().fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Scatter of predicted against actual values.
plt.scatter(x=y_test, y=y_pred)
plt.title('Actual vs. Predicted HPI')
plt.xlabel('Actual HPI')
plt.ylabel('Predicted HPI')
plt.show()