## DATA CLEANING


In [None]:
import pandas as pd

# Load the raw dataset
data = pd.read_csv('B.csv')

# Handle missing values by forward-filling: each gap takes the most recent
# value above it in the same column. `fillna(method='ffill')` is deprecated in
# recent pandas releases; `ffill()` is the supported spelling of the same
# operation. Gaps at the very top of a column have nothing to copy from and
# remain NaN.
data.ffill(inplace=True)

# Standardize date format: parse 'Date' into datetime64 values
data['Date'] = pd.to_datetime(data['Date'])

# Check data structure (dtypes and non-null counts). `info()` writes its report
# to stdout itself and returns None, so wrapping it in print() only appended a
# stray "None" line.
data.info()

## Correlation Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix
# Only numeric columns are correlated. 'Date' holds datetime values after the
# cleaning step, and since pandas 2.0 `DataFrame.corr()` no longer drops
# non-numeric columns by default, so passing the full frame can fail or put a
# meaningless 'Date' row in the heatmap.
numeric_data = data.select_dtypes(include='number')

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f', square=True)
plt.title('Correlation Heatmap')
plt.show()

## ALL RELEVANT GRAPHS

In [None]:
# Time series of HPI, CPI and GDHI per Head on one shared axis.
# Each series is labelled with its column name and gets its own marker.
fig, ax = plt.subplots(figsize=(12, 6))
for column, marker in (('HPI', 'o'), ('CPI', 'x'), ('GDHI per Head', 's')):
    ax.plot(data['Date'], data[column], label=column, marker=marker)
ax.set_xlabel('Date')
ax.set_ylabel('Values')
ax.set_title('Trends Over Time')
ax.legend()
ax.grid()
plt.show()

# Separate graphs: HPI and one independent variable, both plotted against Date
independent_variables = ['CPI', 'GDHI per Head', '95% LTV Rate', '90% LTV Rate', '75% LTV Rate', '60% LTV Rate', '85% LTV Rate']
for variable in independent_variables:
    plt.figure(figsize=(8, 6))
    # HPI is the solid blue line, the compared variable the dashed red line.
    # The dash pattern is passed as `linestyle`; `markers` is seaborn's
    # per-style marker mapping and does not accept a line style such as '--'.
    sns.lineplot(x=data['Date'], y=data['HPI'], color='blue', label='HPI', errorbar=None)
    sns.lineplot(x=data['Date'], y=data[variable], color='red', linestyle='--', label=variable, errorbar=None)
    plt.title(f'HPI vs {variable}')
    # The x-axis is time and the y-axis carries both series, so label them
    # accordingly and rely on the legend to tell the two lines apart.
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.grid()
    plt.show()


## Time Series Line Plot

In [None]:
# Trend lines for the three headline series; each entry maps a column name to
# the matplotlib styling used for its line.
series_styles = {
    'HPI': {'marker': 'o'},
    'CPI': {'linestyle': '--'},
    'GDHI per Head': {'linestyle': '-.'},
}

fig, ax = plt.subplots(figsize=(12, 6))
for column, style in series_styles.items():
    ax.plot(data['Date'], data[column], label=column, **style)
ax.set_xlabel('Year')
ax.set_ylabel('Values')
ax.set_title('HPI, CPI, and GDHI Trends (2015-2019)')
ax.legend()
ax.grid()
plt.show()

## Pair Plot

In [None]:
# Pairwise scatter/histogram grid for the key variables
pair_columns = ['HPI', 'CPI', 'GDHI per Head', '95% LTV Rate', '90% LTV Rate']
sns.pairplot(data[pair_columns])
# y slightly above 1 keeps the title clear of the top row of panels
plt.suptitle('Pair Plot of Key Variables', y=1.02)
plt.show()

## Scatter Plot with Regression Line

In [None]:
from scipy.stats import linregress

# HPI against CPI: scatter points with seaborn's fitted regression line
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x='CPI', y='HPI', data=data, ax=ax)
ax.set_title('HPI vs CPI')
ax.set_xlabel('CPI')
ax.set_ylabel('HPI')
ax.grid()
plt.show()


## Saving Cleaned Dataset

In [None]:
# Write the cleaned frame to disk; index=False leaves the row index out of the CSV
output_path = 'cleaned_B.csv'
data.to_csv(output_path, index=False)
print("Cleaned dataset saved!")

## Regression Analysis: Building and Interpreting the Model

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the cleaned dataset saved by the "Saving Cleaned Dataset" step.
# Reloading the raw 'B.csv' here would throw away the forward-fill applied
# during cleaning and hand any missing values straight to OLS.
# parse_dates restores 'Date' as datetime, since CSV stores it as text.
data = pd.read_csv('cleaned_B.csv', parse_dates=['Date'])

# Define dependent and independent variables
# Adjust column names as per your dataset
X = data[['CPI', 'GDHI per Head', '95% LTV Rate', '90% LTV Rate']]
y = data['HPI']

# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit ordinary least squares of HPI on the selected regressors
model = sm.OLS(y, X).fit()

# Print model summary (coefficients, standard errors, fit statistics)
print(model.summary())


## Prediction Using the Model

In [None]:
# Hypothetical new data for prediction; one row per scenario.
# Columns are listed in the same order as the regressors used to fit the model.
future_data = pd.DataFrame({
    'CPI': [2.5, 3.0, 3.5],
    'GDHI per Head': [25000, 26000, 27000],
    '95% LTV Rate': [4.5, 4.8, 5.0],
    '90% LTV Rate': [3.8, 4.0, 4.2]
})

# Add the intercept column. has_constant='add' forces it to be added: the
# default ('skip') leaves it out whenever the frame already contains a
# constant-valued column (e.g. a single scenario row, or a regressor held
# fixed across scenarios), which would misalign the columns with the model.
future_data = sm.add_constant(future_data, has_constant='add')

# Predict HPI for each scenario
predicted_hpi = model.predict(future_data)
print("Predicted HPI for future scenarios:")
print(predicted_hpi)


## Regression graphs and visualizations

In [None]:
# Actual HPI on the x-axis against the model's in-sample fitted values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y, model.fittedvalues, alpha=0.7, label='Predicted HPI')
# Plotting y against itself draws the identity line; points on it are perfect fits
ax.plot(y, y, color='red', linestyle='--', label='Perfect Fit')
ax.set_title('Actual vs Predicted HPI')
ax.set_xlabel('Actual HPI')
ax.set_ylabel('Predicted HPI')
ax.legend()
ax.grid()
plt.show()


In [None]:
# Tabulate the fitted parameters next to their regressor names
# (the intercept 'const' is included because X carries the constant column)
coefficients = pd.DataFrame({
    'Feature': list(X.columns),
    'Coefficient': model.params,
})

# Horizontal bar chart: one bar per regressor, bar length = coefficient value
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='Coefficient', y='Feature', data=coefficients, palette='coolwarm', ax=ax)
ax.set_title('Regression Coefficients')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.grid()
plt.show()


In [None]:
# HPI vs CPI scatter with a red fitted line; ci=None hides the confidence band
ax = sns.regplot(x='CPI', y='HPI', data=data, ci=None, line_kws={'color': 'red'})
ax.set_title('Regression Line: HPI vs CPI')
ax.set_xlabel('CPI')
ax.set_ylabel('HPI')
ax.grid()
plt.show()


In [None]:
import statsmodels.api as sm

# QQ plot of the OLS residuals against the normal distribution.
# `residuals` was never defined in the notebook; take it from the fitted model.
residuals = model.resid

# fit=True standardizes the sample (fitted loc/scale) before computing
# quantiles, so the 45-degree reference line is a meaningful benchmark;
# raw residuals are in HPI units and would not lie along y = x even if normal.
sm.qqplot(residuals, line='45', fit=True)
plt.title('QQ Plot of Residuals')
plt.grid()
plt.show()
