In [None]:
#Importing Relevant Packages
##E1 - List the packages or libraries you have chosen for Python or R and justify how each item on the list supports the analysis.
import os #Used to let the dataset location be overridden through an environment variable

import matplotlib.pyplot as plt #Used for Data Visualizations
import matplotlib.ticker as mticker #Used to properly scale the axes in dataplots
import numpy as np
import pandas as pd
import seaborn as sns #Used for Data Visualizations
import statsmodels.api as sm #Used to create Ordinary Least Squares (OLS) Regression Model
from sklearn.metrics import mean_squared_error #For calculating MSE
from sklearn.model_selection import train_test_split #Used to split the datasets
from sklearn.preprocessing import MinMaxScaler #Used for diagnostic test of VIF, scaling the dataframe to solve for Multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor #Used to check for multicollinearity in the data models

In [None]:
#Section marker: a bare string literal with no effect on the analysis (as the last expression of the cell it is echoed as output)
''' Part 1: Importing file and cleaning the dataset '''

In [None]:
#Loading the dataset and making a copy to retain original dataset separately
#The location can be overridden by setting the HOUSING_DATA_PATH environment variable, so the notebook
#can run on another machine without editing this cell; the original local path remains the default.
file_path = os.environ.get(
    'HOUSING_DATA_PATH',
    'C:/Users/bconn/OneDrive/Documents/WGUCoursework/7 - D600 - Statistical Data Mining/housing_information_dataset.csv',
)
data = pd.read_csv(file_path)
#df is the working copy that the cleaning steps modify; data keeps the file contents as loaded
df = data.copy()

In [None]:
#Performing some initial profiling for reference
#A bare expression on the last line of a cell is rendered as that cell's output
df

In [None]:
#Summarising the columns, their dtypes and non-null counts before cleaning
df.info()
## No null values noted

In [None]:
##All data cleaning steps
#Renaming headers to python conventions
python_headers = ['id', 'price', 'square_footage', 'num_bathrooms', 'num_bedrooms', 'backyard_space', 'crime_rate', 'school_rating', 'age_of_home', 'distance_to_city_center', 'employment_rate', 'property_tax_rate', 'renovation_quality', 'local_amenities', 'transport_access', 'fireplace', 'house_color', 'garage', 'floors', 'windows', 'previous_sale_price', 'is_luxury']
df.columns = python_headers
#Rounding price and previous_sale_price to the nearest dollar and saving as int64
#round() with no decimals is needed here: round(1) only rounds to the nearest ten cents, and the int64
#cast then drops the remaining fraction (e.g. 1999.6 was stored as 1999 instead of 2000)
for dollar_column in ['price', 'previous_sale_price']:
    df[dollar_column] = df[dollar_column].round().astype('int64')
#Rounding bathroom to nearest half
def round_to_half(x):
    """Return x rounded to the nearest multiple of 0.5.

    The value is doubled, rounded to a whole number with the built-in round()
    (which sends exact ties to the even integer) and then halved again.
    """
    doubled = x * 2
    return round(doubled) / 2
#Applying the half-step rounding to every bathroom value
df['num_bathrooms'] = list(map(round_to_half, df['num_bathrooms']))
#Negative values for windows and previous_sale_price are impossible, so their magnitudes are kept instead
for non_negative_column in ('windows', 'previous_sale_price'):
    df[non_negative_column] = df[non_negative_column].abs()
#Remapping all Yes/No values to 1 and 0; anything not found in the map is filled back in with its original value
bool_map = {"Yes" : 1, "No" : 0}
for yes_no_column in ('fireplace', 'garage'):
    df[yes_no_column] = df[yes_no_column].map(bool_map).fillna(df[yes_no_column])

In [None]:
#Confirming data cleaning steps worked
df.info()
## Header format confirmed; fireplace and garage are now stored as 1 and 0 (mapped from Yes/No) rather than as text

In [None]:
''' Part 2: Variable Description and Statistics Visualizations '''
#Utilizing price as the dependent variable, and square_footage, num_bedrooms, crime_rate, fireplace, house_color and garage as independent variables
##C1 - Identify the dependent and all independent variables that are required to answer the research question and justify your selection of variables.

In [None]:
#Variable Descriptions
## C2 - Describe the dependent variable and all independent variables from part C1 using descriptive statistics (counts, means, modes, ranges, min/max), including a screenshot of the descriptive statistics output for each of these variables.
print('Descriptive Statistics\n')

#Quantitative variables: count, mean, standard deviation and the five-number summary
print('\nQuantitative Variables\n')
print('Price')
price_summary = df['price'].describe()
#Price is printed in positional notation because scientific notation was harder to read
print(price_summary.apply(
    lambda value: np.format_float_positional(value, precision=2, trim='-')))
for heading, column in (('\nSquare Footage', 'square_footage'),
                        ('\nNumber of Bedrooms', 'num_bedrooms'),
                        ('\nCrime Rate', 'crime_rate')):
    print(heading)
    print(df[column].describe())

#Qualitative variables: frequencies in descending order, separated by a blank block
print('\n\nQualitative Variables\n')
for position, column in enumerate(('fireplace', 'house_color', 'garage')):
    if position > 0:
        print('\n')
    print(df[column].value_counts())

In [None]:
#Visualizations
## C3 - Generate univariate and bivariate visualizations of the distributions of the dependent and independent variables from part C1, including the dependent variable in the bivariate visualizations.

#Price - Dependent Variable
##Univariate Analysis
fig, ax = plt.subplots(figsize=(16, 5))
ax.set_title("Price of Houses")
#Bin edges every 50,000 starting at 85,000
price_bins = np.arange(85000, 1050000, 50000)
ax.hist(data=df, x='price', bins=price_bins, edgecolor='black', density=False)
#Showing prices with thousands separators instead of scientific notation
ax.xaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
ax.set_xlabel("Value of Houses")
ax.set_ylabel("Frequency");

In [None]:
#Square Footage - Independent Variable
##Univariate and Bivariate Analysis
fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle("Exploration of Square Footage of the Household")

#Left Plot: Univariate exploration of square_footage (bin edges every 125 sq ft starting at 500)
hist_ax.set_title("Square Footage")
hist_ax.hist(data=df, x='square_footage', bins=np.arange(500, 2900, 125), edgecolor='black', density=False)
hist_ax.set_xlabel("Square Footage")
hist_ax.set_ylabel("Frequency")

#Right Plot: Bivariate exploration of square_footage vs price, with a fitted regression line
scatter_ax.set_title("Square Footage vs. Price of Houses")
sns.regplot(data=df, x="square_footage", y="price", scatter_kws={'alpha': 1/10},
            line_kws={'color': 'black'}, ax=scatter_ax)
scatter_ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
scatter_ax.set_xlabel("Square Footage")
scatter_ax.set_ylabel("Price");



In [None]:
#Number of Bedrooms - Independent Variable
##Univariate and Bivariate Analysis
plt.figure(figsize = [16,5])
plt.suptitle("Exploration of Number of Bedrooms")

#Left Plot: Univariate exploration of num_bedrooms
plt.subplot(1, 2, 1)
plt.title("Number of Bedrooms")
#Bin edges sit on the half-integers so each whole bedroom count gets a bar of its own.
#With integer edges 1..7 the last bin is [6, 7] (closed on the right), which merged the
#6- and 7-bedroom homes into a single bar.
fewest_bedrooms = int(df['num_bedrooms'].min())
most_bedrooms = int(df['num_bedrooms'].max())
bins = np.arange(fewest_bedrooms - 0.5, most_bedrooms + 1.5, 1)
plt.hist(data=df, x='num_bedrooms', bins=bins, edgecolor='black', density=False)
plt.xticks(np.arange(fewest_bedrooms, most_bedrooms + 1))
plt.xlabel("Number of Bedrooms")
plt.ylabel("Frequency")

#Right Plot: Bivariate exploration of num_bedrooms vs price
plt.subplot(1, 2, 2)
plt.title("Number of Bedrooms vs. Price of Houses")
sns.violinplot(data=df, x="num_bedrooms", y="price")
plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
plt.xlabel("Number of Bedrooms")
plt.ylabel("Price");

In [None]:
#Demonstrating that the horizontal line for 7 bedrooms is due to low response value
#value_counts() lists how many homes fall under each bedroom count
df['num_bedrooms'].value_counts()
## NOTE(review): the written interpretation of these counts was left blank - fill in

In [None]:
#Crime Rate - Independent Variable
##Univariate exploration of crime_rate
fig, ax = plt.subplots(figsize=(16, 5))
ax.set_title("Crime Rate")
#Bin edges every 5 units, from 0 up to 95
ax.hist(data=df, x='crime_rate', bins=np.arange(0, 100, 5), edgecolor='black', density=False)
ax.set_xlabel("Crime Rate")
ax.set_ylabel("Frequency");

In [None]:
#Crime Rate - Independent Variable
##Bivariate exploration of crime_rate vs price
fig, ax = plt.subplots(figsize=(16, 5))
ax.set_title("Crime Rate vs. Price of Houses")
#Faint points with a black fitted regression line
sns.regplot(data=df, x="crime_rate", y="price", scatter_kws={'alpha': 1/10},
            line_kws={'color': 'black'}, ax=ax)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
ax.set_xlabel("Crime Rate")
ax.set_ylabel("Price");


In [None]:
#Has a Fireplace - Independent Variable
##Univariate and Bivariate Analysis
plt.figure(figsize = [16,5])
plt.suptitle("Exploration of Presence of Fireplace")

#Left Plot: Univariate Analysis of fireplace
plt.subplot(1, 2, 1)
plt.title("Has a Fireplace")
#sort_index() orders the counts by encoded value, i.e. 0 (No) before 1 (Yes). The labels and colors
#are therefore derived from that index; a hand-written ['Yes', 'No'] list put each label on the
#other category's bar.
counts = df['fireplace'].value_counts().sort_index()
label_for_value = {0: 'No', 1: 'Yes'}
color_for_label = {'Yes': 'skyblue', 'No': 'salmon'}
categories = [label_for_value.get(value, str(value)) for value in counts.index]
bar_colors = [color_for_label.get(label, 'lightgray') for label in categories]
bars = plt.bar(categories, counts, color=bar_colors, edgecolor='black')
#Writing the count just above each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.5,
             f'{int(yval)}', ha='center', va='bottom')
plt.xlabel("Fireplace Present")
plt.ylabel("Count")

#Right Plot: Bivariate Analysis of fireplace vs price
plt.subplot(1, 2, 2)
plt.title("Has a Fireplace")
sns.boxplot(data=df, x="fireplace", y="price")
#Replacing the 0/1 tick labels with readable names
plt.xticks([0, 1], ['No', 'Yes'])
plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
plt.xlabel("Fireplace Present")
plt.ylabel("Price");

In [None]:
#Has a Garage - Independent Variable
##Univariate and Bivariate Analysis
plt.figure(figsize = [16,5])
plt.suptitle("Exploration of Presence of Garage")

#Left Plot: Univariate Analysis of garage
plt.subplot(1, 2, 1)
plt.title("Has a Garage")
#sort_index() orders the counts by encoded value, i.e. 0 (No) before 1 (Yes). The labels and colors
#are therefore derived from that index; a hand-written ['Yes', 'No'] list put each label on the
#other category's bar.
counts = df['garage'].value_counts().sort_index()
label_for_value = {0: 'No', 1: 'Yes'}
color_for_label = {'Yes': 'skyblue', 'No': 'salmon'}
categories = [label_for_value.get(value, str(value)) for value in counts.index]
bar_colors = [color_for_label.get(label, 'lightgray') for label in categories]
bars = plt.bar(categories, counts, color=bar_colors, edgecolor='black')
#Writing the count just above each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.5,
             f'{int(yval)}', ha='center', va='bottom')
plt.xlabel("Garage Present")
plt.ylabel("Count")

#Right Plot: Bivariate Analysis of garage vs price
plt.subplot(1, 2, 2)
plt.title("Has a Garage")
sns.boxplot(data=df, x="garage", y="price")
#Replacing the 0/1 tick labels with readable names
plt.xticks([0, 1], ['No', 'Yes'])
plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
plt.xlabel("Garage Present")
plt.ylabel("Price");

In [None]:
#House Color - Independent Variable
##Univariate Analysis of house_color
plt.figure(figsize=[16,5])
plt.title("House Color")
#value_counts() returns the colors ordered by frequency, so the bar labels and fill colors are taken
#from its index; a hand-written label list only matches the bars if it happens to be in that order.
counts = df['house_color'].value_counts()
categories = counts.index.tolist()
#assumes every house_color value, lower-cased, is a color name matplotlib recognises - TODO confirm
bar_colors = [str(color_name).lower() for color_name in categories]
bars = plt.bar(categories, counts.values, color=bar_colors, edgecolor='black')
#Writing the count just above each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.5,
             f'{int(yval)}', ha='center', va='bottom')
plt.xlabel("House Color")
plt.ylabel("Count");

In [None]:
#House Color - Independent Variable
##Bivariate Analysis of house_color vs price

plt.figure(figsize=[16,5])
plt.title("House Color")
#The palette is keyed by category so every box is filled with its own color. A plain list is handed
#out in the order the categories are encountered in the data, which need not be alphabetical.
#assumes every house_color value, lower-cased, is a color name matplotlib recognises - TODO confirm
color_palette = {color_name: str(color_name).lower() for color_name in df['house_color'].unique()}
sns.boxplot(data=df, x="house_color", y="price", hue="house_color", palette=color_palette, legend=False)
plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
plt.xlabel("House Color")
plt.ylabel("Price");


In [None]:
#Performing one-hot encoding
#drop_first=True leaves out the first house_color level so the dummy columns are not redundant with one another
df_encoded = pd.get_dummies(data=df, columns=['house_color'], drop_first=True)

print(df_encoded)

In [None]:
#Converting one-hot-encoding columns from boolean values to binary integers
one_hot_columns = [f'house_color_{shade}' for shade in ('Green', 'Red', 'White', 'Yellow')]
df_encoded = df_encoded.astype(dict.fromkeys(one_hot_columns, int))
#Confirming the change worked
print(df_encoded[one_hot_columns].head())

In [None]:
#Section marker: a bare string literal with no effect on the analysis (as the last expression of the cell it is echoed as output)
''' Part 3: Model creation and optimization '''

In [None]:
#Splitting the Dataset into Test and Training
## D1 - Split the data into two datasets, with a larger percentage assigned to the training dataset and a smaller percentage assigned to the test data set. Provide the files.
#Target column, and the predictor columns plus an explicit intercept column named const
y = df_encoded['price']
predictor_columns = [
    'square_footage', 'num_bedrooms', 'crime_rate', 'fireplace',
    'house_color_Green', 'house_color_Red', 'house_color_White', 'house_color_Yellow',
    'garage',
]
X = df_encoded[predictor_columns].assign(const=1)

#80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#The targets are converted to single-column dataframes for ease in exporting
y_train, y_test = y_train.to_frame(), y_test.to_frame()
print(f"Training data: {X_train.shape}, Testing data: {X_test.shape}")
print(f"Training labels: {y_train.shape}, Testing labels: {y_test.shape}")



In [None]:
#Combining the datasets for exporting: predictors and target side by side, one frame per split
train_data = pd.concat(objs=[X_train, y_train], axis="columns")
test_data = pd.concat(objs=[X_test, y_test], axis="columns")


In [None]:
#Exporting the Test & Train Datasets to share (row index is not written)
#NOTE(review): the output names carry no .csv extension - confirm that is intended
for export_frame, export_name in ((train_data, "training_data"), (test_data, "test_data")):
    export_frame.to_csv(export_name, index=False)

In [None]:
#Initial Multiple Regression Model
##D2 -  Use the training dataset to create and perform a regression model using regression as a statistical method. Optimize the regression model using a process of your selection. 
##Also checking for Multicollinearity
#Fitting price on every candidate predictor (X_train already carries the const column)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Checking for multicollinearity using the Variance Inflation Factor (VIF)
#X_train is narrowed to the predictors only (no const), exactly as before, because the cells that
#follow read it in this form
X_train = X_train[['square_footage', 'num_bedrooms', 'crime_rate', 'fireplace', 'house_color_Green', 'house_color_Red', 'house_color_White', 'house_color_Yellow', 'garage']]

#variance_inflation_factor regresses each column on the remaining columns of the matrix it is given and
#does not add an intercept of its own. The constant is therefore appended to the matrix used for the
#calculation (as the last column), and only the predictor columns are reported.
vif_exog = X_train.assign(const=1)
vif_df = pd.DataFrame({
    "feature": X_train.columns,
    "VIF": [variance_inflation_factor(vif_exog.values, column_position)
            for column_position in range(len(X_train.columns))],
})

print(vif_df)
##NOTE(review): the earlier remark "No VIF was greater than 6.4" came from the calculation without an
##intercept - re-read the values printed above and update the remark

In [None]:
#Testing variables in a correlation matrix to identify strength of correlations
#Pairwise Pearson correlations between the predictor columns currently held in X_train
correlation_matrix = X_train.corr(method='pearson')
print(correlation_matrix)

In [None]:
#Initiating Backwards Stepwise Elimination
#Step 1 - Removing garage as it has a p-value of .809
step_1_features = [
    'square_footage', 'num_bedrooms', 'crime_rate', 'fireplace',
    'house_color_Green', 'house_color_Red', 'house_color_White', 'house_color_Yellow',
]
#Keeping the remaining predictors and adding the intercept column back before refitting
X_train = X_train[step_1_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Step 2 - Removing house_color_Yellow as it has a p-value of .829
step_2_features = [
    'square_footage', 'num_bedrooms', 'crime_rate', 'fireplace',
    'house_color_Green', 'house_color_Red', 'house_color_White',
]
#Keeping the remaining predictors and adding the intercept column back before refitting
X_train = X_train[step_2_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Step 3 - Removing house_color_Red as it has a p-value of .522
step_3_features = [
    'square_footage', 'num_bedrooms', 'crime_rate', 'fireplace',
    'house_color_Green', 'house_color_White',
]
#Keeping the remaining predictors and adding the intercept column back before refitting
X_train = X_train[step_3_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Step 4 - Removing house_color_Green as it has a p-value of .319
step_4_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'fireplace', 'house_color_White']
#Keeping the remaining predictors and adding the intercept column back before refitting
X_train = X_train[step_4_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Step 5 - Removing fireplace as it has a p-value of .326
step_5_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'house_color_White']
#Keeping the remaining predictors and adding the intercept column back before refitting
X_train = X_train[step_5_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Scaling the training dataset separately to further demonstrate multicollinearity is not a concern. 
#Note - the scaled values will not be used for the final model
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X_train)
#Wrapping the scaled array in a dataframe that keeps X_train's column names and row index
reg_df_minmax = pd.DataFrame(scaled_values, columns=X_train.columns, index=X_train.index)

#Refitting on the scaled predictors, with the intercept column set back to 1
scaled_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'house_color_White']
X = reg_df_minmax[scaled_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X)
results = model.fit()
print(results.summary())

#Cond. No. has dropped down to 7.37

In [None]:
#Final data model
#Refit on the unscaled training predictors that survived the elimination steps, plus the intercept
final_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'house_color_White']
X_train = X_train[final_features].assign(const=1)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
print(results.summary())

In [None]:
#Section marker: a bare string literal with no effect on the analysis (as the last expression of the cell it is echoed as output)
''' Part 4: Results and recommendations '''

In [None]:
#Calculating Mean Squared Error for the Training Dataset
##D3 - Give the mean squared error (MSE) of the optimized model used on the training set.
#Fitting on the training data and scoring the fitted values against the same training targets
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_train)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred)
print ("Mean Squared Error (MSE) - Training Dataset:", mse_train)

In [None]:
#Checking Residuals to ensure close to normal distribution
#results is the most recently fitted model held under that name; its residuals are drawn as a histogram with a KDE curve
residuals = results.resid
sns.histplot(data=residuals, kde=True)
plt.title("Residuals Distribution")
plt.show()
##Skewed Right slightly - close to normal distribution

In [None]:
#Running the optimized model on the test data set
##D4 - Run the prediction on the test dataset using the optimized regression model from part D2 to give the accuracy of the prediction model based on the mean squared error (MSE).
#NOTE: an earlier draft of this cell re-fit OLS on (y_test, X_test) and printed that summary.
#That approach is deliberately not used: fitting on the test data reports in-sample error rather than
#how well the model fit on the training data generalizes. The draft had been disabled by wrapping it
#in a bare triple-quoted string, which is still evaluated and echoed as the cell's output, so it is
#recorded here as a comment instead. The test-set prediction is produced in the cells that follow.


In [None]:
#Fitting the optimized model on the training data only; final_model is the object used for test predictions
kept_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'house_color_White']
X_train = X_train[kept_features].assign(const=1)
final_model = sm.OLS(endog=y_train, exog=X_train).fit()

In [None]:
# Prepare test data: same predictors, in the same order, as the training design matrix, plus the intercept
test_features = ['square_footage', 'num_bedrooms', 'crime_rate', 'house_color_White']
X_test = X_test[test_features].assign(const=1)

# Predict the test set with the model that was fit on the training data (no re-fitting here)
y_pred = final_model.predict(exog=X_test)

# Print the summary of that training-fit model
print(final_model.summary())

In [None]:
#Calculating Mean Squared Error for the Test Dataset
#The model fit on the training data (final_model) is scored on the held-out test rows. The previous
#version of this cell tried to fit a new OLS model on the test data itself (and could not run because
#of an unclosed parenthesis); that would have reported in-sample error, not predictive accuracy.
#The design matrix is rebuilt with the intercept and put in the column order the model was trained with.
test_exog = X_test.assign(const=1)[final_model.model.exog_names]
y_pred = final_model.predict(test_exog)
mse_test = mean_squared_error(y_test, y_pred)
print ("Mean Squared Error (MSE) - Test Dataset:", mse_test)

In [None]:
#Checking Residuals on the test dataset to ensure close to normal distribution
#results.resid still holds the residuals of the model fit on the training data, so plotting it here
#only repeated the training plot. The test residuals are computed explicitly instead: actual test
#prices minus the prices predicted by the training-fit model (aligned on the row index).
test_predictions = final_model.predict(X_test.assign(const=1)[final_model.model.exog_names])
residuals = y_test['price'] - test_predictions
sns.histplot(residuals, kde=True)
plt.title("Residuals Distribution")
plt.show()
##NOTE(review): the earlier remark "Skewed Right slightly - close to normal distribution" described the
##training residuals - re-check it against the plot above

In [None]:
#Comparing both MSE side by side
for mse_label, mse_value in (("Mean Squared Error (MSE) - Training Dataset:", mse_train),
                             ("Mean Squared Error (MSE) - Test Dataset:", mse_test)):
    print (mse_label, mse_value)

In [None]:
#Validating results by calculating Root Mean Squared Error (RMSE)
#The RMSEs are derived from the MSE values computed earlier in the notebook rather than from numbers
#pasted in by hand, so they stay in step with the models whenever the notebook is re-run.
training_rmse = np.sqrt(mse_train)
test_rmse = np.sqrt(mse_test)

print("Training RMSE:", training_rmse)
print("Test RMSE:", test_rmse)
#The mean price gives a scale against which to judge the size of the RMSE
print("Mean housing Price:", df["price"].mean())
##The model's predictions are off by a significant amount, so it would be optimal to re-work it and pull in additional variables in order to accurately predict.