In [None]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LinearRegression

In [None]:
# Load the housing data set. index_col=False tells pandas not to use the first
# column of the file as the row index, so every column stays a regular data column.
data = pd.read_csv(
    "data/housingdata.csv",
    index_col=False,
)

In [None]:
# Preview the first five rows of the freshly loaded data set
data.head(n=5)

In [None]:
# Replace the raw column labels with descriptive names.
# The assignment is positional: the list order must match the column order of the file.
data.columns = [
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    'Proportion of Blacks',
    '% lower status',
    'Median Value',
]

In [None]:
# Preview the first five rows again to confirm the new column names
data.head(n=5)

In [None]:
# Remove the "Proportion of Blacks" column from the analysis.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
data = data.drop(columns=["Proportion of Blacks"], errors="ignore")

In [None]:
# Preview the first five rows to verify the column was removed
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

In [None]:
# Show each column's dtype and non-null count; a non-null count lower than the
# number of rows indicates missing values in that column
data.info()

In [None]:
# The data.info() output above suggests that some columns may contain missing values.
# Count the nulls in each column (instead of only flagging their presence with
# .any()) so the amount of missing data is visible before deciding how to handle it.
data.isnull().sum()

In [None]:
# Correlation heatmap: look for the variables that are most strongly correlated
# with the target, the median house value ("Median Value").
# Outliers may also affect these numbers, but we start with the plain correlations.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    data.corr(),
    annot=True,
    fmt='.2g',
    cmap='coolwarm',
    linewidths=1,
    linecolor='white',
    ax=ax,
)

Looking at the last column of the heatmap, the number of rooms (0.7) and % lower status (-0.74) have the strongest correlations with the median house value. This suggests that they are closely associated with the price, although correlation alone does not prove that they directly cause it. In short, the correlation analysis shows that median house value is highly correlated with % lower status and with the number of rooms per dwelling.
The number of rooms is positively correlated with Median Value: as the number of rooms increases, the median value tends to increase. The opposite is true for % lower status: when % lower status goes up, the price tends to go down.

In [None]:
# Discard every row that contains at least one missing value,
# then confirm that all columns now share the same non-null count
data = data.dropna(axis=0, how="any")
data.info()

In [None]:
#data.dropna(subset=['Nitric Oxide Concentration', 'Rooms'])
#data.info()

In [None]:
#data = data.fillna(0)
#data.info()

In [None]:
# Split the frame into the feature matrix X (the 12 predictor columns) and the
# target y ("Median Value", reshaped into a single-column 2-D array)
X = data[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y = data['Median Value'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Hold out a quarter of the rows for testing (train_test_split's default);
# the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=150
)

In [None]:
# Instantiate a scikit-learn LinearRegression model with its default settings
model = LinearRegression()

In [None]:
# Train on the training split, then compute the model's score on both splits.
# A training score far above the testing score would hint at overfitting.
model.fit(X_train, y_train)
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

In [None]:
# Residual plot (prediction minus actual value) for the training and testing data.
# Predictions are computed once per split and reused for both axes.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
plt.scatter(train_pred, train_pred - y_train, c="blue", label="Training Data")
plt.scatter(test_pred, test_pred - y_test, c="orange", label="Testing Data")
plt.legend()
# The x-axis shows predicted values, so draw the zero reference line across the
# whole axis instead of limiting it to the range of the actual target values.
plt.axhline(y=0)
plt.xlabel("Predicted Median Value")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")

Seloms Code Here

Stephs Code Here

Rachels Code Here

Matts Code Here

In [None]:
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Rebuild the feature matrix X (12 predictor columns) and the target y
# ("Median Value" as a single-column 2-D array) from the cleaned frame
X = data[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y = data['Median Value'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Hold out a quarter of the rows for testing (train_test_split's default) with a
# fixed random_state, then show the shape of each feature split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=150
)
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Build a small feed-forward regressor: 12 inputs -> 36 ReLU units -> 1 linear output
model = Sequential([
    Dense(36, input_dim=12, activation="relu"),
    # Dense(36, activation="relu"),  # optional second hidden layer, currently disabled
    Dense(1, activation="linear"),
])

In [None]:
# Minimise mean squared error with the Adam optimizer (also reported as a metric),
# then train on the training split for 20 epochs
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model.fit(X_train, y_train, epochs=20)

In [None]:
#pred_train= model.predict(X_train)
#print(np.sqrt(mean_squared_error(y_train,pred_train)))

#pred= model.predict(X_test)
#print(np.sqrt(mean_squared_error(y_test,pred))) 

In [None]:
# Evaluate the trained network on the held-out test split; the model was compiled
# with mean squared error as both the loss and the reported metric
model.evaluate(X_test,y_test, verbose=2)

With an MSE of 78, our model is clearly not yet very reliable or accurate. One likely cause is that the input variables are on very different scales. We therefore need to standardize our data so that all variables are compared on an equal footing.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit one StandardScaler for the features and one for the target.
# Both are fitted on the training split only.
X_scaler, y_scaler = (
    StandardScaler().fit(train_part) for train_part in (X_train, y_train)
)

In [None]:
# Apply the fitted scalers to the training and testing splits of X and y
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)
y_train_scaled, y_test_scaled = (
    y_scaler.transform(part) for part in (y_train, y_test)
)

In [None]:
# Network for the standardised data: 12 inputs -> 36 ReLU units -> 1 linear output
model_scaled = Sequential()
model_scaled.add(Dense(36, input_dim=12, activation="relu"))
model_scaled.add(Dense(1, activation='linear'))

model_scaled.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_squared_error"])
# Train on the scaled arrays. Fitting on the raw X_train / y_train while
# evaluating on the scaled test split would feed the model inputs and targets on
# a different scale from what it was trained on, making the score meaningless.
model_scaled.fit(X_train_scaled, y_train_scaled, epochs=20)

# The target was standardised too, so this MSE is expressed in standardised
# units and is not directly comparable to the MSE of the unscaled model.
model_scaled.evaluate(X_test_scaled, y_test_scaled, verbose=2)