In [None]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LinearRegression

In [None]:
# Load the housing data with pandas. index_col=False stops pandas from using the
# first CSV column as the row index, so no column is mislabelled on load.
data = pd.read_csv(
    'data/housingdata.csv',
    index_col=False,
)

In [None]:
# Preview the first five rows of the freshly loaded data set
data.head(n=5)

In [None]:
# Replace the raw headers with descriptive names (assigned positionally, one per column)
data.columns = [
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    'Proportion of Blacks',
    '% lower status',
    'Median Value',
]

In [None]:
# Preview the first five rows again to confirm the new column names
data.head(n=5)

In [None]:
# Remove the 'Proportion of Blacks' column from the data set.
# `data` is rebound to the result instead of using inplace=True, and errors="ignore"
# makes the drop a no-op when the column is already gone, so this cell can be
# re-run without raising KeyError.
data = data.drop(columns=["Proportion of Blacks"], errors="ignore")

In [None]:
# Preview the first five rows to verify the column was removed
data.head(n=5)

In [None]:
# Summary statistics for every column: count, mean, standard deviation,
# min/max and quartiles (central tendency and spread at a glance)
data.describe()

In [None]:
# data.info() lists each column's dtype and its count of non-null entries;
# a non-null count below the number of rows indicates missing values
data.info()

In [None]:
# The info() output above suggests some columns may contain missing data.
# Flag, per column, whether at least one value is null (True = has missing entries)
data.isna().any()

In [None]:
# Correlation heatmap (seaborn) between every pair of columns, including the target
# 'Median Value', to see which variables are most strongly correlated with it.
# Outliers may also affect these values, but correlation is checked first.
plt.figure(figsize=(10, 10))
sns.heatmap(
    data.corr(),
    annot=True,
    fmt='.2g',
    cmap='coolwarm',
    linewidths=1,
    linecolor='white',
)
plt.savefig("output/Seaborn.png")

Looking at the last column, we found that the number of rooms (0.7) and % lower status (-0.74) have the strongest correlations with the median house value. Correlation alone does not prove that they drive the price, but it does suggest they are the most useful predictors of it.
The number of Rooms is positively correlated with Median Value: as the number of rooms increases, the median value tends to increase. The opposite is true for % lower status: when % lower status goes up, the price tends to go down.

In [None]:
from sklearn.impute import SimpleImputer

# Keep a separate frame with every incomplete row removed (used later for the drop-NA model)
data_dropna = data.dropna()

# Fill the missing values in `data` itself with each column's median, writing back in place
imputer = SimpleImputer(strategy='median')
data.iloc[:, :] = imputer.fit_transform(data)
data.info()

In [None]:
#data.dropna(subset=['Nitric Oxide Concentration', 'Rooms'])
#data.info()

In [None]:
#data = data.fillna(0)
#data.info()

In [None]:
# Features (X): the twelve predictor columns. Target (y): 'Median Value' as a column vector
X = data[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y = data['Median Value'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and testing sets; the fixed random_state keeps the split repeatable
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=150,
)

In [None]:
# Create the baseline model: an ordinary LinearRegression from scikit-learn.
# It is fitted in the next cell.
model = LinearRegression()

In [None]:
# Train on the training split, then score the fitted model on both splits
model.fit(X_train, y_train)
training_score, testing_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

In [None]:
# Residual plot: predicted value on the x-axis, (prediction - actual) on the y-axis,
# for the training and the testing split
plt.scatter(
    model.predict(X_train),
    model.predict(X_train) - y_train,
    c="blue",
    label="Training Data",
)
plt.scatter(
    model.predict(X_test),
    model.predict(X_test) - y_test,
    c="orange",
    label="Testing Data",
)
# Zero-residual reference line spanning the range of the target values
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.legend()
plt.title("Residual Plot")

Seloms Code Here

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values on a separate copy of the data: each null is replaced
# by the median of its column
data_copy = data.copy()
imputer = SimpleImputer(strategy='median')
data_copy.iloc[:, :] = imputer.fit_transform(data_copy)

In [None]:
# Per column: does `data` still contain any null / missing value?
data.isna().any()

In [None]:
# Per column: how many null / missing values does `data` contain?
data.isna().sum()

Stephs Code Here

# Export Dataframe to CSV

In [None]:
# Write the cleaned frame next to the input file, without the row index.
# A forward slash is used (as in the read_csv call) so the path resolves into the
# data/ folder on every OS; the previous backslash only worked as a separator on Windows.
data.to_csv('data/revisedhousingdata.csv', index=False)

Rachels Code Here

In [None]:
# Add a 'Home Value' column: 'Median Value' multiplied by 1000, then preview the result
data['Home Value'] = data['Median Value'].mul(1000)
data.head()

In [None]:
# Scatter plot of Crime Rate vs. Home Value
# (columns picked by label; these are positions 0 and 13 of the frame built above)
crimeRate = data['Crime Rate']
value = data['Home Value']

plt.scatter(crimeRate, value, color='#545E45')

# Title and axis labels
plt.title('Crime Rate vs Value')
plt.xlabel('Crime Rate')
plt.ylabel('Value')
plt.show()

In [None]:
# Scatter plot of the 'Charles River Variable' column vs. Home Value.
# The title and x-label now name the plotted column: the data set has a separate
# 'Distance' column, so labelling this axis 'Distance' was misleading.
# NOTE(review): this column looks like a river-adjacency indicator rather than a
# distance - confirm against the data dictionary.
charlesRiver = data['Charles River Variable']
value = data['Home Value']

plt.scatter(charlesRiver, value, color='#8D2B00')

# Title and axis labels
plt.title('Charles River Variable vs Value')
plt.xlabel('Charles River Variable')
plt.ylabel('Value')
plt.show()

In [None]:
# Scatter plot of Nitric Oxide Concentration vs. Home Value, saved to the output folder
# (columns picked by label; these are positions 4 and 13 of the frame built above)
nitricOxide = data['Nitric Oxide Concentration']
value = data['Home Value']

plt.scatter(nitricOxide, value, color='#BE6731')

# Title and axis labels
plt.title('Nitric Oxide vs Value')
plt.xlabel('Nitric')
plt.ylabel('Value')
plt.savefig("output/NitricOxideVsValue.png")
plt.show()

In [None]:
# Scatter plot of Rooms vs. Home Value, saved to the output folder
# (columns picked by label; these are positions 5 and 13 of the frame built above)
rooms = data['Rooms']
value = data['Home Value']

plt.scatter(rooms, value, color='#76704C')

# Title and axis labels
plt.title('Number of Rooms vs Value')
plt.xlabel('Rooms')
plt.ylabel('Value')
plt.savefig("output/NumberOfRoomsVsValue.png")
plt.show()

In [None]:
# Scatter plot of Age vs. Home Value, saved to the output folder
# (columns picked by label; these are positions 6 and 13 of the frame built above)
age = data['Age']
value = data['Home Value']

plt.scatter(age, value, color='#B55119')

# Title and axis labels
plt.title('Age of Home vs Value')
plt.xlabel('Age')
plt.ylabel('Value')
plt.savefig("output/AgeVsValue.png")
plt.show()

In [None]:
# Scatter plot of % lower status vs. Home Value, saved to the output folder
# (columns picked by label; these are positions 11 and 13 of the frame built above)
lowerStatus = data['% lower status']
value = data['Home Value']

plt.scatter(lowerStatus, value, color='#1F2D16')

# Title and axis labels
plt.title('% Lower Status vs Value')
plt.xlabel('Status')
plt.ylabel('Value')
plt.savefig("output/StatusVsValue.png")
plt.show()

Matts Code Here

Initial Model Before Standardizing Data

In [None]:
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Features (X): the twelve predictor columns. Target (y): 'Median Value' as a column vector
X = data[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y = data['Median Value'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and testing sets (fixed random_state) and report both shapes
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=150,
)
print(X_train.shape)
print(X_test.shape)

In [None]:
# Build the network: one hidden ReLU layer (36 units, 12 inputs) feeding a single linear output
model = Sequential([
    Dense(36, input_dim=12, activation="relu"),
    Dense(1, activation='linear'),
])

In [None]:
# Mean squared error is both the training loss and the reported metric; train for 100 epochs
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model.fit(X_train, y_train, epochs=100)

In [None]:
# Sanity check: predict one test row and compare it with the actual target value
row = 3
test_row = X_test.iloc[row]
# The model expects a 2-D batch, so the single row is reshaped to (1, 12)
test_row_array = test_row.to_numpy().reshape(1, 12)

print(f'Prediction = {model.predict(test_row_array)}')
print(f'Actual = {y_test[row]}')

In [None]:
# Predict the whole test set and compute the R2 score against the true values
y_test_predict = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_test_predict)

In [None]:
# Loss and metric (both mean squared error) on the held-out test set
model.evaluate(x=X_test, y=y_test, verbose=2)

The R2 score for our model, at .37, is not very reliable: it sits at the upper end of the low-reliability range for a model. Next we will standardize our data and see whether that improves our R2 score.

Model With Standardized Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scalers on the training split only, one for the features and one for the target
X_scaler = StandardScaler()
X_scaler.fit(X_train)
y_scaler = StandardScaler()
y_scaler.fit(y_train)

In [None]:
# Apply the training-fitted scalers to both splits of the features and of the target
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)
y_train_scaled, y_test_scaled = (
    y_scaler.transform(part) for part in (y_train, y_test)
)

In [None]:
# Same architecture as the first network, now trained on the standardized features and target
model_scaled = Sequential([
    Dense(36, input_dim=12, activation="relu"),
    Dense(1, activation='linear'),
])

model_scaled.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model_scaled.fit(X_train_scaled, y_train_scaled, epochs=100)

In [None]:
# Predict the standardized test set and compute the R2 score against the standardized targets
y_test_scaled_predict = model_scaled.predict(X_test_scaled)
r2_score(y_true=y_test_scaled, y_pred=y_test_scaled_predict)

In [None]:
# Loss and metric (mean squared error, in standardized units) on the test split
model_scaled.evaluate(x=X_test_scaled, y=y_test_scaled, verbose=2)

While standardizing the data made a huge impact on the performance of our model, we still thought it could be improved further. To try to make it more accurate, we decided to drop some columns that had low correlation with the median home value to see if that would make a difference.

In [None]:
# New frame without the two columns being dropped; `data` itself is left untouched
manipulated_data = data.drop(columns=["Charles River Variable", 'Distance'])
manipulated_data.head()

In [None]:
# Features: the ten remaining predictor columns. Target: 'Median Value' as a column vector
X_manipulated = manipulated_data[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y_manipulated = manipulated_data['Median Value'].to_numpy().reshape(-1, 1)

# Same random_state as the earlier splits
(
    X_train_manipulated,
    X_test_manipulated,
    y_train_manipulated,
    y_test_manipulated,
) = train_test_split(X_manipulated, y_manipulated, random_state=150)

In [None]:
# Standardize with scalers fitted on the training split only
X_scaler_manipulated = StandardScaler()
X_scaler_manipulated.fit(X_train_manipulated)
y_scaler_manipulated = StandardScaler()
y_scaler_manipulated.fit(y_train_manipulated)

X_train_scaled_manipulated, X_test_scaled_manipulated = (
    X_scaler_manipulated.transform(part)
    for part in (X_train_manipulated, X_test_manipulated)
)
y_train_scaled_manipulated, y_test_scaled_manipulated = (
    y_scaler_manipulated.transform(part)
    for part in (y_train_manipulated, y_test_manipulated)
)

# Network for the ten-feature input: one hidden ReLU layer (30 units) and a linear output
model_manipulated = Sequential([
    Dense(30, input_dim=10, activation="relu"),
    Dense(1, activation='linear'),
])

model_manipulated.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model_manipulated.fit(X_train_scaled_manipulated, y_train_scaled_manipulated, epochs=100)

# Loss and metric on the standardized test split
model_manipulated.evaluate(x=X_test_scaled_manipulated, y=y_test_scaled_manipulated, verbose=2)

In [None]:
# Predict the standardized test set of the reduced-feature model and compute its R2 score
y_test_scaled_manip_predict = model_manipulated.predict(X_test_scaled_manipulated)
r2_score(y_true=y_test_scaled_manipulated, y_pred=y_test_scaled_manip_predict)

Dropping the two columns did not make any significant impact on the R2 score of the model. Next we will try cherry-picking the columns with the highest correlation to median value to see if that changes anything.

In [None]:
# Features: the six hand-picked predictor columns. Target: 'Median Value' as a column vector
X_cherrypicked = data[[
    'Non-retail business acres',
    'Nitric Oxide Concentration',
    'Rooms',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y_cherrypicked = data['Median Value'].to_numpy().reshape(-1, 1)

# Same random_state as the earlier splits
(
    X_train_cherrypicked,
    X_test_cherrypicked,
    y_train_cherrypicked,
    y_test_cherrypicked,
) = train_test_split(X_cherrypicked, y_cherrypicked, random_state=150)

In [None]:
# Standardize with scalers fitted on the training split only
X_scaler_cherrypicked = StandardScaler()
X_scaler_cherrypicked.fit(X_train_cherrypicked)
y_scaler_cherrypicked = StandardScaler()
y_scaler_cherrypicked.fit(y_train_cherrypicked)

X_train_scaled_cherrypicked, X_test_scaled_cherrypicked = (
    X_scaler_cherrypicked.transform(part)
    for part in (X_train_cherrypicked, X_test_cherrypicked)
)
y_train_scaled_cherrypicked, y_test_scaled_cherrypicked = (
    y_scaler_cherrypicked.transform(part)
    for part in (y_train_cherrypicked, y_test_cherrypicked)
)

# Network for the six-feature input: one hidden ReLU layer (18 units) and a linear output
model_cherrypicked = Sequential([
    Dense(18, input_dim=6, activation="relu"),
    Dense(1, activation='linear'),
])

model_cherrypicked.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model_cherrypicked.fit(X_train_scaled_cherrypicked, y_train_scaled_cherrypicked, epochs=100)

# Loss and metric on the standardized test split
model_cherrypicked.evaluate(x=X_test_scaled_cherrypicked, y=y_test_scaled_cherrypicked, verbose=2)

In [None]:
# Predict the standardized test set of the cherry-picked model and compute its R2 score
y_test_scaled_cherrypicked_predict = model_cherrypicked.predict(X_test_scaled_cherrypicked)
r2_score(y_true=y_test_scaled_cherrypicked, y_pred=y_test_scaled_cherrypicked_predict)

Cherry-picking the columns we wanted to use had an adverse impact on our model's R2 score. Overall, manipulating the data further does not seem to help improve the R2 score of our model.

Now that we have shown that changing which columns we used did not have a significant impact, we wanted to compare two methods of cleaning our data to see if one is more reliable than the other. The models above used imputation, where we replaced each null value in our data with the median of the column that null value is in.

Below is a model using a data set where we instead drop the rows containing nulls.

In [None]:
# Display the frame created earlier by dropping every row that contained a null
data_dropna

In [None]:
# Features / target taken from the frame where rows with nulls were dropped
X_drop = data_dropna[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y_drop = data_dropna['Median Value'].to_numpy().reshape(-1, 1)

# Same random_state as the earlier splits
(X_train_drop, X_test_drop, y_train_drop, y_test_drop) = train_test_split(
    X_drop,
    y_drop,
    random_state=150,
)

# Standardize with scalers fitted on the training split only
X_scaler_drop = StandardScaler()
X_scaler_drop.fit(X_train_drop)
y_scaler_drop = StandardScaler()
y_scaler_drop.fit(y_train_drop)

X_train_scaled_drop, X_test_scaled_drop = (
    X_scaler_drop.transform(part) for part in (X_train_drop, X_test_drop)
)
y_train_scaled_drop, y_test_scaled_drop = (
    y_scaler_drop.transform(part) for part in (y_train_drop, y_test_drop)
)

# Same architecture as the standardized twelve-feature model
model_drop = Sequential([
    Dense(36, input_dim=12, activation="relu"),
    Dense(1, activation='linear'),
])

model_drop.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model_drop.fit(X_train_scaled_drop, y_train_scaled_drop, epochs=100)

In [None]:
# Predict the standardized test set of the drop-NA model and compute its R2 score
y_test_scaled_predict_drop = model_drop.predict(X_test_scaled_drop)
r2_score(y_true=y_test_scaled_drop, y_pred=y_test_scaled_predict_drop)

Using Drop NA, we actually see that the model becomes far more accurate, up to .9 from .75 (note that the two scores are computed on different test sets, since dropping rows changes the split). This further supports avoiding unnecessary manipulation of the data. Next we will try removing outliers from our data set to see if that improves our model's performance.

In [None]:
from scipy import stats

# Starting from the drop-NA frame, keep only the rows in which every column's
# absolute z-score is below 3; the filtered result is a new frame
data_remove_outliers = data_dropna[
    (np.abs(stats.zscore(data_dropna)) < 3).all(axis=1)
]
data_remove_outliers

In [None]:
# Features / target taken from the outlier-filtered frame
X_outlier = data_remove_outliers[[
    'Crime Rate',
    'Residential Land Zone',
    'Non-retail business acres',
    'Charles River Variable',
    'Nitric Oxide Concentration',
    'Rooms',
    'Age',
    'Distance',
    'Accessiblity to Highway',
    'Tax Rate',
    'Pupil-Teacher ratio',
    '% lower status',
]]
y_outlier = data_remove_outliers['Median Value'].to_numpy().reshape(-1, 1)

# Same random_state as the earlier splits
(X_train_outlier, X_test_outlier, y_train_outlier, y_test_outlier) = train_test_split(
    X_outlier,
    y_outlier,
    random_state=150,
)

# Standardize with scalers fitted on the training split only
X_scaler_outlier = StandardScaler()
X_scaler_outlier.fit(X_train_outlier)
y_scaler_outlier = StandardScaler()
y_scaler_outlier.fit(y_train_outlier)

X_train_scaled_outlier, X_test_scaled_outlier = (
    X_scaler_outlier.transform(part) for part in (X_train_outlier, X_test_outlier)
)
y_train_scaled_outlier, y_test_scaled_outlier = (
    y_scaler_outlier.transform(part) for part in (y_train_outlier, y_test_outlier)
)

# Same architecture as the standardized twelve-feature model
model_outlier = Sequential([
    Dense(36, input_dim=12, activation="relu"),
    Dense(1, activation='linear'),
])

model_outlier.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
model_outlier.fit(X_train_scaled_outlier, y_train_scaled_outlier, epochs=100)

In [None]:
# Predict the standardized test set of the outlier-filtered model and compute its R2 score
y_test_scaled_predict_outlier = model_outlier.predict(X_test_scaled_outlier)
r2_score(y_true=y_test_scaled_outlier, y_pred=y_test_scaled_predict_outlier)


Removing outliers from our data has improved its overall R2 score slightly, which would indicate it is helpful in improving the reliability of the model overall.