In [None]:
#import libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot

In [None]:
#import algorithms specifically from scikit- learn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [None]:
#import dataset
headers = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = pd.read_csv("/content/boston_housing.csv", header = None, delimiter=r"\s+")



In [None]:
#check for missing values
df.isnull().sum().sum()


0

In [None]:
#get rid of text and binary data
df = df.drop([1,3,8], axis = 1)
df.reset_index(drop=True, inplace=True)



In [None]:
df

Unnamed: 0,0,2,4,5,6,7,9,10,11,12,13
0,0.00632,2.31,0.538,6.575,65.2,4.0900,296.0,15.3,396.90,4.98,24.0
1,0.02731,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.90,9.14,21.6
2,0.02729,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,4.03,34.7
3,0.03237,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,2.94,33.4
4,0.06905,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,11.93,0.573,6.593,69.1,2.4786,273.0,21.0,391.99,9.67,22.4
502,0.04527,11.93,0.573,6.120,76.7,2.2875,273.0,21.0,396.90,9.08,20.6
503,0.06076,11.93,0.573,6.976,91.0,2.1675,273.0,21.0,396.90,5.64,23.9
504,0.10959,11.93,0.573,6.794,89.3,2.3889,273.0,21.0,393.45,6.48,22.0


In [None]:
#split data
x = df.iloc[:,0:11]
y = df[13]
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: 13, Length: 506, dtype: float64

In [None]:
#scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [None]:
#split features and labels into traning and testing data (80% of data for training, 20% for testing)
from sklearn.model_selection import train_test_split
#split data into train and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
#create model and fit training data through model
model = LinearRegression()
model.fit(x_train,y_train)
y_model = model.predict(x_test)
from sklearn.metrics import mean_squared_error, r2_score
print('Mean Squared Error:', mean_squared_error(y_test, y_model))
print('R^2 Score:', r2_score(y_test, y_model))

Mean Squared Error: 3.6318537361939916e-29
R^2 Score: 1.0


In [None]:
#svm regression model with no hypertuning
model = SVR()
model.fit(x_train,y_train)
#run test values through model
y_model = model.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_model))
print('R^2 Score:', r2_score(y_test, y_model))

Mean Squared Error: 10.860459810422343
R^2 Score: 0.8519037291901455


In [None]:
#knn regression model with no hypertuning
model = KNeighborsRegressor(n_neighbors=7)
model.fit(x_train,y_train)
#run test values through model
x_test = x_test.astype("float16")
y_model = model.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_model))
print('R^2 Score:', r2_score(y_test, y_model))

Mean Squared Error: 6.271540616246497
R^2 Score: 0.914479516179663


In [None]:
#Random Forest Regressor model with no hypertuning
model = RandomForestRegressor()
model.fit(x_train,y_train)
#run test values through model
y_model = model.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_model))
print('R^2 Score:', r2_score(y_test, y_model))

Mean Squared Error: 0.01894799999999979
R^2 Score: 0.9997416197667237


In [None]:
#Gradient Boost model with no hypertuning
model = RandomForestRegressor()
model.fit(x_train,y_train)
#run test values through model
y_model = model.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_model))
print('R^2 Score:', r2_score(y_test, y_model))

Mean Squared Error: 0.01874675490196066
R^2 Score: 0.9997443640012275
