In [None]:
import numpy as np
import pandas as pd
import sklearn
from math import sqrt

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# plain import raises ImportError on current installs. Use the bundled loader
# when it is still available; otherwise rebuild an object exposing the same
# fields this notebook reads (data, target, feature_names, DESCR, keys()).
try:
    from sklearn.datasets import load_boston
    var = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # Recipe taken from the scikit-learn deprecation notice. The raw file has
    # a 22-line text header and spreads each record over two physical lines.
    # NOTE: this branch needs network access.
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep = r"\s+", skiprows = 22, header = None)
    var = Bunch(
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target = raw_df.values[1::2, 2],
        feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
        DESCR = "Boston house-prices dataset (loaded from " + data_url + ")",
    )

In [None]:
# List the fields exposed by the dataset container.
print(var.keys())
# Print the description rather than echoing it: a bare expression displays the
# string's repr, where every line break shows up as a literal "\n".
print(var.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's feature names, then display it.
df = pd.DataFrame(
    data = var.data,
    columns = var.feature_names,
)
df

In [None]:
# Summary statistics for every feature column (the target has not been added yet).
df.describe()

In [None]:
# Attach the regression target as a 'MEDV' column so features and target
# live in the same frame for the plots and models below.
df['MEDV'] = var.target

In [None]:
# Display the frame again, now including the 'MEDV' column.
df

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Boxplot using seaborn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# One box per column of `df`, all drawn on a shared y-axis of a wide figure.
plt.figure(figsize = (16, 10))
bp = sns.boxplot(
    data = df,
    width = 0.5,
    saturation = 1,
    linewidth = 2,
)
# Echo the Axes object returned by seaborn as the cell's output.
bp

In [None]:
# Heatmap using seaborn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Annotated correlation matrix of every column in `df` (features plus MEDV).
plt.figure(figsize = (16, 10))
sns.heatmap(
    df.corr(),
    cmap = 'Blues',
    annot = True,      # write the coefficient inside each cell
    linewidth = 0.5,   # thin separator between cells
)
plt.show()

In [None]:
# KDE plot using seaborn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Overlay the estimated densities of three columns on one set of axes.
plt.figure(figsize = (16, 5))
# Label each curve and draw a legend: without them the three densities can be
# told apart only by colour and the figure cannot be read on its own.
for column in ('TAX', 'AGE', 'MEDV'):
    sns.kdeplot(data = df[column], label = column)
# The x-axis is shared by three different variables, so give it a neutral
# label instead of relying on whatever seaborn derives from a single column.
plt.xlabel("Value")
plt.legend()
plt.show()

In [None]:
# Data normalization (z-score)

# Z-score every column: subtract the column mean, divide by the column
# standard deviation. `apply` passes each column to the lambda as a Series
# and assembles the results into a new frame, so `df` itself is left untouched.
dfZ = df.apply(lambda column: (column - column.mean()) / column.std())

dfZ

In [None]:
# Data normalization (min-max)

# Rescale every column to the [0, 1] range using that column's own min and max.
# `apply` builds a new frame column by column, so `df` itself is left untouched.
# NOTE(review): the statistics come from the full frame (MEDV included), i.e.
# before the train/test splits performed in the modelling cells -- confirm
# this is intended.
dfmin_max = df.apply(
    lambda column: (column - column.min()) / (column.max() - column.min())
)
dfmin_max

In [None]:
# bar graph on normalized data

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# One bar per column of the min-max scaled frame.
# (Pass `dfZ` or `df` as `data` instead to compare the other scalings.)
plt.figure(figsize = (16, 5))
sns.barplot(data = dfmin_max)
plt.show()

In [None]:
# Heatmap on normalized data
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Annotated correlation matrix of the min-max scaled frame.
# Pearson correlation is unchanged by a per-column min-max rescale, so the
# coefficients here are expected to match those of the unscaled frame.
plt.figure(figsize = (16, 10))
sns.heatmap(
    dfmin_max.corr(),
    cmap = 'Blues',
    annot = True,
    linewidth = 0.5,
)
plt.show()

In [None]:
# linear regression on dataframe

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Seven hand-picked predictors and the MEDV target, taken from the unscaled frame.
x = df[['NOX', 'RM', 'DIS', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']]
y = df['MEDV']

# Hold out a third of the rows for evaluation; the fixed seed keeps the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y,
    test_size = 0.33,
    random_state = 5,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(xTrain, yTrain)
yPred = model.predict(xTest)

# Side-by-side table of true and predicted values, indexed like the test rows.
pred = yTest.to_frame('actual').assign(predicted = yPred)
print(pred)

# Error metrics on the held-out rows.
print("mean absolute error: ", mean_absolute_error(yTest, yPred))
print("mean squared error: ", mean_squared_error(yTest, yPred))
print("RMS: ", sqrt(mean_squared_error(yTest, yPred)))
print("R2 score: ", r2_score(yTest, yPred))

In [None]:
# SVM Regressor on normalized data

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Same seven predictors and target as the linear model, but read from the
# min-max scaled frame, so MEDV and all metrics below are in scaled units.
xNorm1 = dfmin_max[['NOX', 'RM', 'DIS', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']]
yNorm1 = dfmin_max['MEDV']

# Same test fraction and seed as the other model cells.
xTrainNorm1, xTestNorm1, yTrainNorm1, yTestNorm1 = train_test_split(
    xNorm1, yNorm1,
    test_size = 0.33,
    random_state = 5,
)

# Support-vector regressor with library-default hyper-parameters;
# fit() returns the estimator, so it can be chained onto the constructor.
model = SVR().fit(xTrainNorm1, yTrainNorm1)
yPredNorm1 = model.predict(xTestNorm1)

# Side-by-side table of true and predicted (scaled) values.
predNorm1 = yTestNorm1.to_frame('actual').assign(predicted = yPredNorm1)
print(predNorm1)

# Error metrics on the held-out rows.
print("mean absolute error: ", mean_absolute_error(yTestNorm1, yPredNorm1))
print("mean squared error: ", mean_squared_error(yTestNorm1, yPredNorm1))
print("RMS: ", sqrt(mean_squared_error(yTestNorm1, yPredNorm1)))
print("R2 score: ", r2_score(yTestNorm1, yPredNorm1))

In [None]:
# Random Forest Regressor on normalized data

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Same seven predictors and target as the other models, read from the
# min-max scaled frame, so MEDV and all metrics below are in scaled units.
xNorm2 = dfmin_max[['NOX', 'RM', 'DIS', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']]
yNorm2 = dfmin_max['MEDV']
xTrainNorm2, xTestNorm2, yTrainNorm2, yTestNorm2 = train_test_split(xNorm2, yNorm2, test_size = 0.33, random_state = 5)

# Seed the forest as well as the split: an unseeded RandomForestRegressor
# builds different trees on every run, so the metrics printed below (and the
# model comparison that rests on them) would not be reproducible.
model = RandomForestRegressor(random_state = 5)
model.fit(xTrainNorm2, yTrainNorm2)
yPredNorm2 = model.predict(xTestNorm2)

# Side-by-side table of true and predicted (scaled) values.
predNorm2 = pd.DataFrame({'actual': yTestNorm2, 'predicted': yPredNorm2})
print(predNorm2)

# Error metrics on the held-out rows.
print("mean absolute error: ", mean_absolute_error(yTestNorm2, yPredNorm2))
print("mean squared error: ", mean_squared_error(yTestNorm2, yPredNorm2))
print("RMS: ", sqrt(mean_squared_error(yTestNorm2, yPredNorm2)))
print("R2 score: ", r2_score(yTestNorm2, yPredNorm2))

# Gives Highest R2 Score (hence best model)

In [None]:
# plotting values on data (linear regression)

# Straight-line (degree-1) least-squares fit of the linear model's
# predictions against the NOX values of the held-out rows.
x1 = xTest['NOX'].to_numpy()
y1 = np.array(yPred)
m, b = np.polyfit(x1, y1, 1)
print("m: ", m)
print("b: ", b)

# Fitted trend line, with the individual predictions overlaid as markers.
plt.figure(figsize = (16, 10))
plt.plot(x1, m * x1 + b)
plt.plot(x1, y1, 'o')
plt.xlabel("NOX")
plt.ylabel("Median Value")

In [None]:
# plotting values on normalized data (SVM regression)

# Straight-line (degree-1) least-squares fit of the SVR predictions against
# the scaled NOX values of the held-out rows.
x1Norm1 = xTestNorm1['NOX'].to_numpy()
y1Norm1 = np.array(yPredNorm1)
m, b = np.polyfit(x1Norm1, y1Norm1, 1)
print("m: ", m)
print("b: ", b)

# Fitted trend line, with the individual predictions overlaid as markers.
# Both axes are in min-max scaled units.
plt.figure(figsize = (16, 10))
plt.plot(x1Norm1, m * x1Norm1 + b)
plt.plot(x1Norm1, y1Norm1, 'o')
plt.xlabel("NOX")
plt.ylabel("Median Value")

In [None]:
# plotting values on normalized data (Random Forest regression)

# Straight-line (degree-1) least-squares fit of the random-forest predictions
# against the scaled NOX values of the held-out rows.
x1Norm2 = xTestNorm2['NOX'].to_numpy()
y1Norm2 = np.array(yPredNorm2)
m, b = np.polyfit(x1Norm2, y1Norm2, 1)
print("m: ", m)
print("b: ", b)

# Fitted trend line, with the individual predictions overlaid as markers.
# Both axes are in min-max scaled units.
plt.figure(figsize = (16, 10))
plt.plot(x1Norm2, m * x1Norm2 + b)
plt.plot(x1Norm2, y1Norm2, 'o')
plt.xlabel("NOX")
plt.ylabel("Median Value")

In [None]:
# One regression scatter panel per selected predictor, each plotted against
# MEDV on the unscaled frame.
sns.pairplot(
    df,
    x_vars = ['NOX', 'RM', 'DIS', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'],
    y_vars = 'MEDV',
    height = 9,
    aspect = 0.6,
    kind = 'reg',
)