### Imports

In [94]:
import pandas as pd
import numpy as np
import gdown
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### Download and read csv

In [95]:
url = "https://drive.google.com/u/0/uc?id=1VUn2WKkKeRXwH02K9bqH98KjPxrUmgXh&export=download"
filename = "HousingData.csv"
gdown.download(url, filename, quiet=False)

Downloading...
From: https://drive.google.com/u/0/uc?id=1VUn2WKkKeRXwH02K9bqH98KjPxrUmgXh&export=download
To: /home/vasilis/projects/ai2-1st-assignement/HousingData.csv
100%|██████████| 1.60M/1.60M [00:00<00:00, 2.22MB/s]


'HousingData.csv'

In [97]:
data = pd.read_csv("HousingData.csv")

In [98]:
# data['Bedrooms Per Room'] = data['AveRooms'] / data['AveBedrms']
# data['Occup Per Population'] = data['Population'] / data['AveOccup']

y = data['Median House Value']
X = data.drop('Median House Value', axis = 1)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.28, random_state=42)


### Batch Gradient Descent

In [100]:
reg = make_pipeline(StandardScaler(), SGDRegressor(loss='huber', penalty='l1', warm_start =  True, max_iter=1000, tol=1e-3))

### Cross Validation Score

In [101]:
%%time

score = cross_val_score(reg , X_train, y_train, scoring = "neg_mean_squared_error", cv = 10, n_jobs=4)

CPU times: user 42.6 ms, sys: 67.3 ms, total: 110 ms
Wall time: 2.87 s


In [102]:
tree_rmse_scores = np.sqrt(-score)
tree_rmse_scores.mean()

1.0604019728812613

In [103]:
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

In [104]:
mean_squared_error(y_test,y_pred,squared=True)

0.8132625259168083

### Stochastic Gradient Descent

In [105]:
# Stochastic Gradient Descent
reg = make_pipeline(StandardScaler(), SGDRegressor(loss='epsilon_insensitive', max_iter=1, penalty='elasticnet', tol=1e-3))

In [106]:
%%time

score = cross_val_score(reg , X_train, y_train, scoring = "neg_mean_squared_error", cv = 10, n_jobs=4)

CPU times: user 65.6 ms, sys: 22.1 ms, total: 87.7 ms
Wall time: 217 ms


In [107]:
tree_rmse_scores = np.sqrt(-score)
tree_rmse_scores.mean()

1.082600478376293

In [108]:
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

In [109]:
mean_squared_error(y_test,y_pred,squared=True)

1.4288836194533434

### Mini Batch 

In [116]:
# function to compute predictions created for the current dataset
# where y = w0 + w1*x1 +...+w8*x8 
def predict(X, theta):
  pred = np.dot(X, theta[0:9]) + theta[9]
  
# function to compute gradient of error 
def gradient(X, y, theta): 
    y = y.to_numpy().reshape((len(y), 1))
    h = np.dot(X, theta) 
    grad = (2/X.shape[0])*np.dot(X.transpose(), (h - y)) 
    return grad 
  
# function to compute MSE
def cost(X, y, theta): 
    y = y.to_numpy().reshape((len(y), 1))
    h = np.dot(X, theta) 
    J = np.dot((h - y).transpose(), (h - y)) 
    J /= X.shape[0]
    return J[0] 
  
# function to perform mini-batch gradient descent
def gradientDescent(X, y, learning_rate = 0.1, steps=10): 
    X = np.c_[ X, np.ones(X.shape[0]) ] #add a column of ones to X for the bias term
    theta = np.zeros((X.shape[1], 1)) #create inital weights w0, w1..wd
    error_list = []  
    for s in range(steps):
      theta = theta - learning_rate * gradient(X, y, theta) 
      error_list.append(cost(X, y, theta)) #useful for plotting changes when using different batch sizes
  
    return theta, error_list 



theta, errors = gradientDescent(X_train, y_train)
# cost_ = cost(X_train, y_train, theta)
predict(X_test, theta)
errors


[array([1.10600692e+15]),
 array([3.82597244e+29]),
 array([1.32354255e+44]),
 array([4.57861349e+58]),
 array([1.58390839e+73]),
 array([5.4793133e+87]),
 array([1.8954931e+102]),
 array([6.55719779e+116]),
 array([2.26837242e+131]),
 array([7.84712254e+145])]