### Imports

In [1]:
import pandas as pd
import numpy as np
import gdown
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### Download and read csv

In [2]:
# Location of the housing CSV on Google Drive and the local file it is saved to.
url = "https://drive.google.com/u/0/uc?id=1VUn2WKkKeRXwH02K9bqH98KjPxrUmgXh&export=download"
filename = "HousingData.csv"

# Fetch the file with a visible progress bar; the call's return value is the
# cell output.
gdown.download(url=url, output=filename, quiet=False)

Downloading...
From: https://drive.google.com/u/0/uc?id=1VUn2WKkKeRXwH02K9bqH98KjPxrUmgXh&export=download
To: /home/vasilis/projects/ai2-1st-assignement/HousingData.csv
100%|██████████| 1.60M/1.60M [00:00<00:00, 3.20MB/s]


'HousingData.csv'

In [2]:
# Load the downloaded CSV and discard every row that has a missing value.
data = pd.read_csv("HousingData.csv").dropna()

In [3]:
# Engineered features that were tried and are currently disabled.
# (The first ratio, AveRooms / AveBedrms, is rooms per bedroom despite its label.)
# data['Bedrooms Per Room'] = data['AveRooms'] / data['AveBedrms']
# data['Occup Per Population'] = data['Population'] / data['AveOccup']

# Columns used as model inputs, in the order the weight vector will follow.
FEATURE_COLUMNS = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
TARGET_COLUMNS = ['Median House Value']

# Selecting with a list keeps the target as a single-column DataFrame (n, 1).
y = data[TARGET_COLUMNS]
X = data[FEATURE_COLUMNS]


In [4]:
# Standardise the feature matrix; fit_transform returns a NumPy array, so X is
# no longer a DataFrame after this cell.
# NOTE(review): the scaler is fitted on the full feature matrix *before* the
# train/test split further down, so test-set statistics influence the scaling
# seen during training (data leakage). Consider splitting first and fitting
# the scaler on the training portion only.
sc = StandardScaler()
X = sc.fit_transform(X)

In [5]:
# Hold out 28% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.28,
    random_state=42,
)

### Stochastic Gradient Descent

In [6]:
# Linear regressor trained by stochastic gradient descent with the Huber loss.
# random_state pins the per-epoch shuffling so the cross-validation scores and
# the R^2 computed below are reproducible from one run to the next.
reg = SGDRegressor(loss='huber', max_iter=1000, random_state=42)

### Cross Validation Score

In [7]:
%%time

# 10-fold cross-validation on the training split, scored with negated MSE
# (scikit-learn scorers are "higher is better", hence the sign).
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# does not emit a column-vector conversion warning in every fold.
score = cross_val_score(
    reg,
    X_train,
    np.ravel(y_train),
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=4,
)

CPU times: user 48.7 ms, sys: 35 ms, total: 83.7 ms
Wall time: 1.91 s


In [8]:
# Turn the negated-MSE fold scores into per-fold RMSE values.
# (The variable name is kept as-is; the scores come from the SGD regressor.)
tree_rmse_scores = np.sqrt(np.negative(score))

# Average RMSE across the folds, shown as the cell output.
np.mean(tree_rmse_scores)

0.767722592105251

In [9]:
# Fit on the training split. y_train is a single-column DataFrame, so flatten
# it to the 1-D target the estimator expects (avoids the column-vector warning).
reg.fit(X_train, np.ravel(y_train))

# Predictions for the held-out rows.
y_pred = reg.predict(X_test)

In [10]:
# Coefficient of determination of the SGD regressor on the held-out split.
r2_score(y_test, y_pred)

0.5362848219576521

### Batch Gradient Descent

In [11]:
# function to compute predictions of the linear model
# y = w1*x1 + ... + wd*xd + b, where theta = [w1, ..., wd, b]
def predict(X, theta):
    """Return the linear-model predictions for X.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix WITHOUT the bias column.
    theta : array of shape (n_features + 1, 1) or (n_features + 1,)
        Weights with the bias term stored as the LAST element, matching the
        column of ones that gradientDescent appends to X.

    Returns
    -------
    array
        np.dot(X, weights) + bias.
    """
    # Slice relative to the end so any number of features works, not only 8.
    return np.dot(X, theta[:-1]) + theta[-1]
  
# gradient of the mean squared error with respect to theta
def gradient(X, y, theta):
    """Return (2 / n_samples) * X^T (X theta - y)."""
    residual = np.dot(X, theta) - y
    scale = 2 / X.shape[0]
    return scale * np.dot(X.T, residual)
  
# mean squared error of the linear model on (X, y)
def cost(X, y, theta):
    """Return the MSE as the first row of the (1, 1) product residual^T residual / n."""
    residual = np.dot(X, theta) - y
    squared_error = np.dot(residual.T, residual)
    squared_error /= X.shape[0]
    return squared_error[0]
  
# function to perform full-batch gradient descent
def gradientDescent(X, y, learning_rate = 0.1, steps=1000):
    """Fit a linear model by gradient descent using all rows at every step.

    Returns the final weights (bias stored last) together with the list of
    training costs recorded after each update.
    """
    # Append a column of ones so the last weight plays the role of the bias.
    design = np.c_[X, np.ones(X.shape[0])]
    n_weights = design.shape[1]
    theta = np.zeros((n_weights, 1))  # start from all-zero weights

    error_list = []
    for _ in range(steps):
        theta = theta - learning_rate * gradient(design, y, theta)
        # Track the cost after every update so convergence can be plotted.
        error_list.append(cost(design, y, theta))

    return theta, error_list

# Train with full-batch gradient descent using the default learning rate and
# number of steps, then predict on the held-out rows.
theta, errors = gradientDescent(X_train, y_train)
# cost_ = cost(X_train, y_train, theta)
y_pred = predict(X_test, theta)
# Display the learned weights (the bias is the last entry).
theta



array([[ 8.14286526e-01],
       [ 1.22891973e-01],
       [-2.58237972e-01],
       [ 2.96452200e-01],
       [-8.85567085e-05],
       [-7.33481403e-02],
       [-8.92730259e-01],
       [-8.56770213e-01],
       [ 2.05752441e+00]])

In [12]:
# Coefficient of determination of the batch gradient-descent model on the held-out split.
r2_score(y_test,y_pred)


0.594648235237901

In [13]:
# Mean squared error of the batch gradient-descent model on the held-out split.
mean_squared_error(y_test, y_pred)

0.5436200487640566

### Mini-Batch Gradient Descent

In [22]:
# function to compute predictions of the linear model
# y = w1*x1 + ... + wd*xd + b, where theta = [w1, ..., wd, b]
def predict(X, theta):
    """Return the linear-model predictions for X.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix WITHOUT the bias column.
    theta : array of shape (n_features + 1, 1) or (n_features + 1,)
        Weights with the bias term stored as the LAST element, matching the
        column of ones that gradientDescent appends to X.

    Returns
    -------
    array
        np.dot(X, weights) + bias.
    """
    # Slice relative to the end so any number of features works, not only 8.
    return np.dot(X, theta[:-1]) + theta[-1]
  
# gradient of the mean squared error with respect to theta
def gradient(X, y, theta):
    """Return (2 / n_samples) * X^T (X theta - y)."""
    residual = np.dot(X, theta) - y
    scale = 2 / X.shape[0]
    return scale * np.dot(X.T, residual)
  
# mean squared error of the linear model on (X, y)
def cost(X, y, theta):
    """Return the MSE as the first row of the (1, 1) product residual^T residual / n."""
    residual = np.dot(X, theta) - y
    squared_error = np.dot(residual.T, residual)
    squared_error /= X.shape[0]
    return squared_error[0]

# function to create a list containing mini-batches
def create_mini_batches(X, y, batch_size):
    """Shuffle (X, y) together and cut the rows into consecutive mini-batches.

    Parameters
    ----------
    X : array of shape (n_samples, n_columns)
    y : array-like of shape (n_samples, 1)
        Target column; it is stacked to the right of X so rows stay aligned
        while shuffling.
    batch_size : int
        Number of rows per batch; must be positive.

    Returns
    -------
    list of (X_mini, Y_mini) tuples
        Every row appears in exactly one batch. All batches hold batch_size
        rows except possibly the last, which holds the remainder. No batch is
        empty.

    Raises
    ------
    ValueError
        If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    data = np.hstack((X, y))
    np.random.shuffle(data)  # shuffles rows in place on the stacked copy

    mini_batches = []
    # Stepping by batch_size yields each full batch once and the trailing
    # partial batch (if any) once. The previous loop ran one iteration too
    # many, producing an empty batch or a duplicated remainder.
    for start in range(0, data.shape[0], batch_size):
        mini_batch = data[start:start + batch_size, :]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
    return mini_batches
  
# function to perform mini-batch gradient descent
def gradientDescent(X, y, learning_rate = 0.1, steps=1000, mini_batch_size=12):
    """Fit a linear model with mini-batch gradient descent.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : array-like of shape (n_samples, 1)
    learning_rate : float
        Step size applied to every mini-batch gradient.
    steps : int
        Number of passes over the data; the rows are reshuffled on each pass.
    mini_batch_size : int
        Number of rows per mini-batch.

    Returns
    -------
    (theta, error_list)
        theta has shape (n_features + 1, 1) with the bias stored last;
        error_list holds the mini-batch cost recorded after every update.
    """
    X = np.c_[X, np.ones(X.shape[0])]  # add a column of ones to X for the bias term
    theta = np.zeros((X.shape[1], 1))  # initial weights w1..wd and bias, all zero
    error_list = []
    for _ in range(steps):
        for X_mini, y_mini in create_mini_batches(X, y, mini_batch_size):
            # An empty batch carries no information and would divide by zero
            # when the gradient is averaged over its rows.
            if X_mini.shape[0] == 0:
                continue
            theta = theta - learning_rate * gradient(X_mini, y_mini, theta)
            # useful for plotting changes when using different batch sizes
            error_list.append(cost(X_mini, y_mini, theta))
    return theta, error_list

# Train with mini-batch gradient descent using the defaults
# (learning_rate=0.1, steps=1000, mini_batch_size=12) and predict on the
# held-out rows.
# NOTE(review): the R^2 reported for this run is hugely negative, which
# suggests the updates diverged. Presumably the learning rate is too large
# for such small batches; try a smaller learning_rate and confirm.
theta, errors = gradientDescent(X_train, y_train)
y_pred = predict(X_test, theta)



In [23]:
# Coefficient of determination of the mini-batch model on the held-out split;
# a negative value means the fit is worse than always predicting the mean.
r2_score(y_test,y_pred)

-82438.46872992134