# Import Dataset


In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv('/content/drive/MyDrive/FUNIX Progress/MLP302x_1.1-A_EN/data/kc_house_data.csv')

In [3]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 0)

# Simple Linear Regression

## Close Form Approach

### Function Implementation

In [9]:
from scipy import stats

def correlation_approach(input, output):
  pearson_r = stats.pearsonr(input, output)[0]

  std_X = np.std(input)
  std_Y = np.std(output)

  mean_X = input.mean()
  mean_Y = output.mean()

  slope = pearson_r * (std_Y / std_X)
  intercept = mean_Y - slope * mean_X

  return intercept, slope

In [12]:
def sum_approach(input, output):
  N = len(input)

  sum_XY = np.dot(input, output)
  sum_X = sum(input)
  sum_Y = sum(output)

  mean_X = input.mean()
  mean_Y = output.mean()

  sum_square_X = np.dot(input, input)

  slope = (sum_XY - 1/N * sum_X * sum_Y) / (sum_square_X - 1/N * sum_X * sum_X)
  intercept = mean_Y - slope * mean_X

  return intercept, slope

In [7]:
def mean_approach(input, output):
  mean_XY = (input * output).mean()

  mean_X = input.mean()
  mean_Y = output.mean()

  mean_square_X = (input * input).mean()
  
  slope = (mean_XY - mean_X * mean_Y) / (mean_square_X - mean_X * mean_X)
  intercept = mean_Y - slope * mean_X

  return intercept, slope

### Testing Section

In [10]:
mock_feature = np.array(range(5))
mock_output = 1 + 1*mock_feature
(mock_intercept, mock_slope) = correlation_approach(mock_feature, mock_output)
print("Intercept: %.2f" % (mock_intercept))
print("Slope: %.2f" % (mock_slope))

Intercept: 1.00
Slope: 1.00


In [13]:
mock_feature = np.array(range(5))
mock_output = 1 + 1*mock_feature
(mock_intercept, mock_slope) = sum_approach(mock_feature, mock_output)
print("Intercept: %.2f" % (mock_intercept))
print("Slope: %.2f" % (mock_slope))

Intercept: 1.00
Slope: 1.00


In [14]:
mock_feature = np.array(range(5))
mock_output = 1 + 1*mock_feature
(mock_intercept, mock_slope) = mean_approach(mock_feature, mock_output)
print("Intercept: %.2f" % (mock_intercept))
print("Slope: %.2f" % (mock_slope))

Intercept: 1.00
Slope: 1.00


In [15]:
mock_feature = train['sqft_living'].values
mock_output = train['price'].values
(mock_intercept, mock_slope) = mean_approach(mock_feature, mock_output)
print("Intercept: %.2f" % (mock_intercept))
print("Slope: %.2f" % (mock_slope))

Intercept: -48257.06
Slope: 283.97


## Gradient Descent

### Function Implementation

In [16]:
def predict(intercept, slope, input):
  return intercept + slope * input

In [17]:
def residual(prediction, label):
  return prediction - label

In [18]:
def rss_intercept(error):
  return sum(error)

In [19]:
def rss_slope(error, input):
  return np.dot(error, input)

In [20]:
def modulus(vector):
  return np.sqrt(np.dot(vector, vector))

In [21]:
def gradient_descent(input, label, initial_intercept, initial_slope, step_size, tolerance):
  intercept = np.array(initial_intercept)
  slope = np.array(initial_slope)

  converged = False

  while not converged:
    prediction = predict(intercept, slope, input)
    error = residual(prediction, label)

    grad_intercept = rss_intercept(error)
    grad_slope = rss_slope(error, input)

    intercept = intercept - step_size * grad_intercept
    slope = slope - step_size * grad_slope

    if(modulus(np.array([grad_intercept, grad_slope])) < tolerance):
      converged = True

  return intercept, slope

### Testing Section

In [22]:
initial_intercept = 0
initial_slope = 0
step_size = 0.05
tolerance = 0.01

input = np.array(range(5))
output = np.array([1, 3, 7, 13, 21])

intercept, slope = gradient_descent(input, output, initial_intercept, initial_slope, step_size, tolerance)

print(intercept, slope)

-0.9942069818917416 4.997967918970868


In [23]:
initial_intercept = -47000
initial_slope = 1
step_size = 7e-12
tolerance = 2.5e7

input = train['sqft_living'].values
output = train['price'].values

intercept, slope = gradient_descent(input, output, initial_intercept, initial_slope, step_size, tolerance)

print(intercept, slope)

-46999.88700248911 283.4638130669731


# Multiple Regression

## Feature Extraction

In [24]:
def matrix_extraction(data, feature_list, label_list):
  data['constant'] = 1

  feature_list = ['constant'] + feature_list

  feature = data[feature_list].values
  label = data[label_list].values

  return feature, label

## Close Form Approach

### Matrix Notation

In [46]:
def equation_solve(feature, label):
  return np.dot(np.dot(-feature.T, label), np.linalg.inv(np.dot(feature.T, feature)))

### Testing Section

In [36]:
(feature, label) = matrix_extraction(df, ['sqft_living'], 'price')
print(feature[0,:])
print(label[0])
# [   1 1180]
# 221900.0

[   1 1180]
221900.0


In [47]:
equation_solve(example_features, example_output)

array([43580.74309447,  -280.6235679 ])

## Gradient Descent Matrix Form

### Function Implementation

In [55]:
def predict(feature, weight):
  return np.dot(feature, weight)

In [49]:
def residual(prediction, label):
  return prediction - label

In [50]:
def rss_gradient(error, feature):
  return 2 * np.dot(error, feature)

In [56]:
def multiple_gradient_descent(feature, label, weight, step_size, tolerance):
  converged = False

  while not converged:
    prediction = predict(feature, weight)
    error = residual(prediction, label)

    gradient = rss_gradient(error, feature)

    weight = weight - step_size * gradient

    if(modulus(gradient) < tolerance):
      converged = True

  return weight

### Testing Section

In [57]:
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = matrix_extraction(train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

updated_weights = multiple_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
updated_weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array([-46999.88720259,    283.46383063])

In [58]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'
(feature_matrix, output) = matrix_extraction(train, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

# Just as the previous section did
weights = multiple_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [59]:
weights

array([-9.99999757e+04,  2.47055837e+02,  6.47974873e+01])

In [63]:
predict(feature_matrix, weights)[0]

402569.24095227406

In [64]:
train['price'].values[0]

495000.0

## Gradient Descent Vector Form

In [67]:
def multiple_gradient_descent(feature, label, weight, step_size, tolerance):
  converged = False

  while not converged:
    prediction = predict(feature, weight)
    error = residual(prediction, label)

    gradient = 0
    for i in range(len(weight)):
      partial = rss_gradient(error, feature[:, i])

      weight[i] = weight[i] - step_size * partial

      gradient = gradient + partial ** 2
    
    if(np.sqrt(gradient) < tolerance):
      converged = True
  
  return weight

In [78]:
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = matrix_extraction(train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

updated_weights = multiple_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
updated_weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array([-46999.88720259,    283.46383063])

In [81]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'
(feature_matrix, output) = matrix_extraction(train, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

# Just as the previous section did
weights = multiple_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array([-9.99999757e+04,  2.47055837e+02,  6.47974873e+01])