### Imports

In [46]:
import pandas as pd
import numpy as np

In [47]:
from sklearn.linear_model import LinearRegression

In [48]:
from sklearn.model_selection import train_test_split

### Data Processing

In [49]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float,
              'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str,
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int,
              'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int,
              'view':int}

In [50]:
training_data = pd.read_csv('kc_house_train_data.csv', dtype= dtype_dict)

In [51]:
len(training_data)

17384

In [52]:
testing_data = pd.read_csv('kc_house_test_data.csv', dtype= dtype_dict)

In [53]:
len(testing_data)

4229

### use the closed form solution from lecture to calculate the slope and intercept

In [54]:
def simple_linear_regression(input_feature, output):
    x = np.array(input_feature)
    y = np.array(output)
    denominator = x.dot(x) - x.mean() * x.sum()
    slope = (x.dot(y) - y.mean() * x.sum() )/ denominator
    intercept = (y.mean() * x.dot(x) - x.mean()*x.dot(y))/ denominator
    return (intercept, slope)

In [55]:
input_feature = training_data['sqft_living']
output = training_data['price']

In [56]:
squarefeet_intercept, squarfeet_slope = simple_linear_regression(input_feature, output)

In [57]:
squarfeet_slope

281.9588396303426

In [58]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = slope * input_feature + intercept
    return predicted_output

In [59]:
result = get_regression_predictions(input_feature= 2650 , intercept= squarefeet_intercept, slope= squarfeet_slope )

In [60]:
result

700074.84594751336

### What is the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data

In [61]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    x = np.array(input_feature)
    y = np.array(output)
    y_hat = get_regression_predictions(input_feature =x, intercept = intercept, slope = slope)
    RSS = 0.0
    for i in range(len(y)):
        RSS += np.square(y[i]-y_hat[i])
#     rss = (y - y_hat)
#     RSS = (rss**2).sum()
    return RSS

In [62]:
def inverse_regression_predictions(output, intercept, slope):
    new_slope = 1 / slope
    new_intercept = - intercept/ slope
    estimated_input =  new_slope * output + new_intercept
    return estimated_input

In [63]:
RSS = get_residual_sum_of_squares(input_feature=input_feature, output=input_feature,
                                  intercept= squarefeet_intercept, slope= squarfeet_slope)

In [64]:
RSS

6183890961904702.0

In [65]:
RSS/ len(input_feature)

355723134025.81122

In [66]:
inv_predict_sqft=inverse_regression_predictions(output=800000, intercept= squarefeet_intercept, slope= squarfeet_slope)

In [67]:
inv_predict_sqft

3004.3962451522771

In [68]:
np.array([2,3])**2

array([4, 9])