# Week 1: Simple Linear Regression 

In [1]:
import pandas as pd
import numpy as np

## Read training and testing data

In [2]:
# Load the training and testing splits from CSV files located relative to the
# current working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./kc_house_train_data.csv', './kc_house_test_data.csv')
)

In [7]:
# Stack the training rows on top of the testing rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat performs the same row-wise stacking and, like append, keeps each
# frame's original index labels (no ignore_index).
sales = pd.concat([df_train, df_test])

In [9]:
# Select the 'price' column of the combined (train + test) frame.
prices = sales['price']

In [14]:
# Mean of the 'price' column over the combined train and test rows.
prices.mean()

540088.1417665294

## Build a generic simple linear regression function

In [22]:
def simple_linear_regression(input_feature, output):
    """Fit ``output = intercept + slope * input_feature`` by least squares.

    Uses the closed-form solution for a single feature.

    Parameters
    ----------
    input_feature : array-like
        Feature values (pandas Series, NumPy array, or plain sequence).
    output : array-like
        Observed target values, one per feature value.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or the feature values give
        a zero denominator (e.g. a constant feature), in which case the slope
        is undefined.
    """
    # Work in float64 so integer inputs (e.g. an int32 column) cannot silently
    # overflow when squared, and so plain Python sequences are accepted too.
    x = np.asarray(input_feature, dtype=np.float64).ravel()
    y = np.asarray(output, dtype=np.float64).ravel()

    if x.size == 0:
        raise ValueError("input_feature must contain at least one value")
    if x.size != y.size:
        raise ValueError(
            "input_feature and output must have the same number of values "
            "(got {} and {})".format(x.size, y.size)
        )

    N = x.size
    sum_x = x.sum()
    sum_y = y.sum()
    sum_xy = np.dot(x, y)
    sum_x2 = np.square(x).sum()

    # N times the (biased) variance of x; zero means no line can be fitted.
    denominator = sum_x2 - (sum_x * sum_x) / N
    if denominator == 0:
        raise ValueError("input_feature has zero variance; slope is undefined")

    slope = (sum_xy - (sum_x * sum_y) / N) / denominator
    # The least-squares line always passes through the point of means.
    intercept = y.mean() - slope * x.mean()
    return (intercept, slope)

In [23]:
# Fit price against the 'sqft_living' column using the training data only.
sqft_intercept, sqft_slope = simple_linear_regression(
    input_feature=df_train['sqft_living'],
    output=df_train['price'],
)

In [24]:
# Show the fitted parameters of the sqft_living model.
print('intercept:', sqft_intercept)
print('slope:', sqft_slope)

intercept: -47116.07907289418
slope: 281.9588396303426


## Predicting Values

In [25]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the line ``input_feature * slope + intercept``.

    Parameters
    ----------
    input_feature : scalar or array-like supporting ``*`` and ``+``
        Feature value(s) at which to evaluate the line.
    intercept : float
        Intercept of the line.
    slope : float
        Slope of the line.

    Returns
    -------
    Whatever ``input_feature * slope + intercept`` evaluates to: a scalar for
    a scalar input, an element-wise result for a NumPy array or pandas Series.
    """
    return input_feature * slope + intercept

In [26]:
# Prediction of the sqft_living model for an input feature value of 2650.
print(
    get_regression_predictions(
        input_feature=2650,
        intercept=sqft_intercept,
        slope=sqft_slope,
    )
)

700074.8459475137


## Residual Sum of Squares

In [27]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a fitted line.

    Parameters
    ----------
    input_feature : scalar or array-like
        Feature value(s) passed to ``get_regression_predictions``.
    output : scalar or array-like
        Observed value(s) to compare the predictions against.
    intercept : float
        Intercept of the line.
    slope : float
        Slope of the line.

    Returns
    -------
    The sum of the squared differences between the predictions and ``output``.
    """
    fitted = get_regression_predictions(input_feature, intercept, slope)
    squared_errors = np.square(fitted - output)
    return squared_errors.sum()

In [28]:
# Residual sum of squares of the sqft_living model on the training data.
get_residual_sum_of_squares(
    input_feature=df_train['sqft_living'],
    output=df_train['price'],
    intercept=sqft_intercept,
    slope=sqft_slope,
)

1201918354177283.0

## Inverse regression to predict feature

In [29]:
def inverse_regression_predictions(output, intercept, slope):
    """Solve ``output = intercept + slope * x`` for ``x``.

    Parameters
    ----------
    output : scalar or array-like
        Output value(s) of the line.
    intercept : float
        Intercept of the line.
    slope : float
        Slope of the line.

    Returns
    -------
    ``(output - intercept) / slope``: the input value(s) at which the line
    takes the value ``output``.
    """
    offset_from_intercept = output - intercept
    return offset_from_intercept / slope

In [30]:
# Input feature value at which the sqft_living model outputs 800000.
inverse_regression_predictions(
    output=800000,
    intercept=sqft_intercept,
    slope=sqft_slope,
)

3004.3962451522766

## New model: estimate prices from bedrooms

In [31]:
# Fit price against the 'bedrooms' column using the training data only.
bedrooms_intercept, bedrooms_slope = simple_linear_regression(
    input_feature=df_train['bedrooms'],
    output=df_train['price'],
)

## Test models

In [32]:
# Residual sum of squares of the sqft_living model on the test data.
get_residual_sum_of_squares(
    input_feature=df_test['sqft_living'],
    output=df_test['price'],
    intercept=sqft_intercept,
    slope=sqft_slope,
)

275402933617812.12

In [33]:
# Residual sum of squares of the bedrooms model on the test data.
get_residual_sum_of_squares(
    input_feature=df_test['bedrooms'],
    output=df_test['price'],
    intercept=bedrooms_intercept,
    slope=bedrooms_slope,
)

493364585960300.9