# MODULE 3: LINEAR REGRESSION FROM SCRATCH

    *************************************************************
    Author:  Adeyemi Adedoyin Simeon
    Program: MSc, Computer Science, University of Ibadan
    Course:  Machine Learning
    Date:    26th May, 2019
    Version: 1.2
    E-mail:  adeyemi.sa1@gmail.com
    *************************************************************
    
    *Note: Please reference the author whenever and wherever you use all or a portion of this code.*

## IMPLEMENTING THE SIMPLE LINEAR REGRESSION

In [1]:
import numpy as np
import pandas as pd

In [2]:
def my_mean(values):
    """Return the arithmetic mean of ``values``.

    ``values`` must work with both ``sum()`` and ``len()``. The element
    count is used directly as the divisor, so an empty input fails on the
    division.
    """
    total = sum(values)
    count = len(values)
    return total / count

In [3]:
def variance(x_values, x_mean):
    """Return the sum of squared deviations of ``x_values`` from ``x_mean``.

    The total is NOT divided by the number of samples, so this is the
    unnormalised sum of squares rather than a per-sample variance.
    """
    squared_deviations = ((value - x_mean) ** 2 for value in x_values)
    return sum(squared_deviations)

In [4]:
def covariance(x_values, y_values, x_mean, y_mean):
    """Return the sum of cross-deviations of ``x_values`` and ``y_values``.

    Computes ``sum((x - x_mean) * (y - y_mean))`` over paired samples. The
    total is NOT divided by the number of samples, which matches the
    unnormalised sum returned by ``variance``.

    Samples are paired by position rather than by ``values[i]`` lookup, so
    a pandas Series whose index does not start at 0 (e.g. a slice of a
    larger frame) is handled the same way as a plain list.

    Args:
        x_values: Sized iterable of numeric inputs.
        y_values: Sized iterable of numeric targets, same length as
            ``x_values``.
        x_mean: Mean of ``x_values``.
        y_mean: Mean of ``y_values``.

    Returns:
        The accumulated cross-deviation as a float (``0.0`` for empty input).

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    if len(x_values) != len(y_values):
        raise ValueError(
            "x_values and y_values must have the same length "
            f"(got {len(x_values)} and {len(y_values)})"
        )

    # Start from 0.0 so the result is a float even for empty or integer input.
    return sum(
        ((x - x_mean) * (y - y_mean) for x, y in zip(x_values, y_values)),
        0.0,
    )

In [5]:
# Slope (b1) of the fitted line
def coefficientB1(variance_value, covariance_value):
    """Return the slope ``covariance_value / variance_value``."""
    slope = covariance_value / variance_value
    return slope

In [6]:
# y-intercept (b0) of the fitted line
def coefficientB0(coef_B1_value, x_mean, y_mean):
    """Return the intercept ``y_mean - coef_B1_value * x_mean``."""
    slope_term = coef_B1_value * x_mean
    return y_mean - slope_term

In [7]:
def simple_linear_regression(train, test):
    """Fit ``y = b0 + b1 * x`` on ``train`` and predict for each row of ``test``.

    Rows are read positionally: ``row[0]`` is the input value and, for
    training rows, ``row[1]`` is the target. Returns a list holding one
    prediction per test row, in the order the rows were given.
    """
    inputs = [record[0] for record in train]
    targets = [record[1] for record in train]
    input_mean = my_mean(inputs)
    target_mean = my_mean(targets)

    spread = variance(inputs, input_mean)
    joint_spread = covariance(inputs, targets, input_mean, target_mean)

    slope = coefficientB1(spread, joint_spread)
    intercept = coefficientB0(slope, input_mean, target_mean)

    return [intercept + slope * record[0] for record in test]
    

In [8]:
def simple_linear_regression2(x_train, y_train, x_test, y_test):
    """Fit ``y = b0 + b1 * x`` on separate x/y sequences and predict ``x_test``.

    Unlike ``simple_linear_regression``, inputs and targets arrive as
    separate sequences instead of rows. ``y_test`` is accepted but never
    read by this function. Returns a list with one prediction per element
    of ``x_test``.
    """
    x_mean = my_mean(x_train)
    y_mean = my_mean(y_train)

    spread = variance(x_train, x_mean)
    joint_spread = covariance(x_train, y_train, x_mean, y_mean)

    slope = coefficientB1(spread, joint_spread)
    intercept = coefficientB0(slope, x_mean, y_mean)

    return [(intercept + slope * x_val) for x_val in x_test]
    

In [9]:
def get_true_values(test):
    """Return the second element (``row[1]``) of every row of ``test`` as a list."""
    return [record[1] for record in test]

In [10]:
def mean_absolute_err(y_true, predictions):
    """Return the mean absolute error between ``y_true`` and ``predictions``.

    Computed as ``mean(|y_true - predictions|)``. Taking the absolute value
    before averaging stops positive and negative errors from cancelling
    each other out.

    Args:
        y_true: Array-like of observed target values.
        predictions: Array-like of predicted values, paired with ``y_true``.

    Returns:
        The mean absolute error as a NumPy floating-point scalar.

    Raises:
        ValueError: If there are no samples to average over.
    """
    err = np.asarray(y_true) - np.asarray(predictions)
    if err.size == 0:
        raise ValueError("mean_absolute_err requires at least one sample")
    return np.mean(np.abs(err))

In [11]:
def mean_squared_err(y_true, predictions):
    """Return the mean squared error between ``y_true`` and ``predictions``.

    Computed as ``mean((y_true - predictions) ** 2)``, i.e. the sum of
    squared errors divided by the number of samples.

    Args:
        y_true: Array-like of observed target values.
        predictions: Array-like of predicted values, paired with ``y_true``.

    Returns:
        The mean squared error as a NumPy floating-point scalar.

    Raises:
        ValueError: If there are no samples to average over.
    """
    err = np.asarray(y_true) - np.asarray(predictions)
    if err.size == 0:
        raise ValueError("mean_squared_err requires at least one sample")
    return np.mean(err ** 2)

In [12]:
def root_mean_sq_err(y_true, predictions):
    """Return the square root of ``mean_squared_err(y_true, predictions)``."""
    return np.sqrt(mean_squared_err(y_true, predictions))

## Importing The Dataset

In [13]:
from sklearn.datasets import load_boston

In [14]:
# Load the Boston housing data through scikit-learn's dataset loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
ds = load_boston()

In [15]:
# Wrap the feature matrix in a DataFrame whose columns carry the feature names.
dataset = pd.DataFrame(ds.data, columns=ds.feature_names)

In [16]:
# Append the target values as a 'PRICE' column; it becomes the last column.
dataset['PRICE'] = ds.target

In [17]:
# Preview the first rows of the assembled table.
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### Extracting the first 20 rows of the Boston Housing dataset

In [18]:
# Keep only the first 20 rows (positions 0-19) for the worked example.
first_twenty = dataset.iloc[:20]

In [19]:
# Display the extracted subset.
first_twenty

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9


## Calculating the mean, variance, covariance, and the b1 and b0 coefficients of each attribute over the first 20 rows

In [29]:
# Build a per-attribute summary table. Every column except the last one
# (the target, first_twenty.columns[-1]) is paired with the target to compute
# its mean, the variance()/covariance() helper values, and the b1 (slope) and
# b0 (intercept) coefficients.
# The frame starts empty, indexed by attribute name, and is filled row by row.
results = pd.DataFrame(data=[],index=first_twenty.columns[:-1],columns=['Mean','Variance','Covariance','coef_B1','coef_Bo'])

for i,col in enumerate(first_twenty.columns[:-1]):
    x_vals = first_twenty[col]
    # The target column is the same on every iteration.
    y_vals = first_twenty[first_twenty.columns[-1]]
    
    mean_x = my_mean(x_vals)
    mean_y = my_mean(y_vals)
    
    vari = variance(x_vals, mean_x)
    co_var = covariance(x_vals, y_vals, mean_x, mean_y)
    # NOTE: an attribute whose variance is 0 (CHAS in the recorded output)
    # leaves coef_B1 and coef_Bo empty in the results table.
    b1 = coefficientB1(vari, co_var)
    b0 = coefficientB0(b1, mean_x, mean_y)
    
    # Add results to the table; row i matches col because the index was
    # built from the same column slice that is being enumerated.
    results.iloc[i] = [mean_x, vari, co_var, b1, b0]
    
    # Print a report for this attribute, numbered from 1.
    print(i+1,'.) FOR ATTRIBUTE \"'+ col+ '\":\n')
    print('-' * 80)
    print('\nMean = ', mean_x)
    print('\nVariance = ', vari)
    print('\nCovariance = ', co_var)
    print('\nb1 coefficient (x coef) = ', b1)
    print('\nb0 coefficient (y intercept) = ', b0)
    print('\n','-' * 80)
    print('\n\n')
    

1 .) FOR ATTRIBUTE "CRIM":

--------------------------------------------------------------------------------

Mean =  0.32521000000000005

Variance =  2.1624774500000004

Covariance =  -17.823946

b1 coefficient (x coef) =  -8.242373117000595

b0 coefficient (y intercept) =  25.53550216137976

 --------------------------------------------------------------------------------



2 .) FOR ATTRIBUTE "ZN":

--------------------------------------------------------------------------------

Mean =  5.275

Variance =  861.2374999999995

Covariance =  -216.70250000000004

b1 coefficient (x coef) =  -0.2516175851608879

b0 coefficient (y intercept) =  24.18228276172368

 --------------------------------------------------------------------------------



3 .) FOR ATTRIBUTE "INDUS":

--------------------------------------------------------------------------------

Mean =  6.753

Variance =  104.87842

Covariance =  -190.10129999999992

b1 coefficient (x coef) =  -1.81258737498143

b0 coefficient (y

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Display the per-attribute summary table.
results

Unnamed: 0,Mean,Variance,Covariance,coef_B1,coef_Bo
CRIM,0.32521,2.16248,-17.8239,-8.24237,25.5355
ZN,5.275,861.237,-216.703,-0.251618,24.1823
INDUS,6.753,104.878,-190.101,-1.81259,35.0954
CHAS,0.0,0.0,0.0,,
NOX,0.5142,0.0187652,-2.84372,-151.542,100.778
RM,6.19185,4.40707,45.6231,10.3522,-41.2445
AGE,67.43,7944.74,-992.253,-0.124894,31.2766
DIS,5.22189,16.0009,22.4833,1.40513,15.5176
RAD,3.85,26.55,-73.335,-2.76215,33.4893
TAX,288.6,23586.8,-3304.36,-0.140094,63.286


ValueError: cannot copy sequence with size 0 to array axis with dimension 5

In [27]:
# Display the results table.
# NOTE(review): this cell's execution count (27) is lower than the loop
# cell's (29), so the output recorded below may predate the loop — confirm.
results

Unnamed: 0,Mean,Variance,Covariance,coef_B1,coef_Bo
CRIM,12.0,15.0,12.0,17.0,32.0
ZN,,,,,
INDUS,,,,,
CHAS,,,,,
NOX,,,,,
RM,,,,,
AGE,,,,,
DIS,,,,,
RAD,,,,,
TAX,,,,,
