# Linear Regression

In [1]:
import numpy as np

In [2]:
data = np.loadtxt("Datasets/data.csv", delimiter=",")

In [3]:
x = data[:, 0] 
#  we are writing our algorithm for 1D array only
# but the internal algorithm which is written requires 2D array
# but in our case we are just going to write code for 1D array
y = data[:, 1]

In [4]:
print(x.shape)
print(y.shape)

(100,)
(100,)


In [5]:
# spliting into training and testing
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(x, y, test_size=0.3)
# if you want ssize of the test to be something else not 25%
# you can  pass an argument called text_size=value
# value is a franction b/w 0 to 1
# by default the value is 0.25 (i.e. 25%)
print(X_train.shape)
print(Y_train.shape)

(70,)
(70,)


In [6]:
# writing a fit function so that we can pass the training data to the fit function so that the fn. can give us m & c values
def fit(x_train, y_train): 
    numerator = (x_train * y_train).mean() - x_train.mean() * y_train.mean()
    denominator = (x_train**2).mean() - (x_train.mean()**2)
    m = numerator / denominator
    c = y_train.mean() - m * x_train.mean()
    return m, c

In [7]:
# writing prediction function for testing data or training data
def predict(x, m, c):
     return m * x + c
# x can be X_test or Y_test

In [8]:
# writing score function (coeficient of determination)
def score(y_truth, y_predicted):
    u = ((y_truth - y_predicted)**2).sum()
    v = ((y_truth - y_truth.mean())**2).sum()
    return 1 - (u/v)
# y_truth & y_predicted can be y_test_truth & y_test_predicted or y_train_truth & y_train_predicted

In [9]:
# writing cost function
def cost(x, y, m, c):
    return ((y - (m * x + c))**2).mean()

In [10]:
m, c = fit(X_train, Y_train)

In [11]:
# predicting on testing data
y_test_predicted = predict(X_test, m, c)

In [12]:
# predicting on training data
y_train_predicted = predict(X_train, m, c)

In [13]:
print("Test score: ", score(Y_test, y_test_predicted))
print("Train score: ", score(Y_train, y_train_predicted))
print("m, c", m, c)
print("Cost on training data: ", cost(X_train, Y_train, m, c))
# the value m & c that's where the cost function is minimized
print("Cost on training data: ", cost(X_train, Y_train, m+1, c))
# so any variation or change in m & c will lead to higher cost function

Test score:  0.5535606457097704
Train score:  0.611443107430611
m, c 1.3497728143102237 7.289891082083074
Cost on training data:  113.51885767739448
Cost on training data:  2543.3043789381354


## Compairing with inbuild Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression
algorithm = LinearRegression()
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
algorithm.fit(X_train, Y_train)

In [15]:
print("Test score: ", algorithm.score(X_test, Y_test))
print("Train score: ", algorithm.score(X_train, Y_train))
print("m, c", algorithm.coef_[0], algorithm.intercept_)
print("Cost on training data: ", cost(X_train, Y_train, m, c))
print("Cost on training data: ", cost(X_train, Y_train, m+1, c))

Test score:  0.5535606457097711
Train score:  0.611443107430611
m, c 1.3497728143102148 7.2898910820835
Cost on training data:  470.79123872437066
Cost on training data:  3165.2675118354914
