In [1]:
# implementing the linear regression algorithm for a single feature input

In [2]:
# importing the required modules/libraries/packages

import numpy as np
import pandas as pd
from sklearn import model_selection

In [3]:
# read the comma-separated dataset from disk into a numpy array

data = np.loadtxt(fname="data.csv", delimiter=",")

In [4]:
# wrap the raw array in a DataFrame so it can be inspected as a table

df = pd.DataFrame(data=data)

In [5]:
# preview the first five rows of the dataset

df.head(n=5)

Unnamed: 0,0,1
0,32.502345,31.707006
1,53.426804,68.777596
2,61.530358,62.562382
3,47.47564,71.546632
4,59.813208,87.230925


In [6]:
# column 0 is used as the input feature, column 1 as the output to predict

X, Y = data[:, 0], data[:, 1]

In [7]:
# hold out part of the data for testing; random_state is fixed so the split is reproducible

(X_train, X_test,
 Y_train, Y_test) = model_selection.train_test_split(X, Y, random_state=1)

In [8]:
# fitting the training data into the algorithm

def fit(x, y):
    """Return the slope and intercept of the best-fit line y = m*x + c.

    The values are the closed-form minimisers of the squared-error cost,
    obtained by differentiating the cost with respect to m and c and
    setting both derivatives to zero.

    Parameters
    ----------
    x : array-like
        Input feature values.
    y : array-like
        Output values, one per entry of ``x``.

    Returns
    -------
    tuple
        ``(m, c)`` - the slope and the intercept of the fitted line.
    """
    xs = np.array(x)
    ys = np.array(y)

    # the four averages the closed-form solution is built from
    mean_x = np.mean(xs)
    mean_y = np.mean(ys)
    mean_x_squared = np.mean(xs ** 2)
    mean_xy = np.mean(xs * ys)

    # slope = (mean(xy) - mean(x) * mean(y)) / (mean(x^2) - mean(x)^2)
    numerator = mean_xy - (mean_x * mean_y)
    denominator = mean_x_squared - (mean_x ** 2)
    slope = numerator / denominator

    # the best-fit line always passes through (mean_x, mean_y)
    intercept = mean_y - slope * mean_x

    return (slope, intercept)

In [10]:
# getting the minimum average total cost or error associated with all the training data points, after finding the optimal values of m and c

def cost(x, y, m, c):
    """Return the mean squared error of the line y = m*x + c on the given points.

    Parameters
    ----------
    x : array-like
        Input feature values.
    y : array-like
        True output values, one per entry of ``x``.
    m : float
        Slope of the line.
    c : float
        Intercept of the line.

    Returns
    -------
    float
        Sum of squared differences between ``y`` and ``m*x + c`` divided by
        the number of points passed in.
    """
    np_x = np.array(x)
    np_y = np.array(y)

    squared_errors = (np_y - ((m * np_x) + c)) ** 2

    # average over the points actually supplied to this call; relying on the
    # global X_train gave a wrong divisor for any other dataset and failed
    # outright when X_train was not defined
    return squared_errors.sum() / squared_errors.size

In [11]:
# predicting the output for a test x input according to the best fit line i.e. y=m*x+c

def predict(x, m, c):
    """Evaluate the line y = m*x + c at ``x``.

    Parameters
    ----------
    x : number or numpy array
        Input value(s) to predict for.
    m : float
        Slope of the line.
    c : float
        Intercept of the line.

    Returns
    -------
    The result of ``m*x + c``.
    """
    return (m * x) + c

In [12]:
# finding the score for the algorithm by comparing the predicted and the truth values

def score(Y_pred, Y_true):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - u/v`` where ``u`` is the sum of squared differences
    between the true and predicted values and ``v`` is the sum of squared
    differences between the true values and their mean.

    Parameters
    ----------
    Y_pred : array-like
        Predicted output values.
    Y_true : array-like
        True output values, one per entry of ``Y_pred``.

    Returns
    -------
    float
        The R^2 value.
    """
    predicted = np.array(Y_pred)
    actual = np.array(Y_true)

    # u: squared error left over after using the predictions
    residual_ss = np.sum((actual - predicted) ** 2)
    # v: squared error of always predicting the mean of the true values
    total_ss = np.sum((actual - np.mean(actual)) ** 2)

    return 1 - (residual_ss / total_ss)

In [13]:
# fit the line on the training split: m is the slope, c is the intercept

m, c = fit(x=X_train, y=Y_train)

In [14]:
# average squared error of the fitted line over the training data points
# (kept under its own name so the cost() function is not overwritten and this cell can be re-run)

training_cost = cost(X_train, Y_train, m, c)

print(training_cost)

93.62715031893556


In [15]:
# run the fitted line over the test inputs and show each predicted value on its own line

Y_pred = predict(x=X_test, m=m, c=c)

for predicted_value in Y_pred:
    print(predicted_value)

81.77289606029456
82.86481582350953
83.32806576858246
81.73311706338174
68.63768427534484
85.95355347998157
87.92181466135547
53.991289106056016
55.64696334662755
60.42305571451358
96.37827114300786
58.10598640885901
62.66328391049501
61.70532436092045
66.50996680360466
63.96661918310784
55.51122714960584
89.27747740386145
80.42434453751088
67.63904196590036
87.47143059983519
64.79305132691573
73.32613112915536
76.20531030753533
59.51077223498662


In [16]:
# storing the predictions in a csv file
# (a separate name is used so the DataFrame holding the original dataset is not replaced)

predictions_df = pd.DataFrame(Y_pred)

predictions_df.to_csv("predictions1.csv", header=False, index=False)

In [17]:
# calculating the score of the algorithm on the test split
# (kept under its own name so the score() function is not overwritten and this cell can be re-run)

test_score = score(Y_pred, Y_test)

print(test_score)

0.57882794990919
