In [1]:
# implementing the gradient descent algorithm for a single feature input

In [2]:
# importing the required modules/libraries/packages

import numpy as np
import pandas as pd
from sklearn import model_selection

In [3]:
# Read the comma-separated samples from disk into a NumPy array.

data = np.loadtxt(fname="data.csv", delimiter=",")

In [4]:
# Wrap the raw array in a DataFrame so it can be inspected as a table.

df = pd.DataFrame(data=data)

In [5]:
# Preview the first five rows (column 0 is the input, column 1 the output).

df.head(n=5)

Unnamed: 0,0,1
0,32.502345,31.707006
1,53.426804,68.777596
2,61.530358,62.562382
3,47.47564,71.546632
4,59.813208,87.230925


In [6]:
# Column 0 holds the single input feature, column 1 the target value.

X, Y = data[:, 0], data[:, 1]

In [7]:
# Hold out part of the samples for testing; random_state pins the shuffle
# so the same split is produced on every run.

(X_train, X_test,
 Y_train, Y_test) = model_selection.train_test_split(X, Y, random_state=1)

In [8]:
# finding the score for the algorithm by comparing the predicted and the truth values

def score(Y_pred, Y_true):
    """Return the coefficient of determination (R^2) of the predictions.

    R^2 = 1 - u / v, where ``u`` is the sum of squared residuals
    ``(Y_true - Y_pred)**2`` and ``v`` is the sum of squared deviations of
    ``Y_true`` from its own mean.

    Parameters
    ----------
    Y_pred : array-like of numbers
        Predicted values.
    Y_true : array-like of numbers
        Ground-truth values.

    Returns
    -------
    float
        1.0 for a perfect fit, 0.0 for a model no better than predicting the
        mean of ``Y_true``; negative for a worse model.  When ``Y_true`` is
        constant (``v == 0``) the ratio is undefined, so 1.0 is returned for
        exact predictions and 0.0 otherwise (the same convention
        scikit-learn's ``r2_score`` uses by default) instead of nan / -inf.
        For empty input the result is nan.
    """
    predicted = np.asarray(Y_pred, dtype=float)
    actual = np.asarray(Y_true, dtype=float)

    # R^2 is undefined without samples; return nan without triggering
    # numpy's "mean of empty slice" warning.
    if actual.size == 0:
        return float("nan")

    u = ((actual - predicted) ** 2).sum()
    v = ((actual - actual.mean()) ** 2).sum()

    # Constant targets: avoid dividing by zero.
    if v == 0:
        return 1.0 if u == 0 else 0.0

    return 1 - (u / v)

In [9]:
# evaluating the best fit line y = m*x + c at the given input(s)

def predict(x, m, c):
    """Return ``m * x + c`` for a scalar or array ``x``.

    ``m`` is the slope and ``c`` the intercept of the fitted line.
    """
    return m * x + c

In [10]:
# step gradient function which updates the m and c values

def step_gradient(learning_rate, m, c, X=None, Y=None):
    """Take one gradient-descent step on the mean squared error of y = m*x + c.

    The cost is ``(1/N) * sum((y - (m*x + c))**2)``; its partial derivatives
    are ``(-2/N) * sum((y - (m*x + c)) * x)`` with respect to ``m`` and
    ``(-2/N) * sum(y - (m*x + c))`` with respect to ``c``.

    Parameters
    ----------
    learning_rate : float
        Step size multiplied into each partial derivative.
    m, c : float
        Current slope and intercept.
    X, Y : array-like of numbers, optional
        Inputs and targets to compute the gradient on.  When omitted, the
        module-level ``X_train`` / ``Y_train`` are used, which is what the
        original three-argument call did.

    Returns
    -------
    tuple
        ``(m, c)`` after the update.

    Raises
    ------
    ZeroDivisionError
        If there are no samples (N == 0), as before.
    """
    x = np.asarray(X_train if X is None else X, dtype=float)
    y = np.asarray(Y_train if Y is None else Y, dtype=float)

    n = len(x)
    scale = -2 / n  # plain Python division: raises ZeroDivisionError for n == 0

    # residual of every sample at once instead of a per-sample Python loop
    residual = y - ((m * x) + c)

    m_slope = scale * (residual * x).sum()
    c_slope = scale * residual.sum()

    # moving against the gradient
    return (m - (learning_rate * m_slope), c - (learning_rate * c_slope))

In [11]:
# cost function to calculate the average cost for a particular value of m and c

def cost(m, c, X=None, Y=None):
    """Return the mean squared error of the line y = m*x + c.

    Parameters
    ----------
    m, c : float
        Slope and intercept of the line being evaluated.
    X, Y : array-like of numbers, optional
        Inputs and targets to evaluate on.  When omitted, the module-level
        ``X_train`` / ``Y_train`` are used, which is what the original
        two-argument call did.

    Returns
    -------
    float
        ``(1/N) * sum((y - (m*x + c))**2)`` over the N samples.

    Raises
    ------
    ZeroDivisionError
        If there are no samples (N == 0), as before.
    """
    x = np.asarray(X_train if X is None else X, dtype=float)
    y = np.asarray(Y_train if Y is None else Y, dtype=float)

    n = len(x)
    if n == 0:
        # numpy would silently return nan here; keep the original failure mode
        raise ZeroDivisionError("cost is undefined for an empty data set")

    # squared residual of every sample at once instead of a Python loop
    residual = y - ((m * x) + c)

    return (residual ** 2).sum() / n

In [12]:
# gd function to calculate the optimal values of m and c for the best fit line

def gd(learning_rate, tolerance=0.01, max_iterations=100000):
    """Fit y = m*x + c by gradient descent and return the final ``(m, c)``.

    The search starts from ``m = 0, c = 0``.  First the learning rate is
    divided by 10 until a single step from the start lowers the cost, so the
    descent does not overshoot.  Then steps are taken with that rate until
    the cost changes by at most ``tolerance`` between two consecutive steps.

    Parameters
    ----------
    learning_rate : float
        Initial (largest) step size to try; must be positive.
    tolerance : float, optional
        Stop once ``abs(new_cost - prev_cost) <= tolerance``.  The default
        0.01 is the threshold that was previously hard-coded.
    max_iterations : int, optional
        Upper bound on descent steps, so a run that never meets the
        tolerance (for example one whose cost diverges) still terminates.

    Returns
    -------
    tuple
        ``(m, c)``.  If no learning rate lowers the cost at the starting
        point, the starting values are returned unchanged.

    Raises
    ------
    ValueError
        If ``learning_rate`` is not positive (such a rate can never lower
        the cost, so the search below would not terminate usefully).
    """
    import warnings

    if not learning_rate > 0:
        raise ValueError("learning_rate must be positive")

    # starting from the origin
    m, c = 0, 0

    prev_cost = cost(m, c)

    # finding the value of learning_rate i.e. alpha for which we just don't overshoot.
    # The loop is bounded: repeated division by 10 eventually underflows the
    # rate to 0.0, at which point no usable step size exists.
    while learning_rate > 0:
        new_m, new_c = step_gradient(learning_rate, m, c)

        new_cost = cost(new_m, new_c)

        if new_cost >= prev_cost:
            learning_rate /= 10
        else:
            break
    else:
        # no step size improves on the start: it is already a stationary point
        return (m, c)

    for _ in range(max_iterations):
        m, c = step_gradient(learning_rate, m, c)

        new_cost = cost(m, c)

        # when the absolute difference between the new and the prev costs is
        # within the tolerance, we simply break and return the m and c values
        if abs(new_cost - prev_cost) <= tolerance:
            break

        # this step's cost is the next step's "previous" cost, so it does
        # not need to be recomputed
        prev_cost = new_cost
    else:
        warnings.warn(
            "gd stopped after max_iterations=%d without meeting the tolerance"
            % max_iterations,
            RuntimeWarning,
        )

    return (m, c)

In [13]:
# run function to run the gradient descent algorithm and return the optimal m and c values

def run(learning_rate=0.1):
    """Run gradient descent and return the fitted ``(m, c)``.

    Parameters
    ----------
    learning_rate : float, optional
        Initial value of the learning rate (alpha) handed to ``gd``.
        Defaults to 0.1, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(m, c)`` as returned by ``gd``.
    """
    # calling the gradient descent function to get the optimal values of m
    # and c for the best fit line and then returning them
    return gd(learning_rate)

In [14]:
# Fit the line on the training data: m is the slope, c the intercept.

m, c = run()

In [15]:
# Predict the outputs for the held-out test inputs with the fitted line.
Y_pred = predict(x=X_test, m=m, c=c)

In [16]:
# storing the predictions in a csv file (one value per row, no header or index)
# A separate name is used so the raw-data DataFrame held in `df` is not
# overwritten by an unrelated object.

predictions_df = pd.DataFrame(Y_pred)

predictions_df.to_csv("predictions2.csv", header=False, index=False)

In [17]:
# calculating the score of the algorithm on the test set
# The result is stored under its own name: assigning it to `score` would
# replace the score() function, and re-running this cell would then fail
# because a float is not callable.

test_score = score(Y_pred, Y_test)

print(test_score)

0.5981963479283324
