In [36]:
# importing the needed packages/libraries/modules

import numpy as np
import pandas as pd
from sklearn import model_selection, datasets, preprocessing as pp

In [37]:
# loading the inbuilt boston dataset

data=datasets.load_boston()

# getting the input, output and feature names

X=data.data
Y=data.target
columns=data.feature_names

In [38]:
# loading the data into a dataframe and then describing it to analyze it better 

df=pd.DataFrame(X, columns=columns)

df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [39]:
# scaling the input data because of the different ranges of the features and we're standardising it beacause of the presence of outliers in the different features 

scaler=pp.StandardScaler()

X=scaler.fit_transform(X)

In [40]:
# splitting the data into training and testing 

X_train, X_test, Y_train, Y_test=model_selection.train_test_split(X, Y, random_state=1)

In [41]:
# score function which gives the coeffient of determination 

def score(Y_pred, Y_true):
    np_y_pred=np.array(Y_pred)
    np_y_true=np.array(Y_true)
    
    u=((np_y_true-np_y_pred)**2).sum()
    v=((np_y_true-np_y_true.mean())**2).sum()
    
    score=(1-(u/v))
    
    return score

In [42]:
# predict function to predict the output using the best fit line found out by gradient descent 

def predict(x, coeffs):
    M=x.shape[0]
    N=x.shape[1]

    y_pred=[0 for i in range(M)]        

    for i in range(M):
        sum=0

        for j in range(N):
            xij=x[i][j]
            mj=coeffs[j]

            sum+=(xij*mj)

        sum+=(coeffs[N])
    
        y_pred[i]=sum

    return y_pred

In [43]:
# step gradient function which makes change in the regression coefficents(m1, m2 till mn+1(c)) and makes us move one step closer to the optimal value of the cost function using a given learning rate 

def step_gradient(learning_rate, coeffs):
    M=(X_train.shape[0])
    N=len(columns)

    slope=[0 for i in range(N+1)]   

    for k in range(N+1):
        sum1=0

        for i in range(M):      
            sum2=0

            for j in range(N):
                xij=X_train[i][j]
                mj=coeffs[j]

                sum2+=(xij*mj)

            sum2+=coeffs[N]
            sum2*=-1
            
            yi=Y_train[i]

            sum2+=yi  
            
            xik=None

            if k==N:
                xik=1
            else:
                xik=X_train[i][k]
                                  
            sum2*=xik
            sum1+=sum2
                
        sum1/=(-2/M)

        slope[k]=sum1    

    np_coeffs=np.array(coeffs, dtype=float)
    np_slope=np.array(slope) 

    np_coeffs-=(learning_rate*np_slope)

    return np_coeffs

In [44]:
# cost function which finds the cost for a particular set of coefficients(m1, m2 till mn+1(c)) and we need to minmise this cost function to get the optimal values of the regression coefficients using the gradient descent algorithm 

def cost(coeffs):
    cost=0
    
    M=X_train.shape[0]
    N=len(columns)

    for i in range(M):        
        sum=0

        for j in range(N):
            xij=X_train[i][j]
            mj=coeffs[j]

            sum+=(mj*xij)

        sum+=(coeffs[N])
        sum*=-1

        yi=Y_train[i]  

        sum+=yi
        sum**=2

        cost+=sum
    
    cost/=M

    return cost


In [45]:
# gd function which implements the gradient descent algorithm 

def gd(learning_rate):
    N=len(columns)
    coeffs=[1 for i in range(N+1)]

    # finding the learning rate value for which we just don't overshoot and the cost begins to decrease

    prev_cost=cost(coeffs)  

    while True:
        new_coeffs=step_gradient(learning_rate, coeffs)

        new_cost=cost(new_coeffs)

        if new_cost>=prev_cost:
            learning_rate/=10
        else:
            break

    i=0

    while True:
        prev_coeffs=coeffs
        prev_cost=cost(coeffs)

        coeffs=step_gradient(learning_rate, coeffs)

        new_cost=cost(coeffs)

        # if the cost increases at some point, then we simply reduce the learning rate, reset coeffs to previous value and then repeat the process(this is the case when the learning rate choosen in the above loop takes us to the other side of the parabola(wrt axis) while descreasing the cost(in the case of single feature input))

        if new_cost>=prev_cost:
            learning_rate/=10
            coeffs=prev_coeffs

            continue    

        # if the absolute difference between the new and prev cost is <= 0.01(value considered after analyzing the decrease trend in the cost values), then we simply break

        if abs(new_cost-prev_cost)<=0.01:
            break        

        i+=1

    return coeffs

In [46]:
# run function to run the gradient descent algorithm and get the optimal coefficients 

def run():
    # initial value of learning rate 

    learning_rate=0.1

    coeffs=gd(learning_rate)

    return coeffs

In [47]:
# getting the optimal coefficients by running the gradient descent algorithm

coeffs=run()

In [48]:
# getting the predictions for the test data

Y_pred=predict(X_test, coeffs)

In [49]:
# storing the predictions in a csv file 

df=pd.DataFrame(Y_pred)

df.to_csv("predictions.csv", header=False, index=False)

In [50]:
# getting the score for the gradient descent algorithm 

score=score(Y_pred, Y_test)

print(score)

0.7829622528971131
