In [174]:
import pandas as pd
import numpy as np

In [175]:
data = pd.read_csv('lin-reg-from-scratch.csv')
data.head()

Unnamed: 0,Avg_hours_studied,last_exam_Score,hours_slept_before_exam,percentage_result
0,7,99,9,91
1,4,82,4,65
2,8,51,7,45
3,5,52,5,36
4,7,75,8,66


In [176]:
data.isna().sum()

Avg_hours_studied          0
last_exam_Score            0
hours_slept_before_exam    0
percentage_result          0
dtype: int64

In [177]:
def compute_cost(x,y,w,b,lbda = 0.01):
    m,n = x.shape
    cost = 0
    reg = 0
    for i in range(m):
        loss_i = (np.dot(w,x[i]) + b) - y[i]
        cost += (loss_i)**2
    cost /= 2*m
    for i in range(n):
        reg += w[i] ** 2
    reg = (lbda / (2 * m)) * reg 
    cost += reg
    return cost

In [178]:
def compute_gradient(x, y, w, b, lbda = 0.01):
    m, n = x.shape
    dj_dw = np.zeros((n,))
    dj_db = 0

    for i in range(m):
        err = (np.dot(w, x[i]) + b - y[i])
        for j in range(n):
            dj_dw[j] += err * x[i, j]
        dj_db += err
    
    dj_dw /= m
    dj_db /= m

    for j in range(n):
        dj_dw[j] = dj_dw[j] + (lbda/m) * w[j]
    return dj_dw, dj_db

In [179]:
def gradient_descent(x,y,alpha,num_of_iter):
    m,n = x.shape
    w = np.zeros(shape=n)
    b = 0
    for i in range(num_of_iter):
        dj_dw,dj_db = compute_gradient(x,y,w,b)
        w -= alpha*dj_dw
        b -= alpha*dj_db
        if(i % 1000 == 0):
            print(f'Cost at {i} : {compute_cost(x,y,w,b)}', end='\n')
    return w,b

In [180]:
def predict(x_train,y_train,apha,num_of_iter, x_test):
    w,b = gradient_descent(x_train,y_train,apha,num_of_iter)
    m = x_test.shape[0]
    yhat = np.zeros(shape=m)
    for i in range(m):
        yhat[i] += np.dot(w,x_test[i]) + b
    return yhat

In [181]:
x= data.drop(['percentage_result'],axis=1)
y = data['percentage_result']
x = x.to_numpy()
y = y.to_numpy()

In [182]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_norm = scaler.fit_transform(x)

In [183]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_norm,y,train_size = 0.2,random_state=100)

In [185]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
result = predict(x_train,y_train, 0.01, 10001, x_test)
mse = mean_squared_error(result,y_test)
rmse = root_mean_squared_error(result,y_test)
rsq = r2_score(result, y_test)
print(f'mse: {mse} rmse: {rmse} r2: {rsq}')

Cost at 0 : 1603.218636058449
Cost at 1000 : 22.24664139131719
Cost at 2000 : 7.216025081949513
Cost at 3000 : 3.7064971992959284
Cost at 4000 : 2.7082629006690007
Cost at 5000 : 2.391436920672397
Cost at 6000 : 2.285861607700832
Cost at 7000 : 2.2499885100611894
Cost at 8000 : 2.2377076009068317
Cost at 9000 : 2.2334913220468837
Cost at 10000 : 2.2320422236466926
mse: 4.592997566179616 rmse: 2.1431279864206934 r2: 0.9874619137479597
