In [16]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import pickle
from model import LinearRegression
from evaluation import RegressionMetrics

In [3]:
train_data = pd.read_csv('./dataset/train.csv')
test_data = pd.read_csv('./dataset/test.csv')

# Remove rows with missing values
train_data = train_data.dropna()
test_data = test_data.dropna()

In [4]:
train_data.head()

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [5]:
px.scatter(x=train_data['x'], y=train_data['y'], template='seaborn')

In [6]:
X_train = train_data['x'].values
y_train = train_data['y'].values

# Set testing data and target
X_test = test_data['x'].values
y_test = test_data['y'].values

In [7]:
def apply_standardize(data_points):
    mean_data = np.mean(data_points)
    std_data = np.std(data_points)
    return (data_points - mean_data) / std_data

In [8]:
X_train = apply_standardize(X_train)
X_test = apply_standardize(X_test)
print(X_train.shape)
print(X_test.shape)

(699,)
(300,)


In [9]:
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
print(X_train.shape)
print(X_test.shape)

(699, 1)
(300, 1)


In [10]:
from model import LinearRegression
lr = LinearRegression(0.01)
lr.fit(X_train, y_train, 10000)

Iteration: 0, Cost: 1670.475539679181
Iteration: 100, Cost: 227.21658655459126
Iteration: 200, Cost: 33.84922127911624
Iteration: 300, Cost: 7.941924551367299
Iteration: 400, Cost: 4.4708733593347425
Iteration: 500, Cost: 4.005823049210875
Iteration: 600, Cost: 3.9435157598678825
Iteration: 700, Cost: 3.9351678495004294
Iteration: 800, Cost: 3.934049399183655
Converged after 863 iterations.


In [11]:
lr.save_model('model.pkl')

### Evaluation

In [12]:
model = LinearRegression.load_model("model.pkl")

In [17]:
y_pred = model.predict(X_test)
mse_value = RegressionMetrics.mean_squared_error(y_test, y_pred)
rmse_value = RegressionMetrics.root_mean_squared_error(y_test, y_pred)
r_squared_value = RegressionMetrics.r_squared(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse_value}")
print(f"Root Mean Squared Error (RMSE): {rmse_value}")
print(f"R-squared (Coefficient of Determination): {r_squared_value}")

Mean Squared Error (MSE): 10.79337662568745
Root Mean Squared Error (RMSE): 3.285327476171508
R-squared (Coefficient of Determination): 0.9871863431670246
