# Imports

In [13]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import pickle

# Data Loading

In [14]:
# Training split: read the CSV and discard any row containing a missing value.
df_train = pd.read_csv('/kaggle/input/train.csv').dropna()
X_train, y_train = df_train['x'], df_train['y']

# Test split: read as-is (no rows are dropped here).
df_test = pd.read_csv('/kaggle/input/test.csv')
X_test, y_test = df_test['x'], df_test['y']

Plot the training data to inspect the relationship between `x` and `y`.

In [15]:
# Scatter plot of the training data (x vs. y); marker colour encodes the y value.
plot = px.scatter(df_train, x='x', y='y', color='y')
plot.show()

# Data Preprocessing

## Standardize the data
### Why Use Standardization in Machine Learning?
We standardize the data so that all the features involved are on a comparable scale (zero mean and unit standard deviation). Without standardization, features with larger scales may dominate the learning process, and gradient descent may converge slowly. Standardization also makes it easier to compare and interpret the importance of each feature, and it ensures that a regularization term, when one is used, applies fairly to all features.



### How to Standardize Data
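First, we calculate the mean $\mu$ and the standard deviation $\sigma$ of the training data. Then we apply the formula:

$$x_{\text{standardized}} = \frac{x - \mu}{\sigma}$$

The same training-set mean and standard deviation are reused to scale the test data.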

In [16]:
# Feature scaling: the statistics are taken from the training split only and
# then reused for the test split, so both are expressed on the same scale.
mean, std = X_train.mean(), X_train.std()
X_train, X_test = (X_train - mean) / std, (X_test - mean) / std

# Target scaling: only the training target is standardized; y_mean / y_std are
# kept so predictions can be mapped back to the original units later.
y_mean, y_std = y_train.mean(), y_train.std()
y_train = (y_train - y_mean) / y_std

## Reshaping data for the correct shape for the model

Why can't we build the model without reshaping?

Reshaping is necessary because models built with libraries such as scikit-learn expect the input data to have a specific shape (for example, a 2-D array of shape `(n_samples, n_features)`). Without reshaping, we could end up with value errors and index errors.

# Model Implementation

# Linear Regression Model

Linear regression is a fundamental model in machine learning used for predicting a continuous output variable based on input features. The model function for linear regression is represented as:

$$f_{w,b}(x) = wx + b$$

In this equation, $f_{w,b}(x)$ represents the predicted output, $w$ is the weight parameter, $b$ is the bias parameter, and $x$ is the input feature.

## Model Training

To train a linear regression model, we aim to find the best values for the parameters $(w, b)$ that best fit our dataset.

### Forward Pass

The forward pass is a step where we compute the linear regression output for the input data $X$ using the current weights and biases. It's essentially applying our model to the input data.

### Cost Function

The cost function is used to measure how well our model is performing. It quantifies the difference between the predicted values and the actual values in our dataset. The cost function is defined as:

$$J(w,b) = \frac{1}{2m} \sum_{i=1}^{m}(f_{w,b}(x^{(i)}) - y^{(i)})^2$$

Here, $J(w, b)$ is the cost, $m$ is the number of training examples, $x^{(i)}$ is the input data for the $i$-th example, $y^{(i)}$ is the actual output for the $i$-th example, and $w$ and $b$ are the weight and bias parameters, respectively.

### Backward Pass (Gradient Computation)

The backward pass computes the gradients of the cost function with respect to the weights and biases. These gradients are crucial for updating the model parameters during training. The gradient formulas are as follows:

$$
\frac{\partial J(w,b)}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})
$$

$$
\frac{\partial J(w,b)}{\partial w} = \frac{1}{m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})x^{(i)}
$$

## Training Process

The training process involves iteratively updating the weights and biases to minimize the cost function. This is typically done through an optimization algorithm like gradient descent. The update equations for parameters are:

$$w \leftarrow w - \alpha \frac{\partial J}{\partial w}$$

$$b \leftarrow b - \alpha \frac{\partial J}{\partial b}$$

Here, $\alpha$ represents the learning rate, which controls the step size during parameter updates.

By iteratively performing the forward pass, computing the cost, performing the backward pass, and updating the parameters, the model learns to make better predictions and fit the data.


In [17]:
class LinearRegression:
    """Single-feature linear regression trained with batch gradient descent.

    The model is ``prediction = m * x + b`` where ``m`` (slope) and ``b``
    (intercept) are scalars learned by minimising the halved mean squared
    error ``sum((y - prediction) ** 2) / (2 * n)``.
    """

    def __init__(self, learning_rate=0.001):
        """Create an untrained model.

        Args:
            learning_rate: Step size used for every gradient-descent update.
        """
        # Kept for backward compatibility: this seeds NumPy's *global* RNG,
        # although the parameters themselves are initialised deterministically.
        np.random.seed(1)
        self.learning_rate = learning_rate
        # Give the model a defined state so predict()/save_model() do not
        # raise AttributeError when called before fit().
        self.initialize_parameters()

    def initialize_parameters(self):
        """Reset the slope ``m`` and intercept ``b`` to zero."""
        self.m = 0
        self.b = 0

    @staticmethod
    def _as_vector(values, name):
        """Return ``values`` as a 1-D float array (an ``(n, 1)`` column is flattened)."""
        arr = np.asarray(values, dtype=float)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError(
                "{} must be one-dimensional, got shape {}".format(name, arr.shape)
            )
        return arr

    def forward(self, X):
        """Apply the current line ``m * X + b`` to ``X`` and return the result."""
        return self.m * X + self.b

    def compute_cost(self, predictions):
        """Return the halved mean squared error of ``predictions`` against ``self.y``."""
        n = self.X.shape[0]
        return (np.sum((self.y - predictions) ** 2)) / (2 * n)

    def backward(self, predictions):
        """Store the cost gradients w.r.t. ``m`` and ``b`` in ``m_grad`` / ``b_grad``."""
        n = self.X.shape[0]
        residuals = predictions - self.y
        self.m_grad = (self.X @ residuals) / n
        self.b_grad = np.mean(residuals)

    def fit(self, X, y, iterations, plot_cost=True):
        """Train the model on ``(X, y)`` with batch gradient descent.

        Parameters are reset to zero before training starts. The cost is
        printed every 1000 iterations.

        Args:
            X: Feature values; any 1-D sequence/array/Series (or an ``(n, 1)``
                column) convertible to float.
            y: Target values, same length as ``X``.
            iterations: Number of gradient-descent updates to perform.
            plot_cost: When true, show a plotly line chart of cost vs iteration.

        Raises:
            ValueError: If ``X`` or ``y`` is empty, not one-dimensional, or
                their lengths differ.
        """
        # Work on plain positional arrays: this accepts lists as well as
        # pandas objects and avoids index-alignment surprises between X and y.
        self.X = self._as_vector(X, "X")
        self.y = self._as_vector(y, "y")
        if self.X.shape[0] == 0:
            raise ValueError("cannot fit on empty data")
        if self.X.shape != self.y.shape:
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(
                    self.X.shape[0], self.y.shape[0]
                )
            )

        self.initialize_parameters()
        costs = []
        for i in range(iterations):
            predictions = self.forward(self.X)

            # The cost is recorded *before* this iteration's update.
            cost = self.compute_cost(predictions)
            costs.append(cost)

            self.backward(predictions)

            self.m = self.m - self.learning_rate * self.m_grad
            self.b = self.b - self.learning_rate * self.b_grad

            if i % 1000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))

        if plot_cost:
            fig = px.line(y=costs, title="Cost vs Iteration", template="plotly_dark")
            fig.update_layout(
                title_font_color="#41BEE9",
                xaxis=dict(color="#41BEE9", title="Iterations"),
                yaxis=dict(color="#41BEE9", title="cost"),
            )
            fig.show()

    def predict(self, X):
        """Return the model output for ``X``.

        Scalars, NumPy arrays and pandas objects are passed through unchanged
        (so the result keeps their type); lists and tuples are converted to a
        float array first.
        """
        if isinstance(X, (list, tuple)):
            X = np.asarray(X, dtype=float)
        return self.forward(X)

    def save_model(self, filename=None):
        """Pickle the learning rate, slope and intercept to ``filename``.

        Raises:
            TypeError: If no filename is given.
        """
        if filename is None:
            # Fail with a clear message instead of the opaque error raised by open(None).
            raise TypeError("save_model() requires a filename to write to")

        model_data = {
            'learning_rate': self.learning_rate,
            'W': self.m,
            'b': self.b
        }

        with open(filename, 'wb') as file:
            pickle.dump(model_data, file)

    @classmethod
    def load_model(cls, filename):
        """Build a model from a file previously written by ``save_model``.

        SECURITY: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        with open(filename, 'rb') as file:
            model_data = pickle.load(file)
        loaded_model = cls(model_data['learning_rate'])
        loaded_model.m = model_data['W']
        loaded_model.b = model_data['b']

        return loaded_model


In [18]:
# Train with the default learning rate (0.001) for 10,000 gradient-descent steps.
# fit() prints the cost every 1000 iterations and, by default, plots the cost curve.
lr = LinearRegression()
lr.fit(X_train, y_train,10000)

Cost after iteration 0: 0.4992846924177398
Cost after iteration 1000: 0.0717099714860658
Cost after iteration 2000: 0.013736096133765275
Cost after iteration 3000: 0.00587555139426196
Cost after iteration 4000: 0.004809758168843606
Cost after iteration 5000: 0.004665249710843328
Cost after iteration 6000: 0.004645656140593881
Cost after iteration 7000: 0.0046429994933606875
Cost after iteration 8000: 0.004642639284657945
Cost after iteration 9000: 0.004642590444786429


In [19]:
# Persist the learning rate, slope and intercept of the trained model with pickle.
lr.save_model('model.pkl')

# Evaluation



### 1. Mean Squared Error (MSE)

**Formula:**
$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_{\text{true}_i} - y_{\text{pred}_i})^2
$$

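**Description:**
 - MSE is the average of the squared differences between the true values and the predicted values.

**Interpretation:**
- Lower is better, and 0 means the predictions match the targets exactly. Because the errors are squared, large errors are penalized more heavily, and the result is expressed in squared units of the target.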

### 2. Root Mean Squared Error (RMSE)

**Formula:**
$$
\text{RMSE} = \sqrt{\text{MSE}}
$$

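**Description:**
- RMSE is the square root of the MSE, which brings the error back to the same units as the target variable.

**Interpretation:**
- It can be read as the typical size of a prediction error. Lower is better, and 0 means a perfect fit.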


### 3. R-squared ($R^2$)

**Formula:**
$$
R^2 = 1 - \frac{\text{SSR}}{\text{SST}}
$$

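**Description:**
 $R^2$ measures the proportion of the variance in the target that is explained by the model. Here SSR is the sum of squared residuals, $\sum_{i} (y_{\text{true}_i} - y_{\text{pred}_i})^2$, and SST is the total sum of squares, $\sum_{i} (y_{\text{true}_i} - \bar{y}_{\text{true}})^2$.

**Interpretation:**
 A value of 1 means a perfect fit, 0 means the model does no better than always predicting the mean of the targets, and a negative value means it does worse than that.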


In [20]:
class RegressionMetrics:
    """Stateless helpers for scoring regression predictions.

    Every metric takes the ground-truth values first and the predictions
    second; both are converted to float NumPy arrays and compared
    element by element, by position.
    """

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences between ``y_true`` and ``y_pred``."""
        # Use the y_true argument; the earlier version read the global
        # y_test and silently ignored whatever the caller passed in.
        errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return np.mean(errors ** 2)

    @staticmethod
    def root_mean_squared_error(y_true, y_pred):
        """Return the square root of the mean squared error."""
        errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return np.mean(errors ** 2) ** 0.5

    @staticmethod
    def r_squared(y_true, y_pred):
        """Return ``1 - SSR / SST``, the coefficient of determination.

        SSR is the sum of squared residuals and SST the total sum of squares
        of ``y_true`` around its mean. SST is zero when ``y_true`` is
        constant, in which case the division is not meaningful.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        residual_sum = np.sum((y_true - y_pred) ** 2)
        total_sum = np.sum((y_true - np.mean(y_true)) ** 2)
        return 1 - residual_sum / total_sum

In [21]:
# Rebuild a model from the parameters pickled in 'model.pkl'.
model=LinearRegression.load_model('model.pkl')

In [22]:
# The model was trained on a standardized target, so its raw output is in
# standardized units; scale and shift it back to the original y units.
y_pred = model.predict(X_test) * y_std + y_mean

# Score the de-standardized predictions against the untouched test targets.
mse_value = RegressionMetrics.mean_squared_error(y_test, y_pred)
rmse_value = RegressionMetrics.root_mean_squared_error(y_test, y_pred)
r_squared_value = RegressionMetrics.r_squared(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse_value}")
print(f"Root Mean Squared Error (RMSE): {rmse_value}")
print(f"R-squared (Coefficient of Determination): {r_squared_value}")

Mean Squared Error (MSE): 9.43396871949966
Root Mean Squared Error (RMSE): 3.0714766350242124
R-squared (Coefficient of Determination): 0.9888002020186158
