In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import plotly.express as px
import pickle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wine-dataset/wine_dataset.csv


In [2]:
# Read the wine dataset and discard every row that contains a missing value.
df = pd.read_csv('/kaggle/input/wine-dataset/wine_dataset.csv').dropna()

In [3]:
# Columns used as model inputs; the 'target' column is the value to predict.
cols = [
    'alcalinity_of_ash',
    'flavanoids',
    'od280/od315_of_diluted_wines',
]
X = df[cols].to_numpy()
y = df['target'].to_numpy()

In [4]:
def train_test_split(X, y, random_state=42, test_size=0.2):
    """Shuffle the samples and split them into train and test subsets.

    Parameters
    ----------
    X : np.ndarray
        Feature array, indexed by sample along its first axis.
    y : np.ndarray
        Target array with one entry per sample of ``X``.
    random_state : int or None, default 42
        Seed handed to ``np.random.seed`` before shuffling, so the same
        value always yields the same split.
    test_size : float, default 0.2
        Fraction of the samples, in ``[0, 1]``, placed in the test subset.
        The test count is ``int(test_size * n_samples)`` (rounded down).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside
        ``[0, 1]``.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must contain the same number of samples "
            "(got {} and {})".format(n_samples, len(y))
        )
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be a fraction between 0 and 1 (got {})".format(test_size)
        )

    # Seeding the global NumPy RNG (rather than a private generator) keeps the
    # default split identical to the one this function has always produced.
    np.random.seed(random_state)

    shuffled_indices = np.arange(n_samples)
    np.random.shuffle(shuffled_indices)

    # The first n_test shuffled indices form the test set, the rest the train set.
    n_test = int(test_size * n_samples)
    test_indices = shuffled_indices[:n_test]
    train_indices = shuffled_indices[n_test:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test


In [5]:
# Split the data using the function's default seed and test fraction.
X_train, X_test, y_train, y_test = train_test_split(X=X, y=y)

In [6]:
def standardize_data(X_train, X_test):
    """Standardize both splits with statistics taken from the training split.

    Each column is shifted by the training mean and scaled by the training
    (population) standard deviation, so the test split is transformed with
    exactly the same parameters as the training split.

    Parameters
    ----------
    X_train : np.ndarray
        Training features; the per-column mean and std are computed from it.
    X_test : np.ndarray
        Test features with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)

    # A constant training column has zero std; dividing by it would produce
    # NaN/inf. Divide by 1 instead so such a column is only mean-centred.
    safe_std = np.where(std == 0, 1.0, std)

    X_train = (X_train - mean) / safe_std
    X_test = (X_test - mean) / safe_std

    return X_train, X_test

# Rescale both splits using statistics computed from the training split.
X_train, X_test = standardize_data(X_train=X_train, X_test=X_test)

In [7]:
class LinearRegression:
    """Linear regression trained with full-batch gradient descent.

    The model predicts ``X @ W + b``. The cost minimised by ``fit`` is the
    sum of squared errors divided by ``2 * m``, where ``m`` is the number of
    training samples.
    """

    # File written by save_model() when no filename is supplied.
    DEFAULT_MODEL_FILENAME = "model.pkl"

    def __init__(self, learning_rate=0.001):
        """Store the gradient-descent step size.

        Parameters
        ----------
        learning_rate : float, default 0.001
            Multiplier applied to the gradients on every update.
        """
        # NOTE: this reseeds NumPy's *global* RNG as a side effect. The model
        # itself uses no randomness (weights start at zero); the call is kept
        # only so existing notebooks see the same RNG state as before.
        np.random.seed(1)
        self.learning_rate = learning_rate

    def initialize_parameter(self):
        """Reset the weights to zeros (one per column of ``self.X``) and the bias to 0."""
        self.W = np.zeros(self.X.shape[1])
        self.b = 0.0

    def forward(self, X):
        """Return the linear predictions ``X @ W + b`` for the given features."""
        return np.matmul(X, self.W) + self.b

    def compute_cost(self, predictions):
        """Return the sum of squared errors against ``self.y`` divided by ``2 * m``."""
        m = self.X.shape[0]
        return np.sum((predictions - self.y) ** 2) / (2 * m)

    def compute_gradient(self, predictions):
        """Store the cost gradients w.r.t. ``W`` and ``b`` in ``self.dW`` / ``self.db``."""
        m = self.X.shape[0]
        residuals = predictions - self.y
        self.dW = np.dot(self.X.T, residuals) / m
        self.db = np.mean(residuals)

    def fit(self, X, y, iterations, plot_cost=True):
        """Train the model with gradient descent.

        Parameters
        ----------
        X : array-like
            Training features, one row per sample.
        y : array-like
            Training targets, one value per sample. A column vector of shape
            ``(m, 1)`` is flattened to ``(m,)``.
        iterations : int
            Number of gradient-descent updates to perform.
        plot_cost : bool, default True
            When true, display a plotly line chart of the cost per iteration.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: a (m, 1) target would otherwise broadcast against the
        # (m,) predictions into an (m, m) matrix and silently corrupt W.
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must contain the same number of samples "
                "(got {} and {})".format(X.shape[0], y.shape[0])
            )

        self.X = X
        self.y = y

        self.initialize_parameter()
        costs = []
        for i in range(iterations):
            predictions = self.forward(self.X)

            # The cost is recorded before the update, so costs[i] belongs to
            # the parameters at the start of iteration i.
            cost = self.compute_cost(predictions)
            costs.append(cost)

            self.compute_gradient(predictions)

            self.W = self.W - self.learning_rate * self.dW
            self.b = self.b - self.learning_rate * self.db

            if i % 1000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))

        if plot_cost:
            fig = px.line(y=costs,title="Cost vs Iteration",template="plotly_dark")
            fig.update_layout(
                title_font_color="#41BEE9", 
                xaxis=dict(color="#41BEE9",title="Iterations"), 
                yaxis=dict(color="#41BEE9",title="cost")
            )
            fig.show()

    def predict(self, X):
        """Return predictions for ``X`` using the current ``W`` and ``b``."""
        return self.forward(X)

    def save_model(self, filename=None):
        """Pickle the learning rate, weights and bias to ``filename``.

        Parameters
        ----------
        filename : str or None, default None
            Destination path. When ``None``, ``DEFAULT_MODEL_FILENAME`` is
            used (previously ``None`` was passed straight to ``open`` and
            raised).
        """
        if filename is None:
            filename = self.DEFAULT_MODEL_FILENAME

        model_data = {
            'learning_rate': self.learning_rate,
            'W': self.W,
            'b': self.b
        }

        with open(filename, 'wb') as file:
            pickle.dump(model_data, file)

    @classmethod
    def load_model(cls, filename):
        """Rebuild a model from a file written by ``save_model``.

        Only ``learning_rate``, ``W`` and ``b`` are restored; the training
        data is not, so the returned model can ``predict`` but must be
        ``fit`` again before ``compute_cost`` / ``compute_gradient`` are used.

        Parameters
        ----------
        filename : str
            Path of the pickle file to read.

        Returns
        -------
        LinearRegression
            A new instance carrying the saved parameters.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open(filename, 'rb') as file:
            model_data = pickle.load(file)

        loaded_model = cls(model_data['learning_rate'])
        loaded_model.W = model_data['W']
        loaded_model.b = model_data['b']

        return loaded_model

In [8]:
# Train with the default learning rate for 10,000 gradient-descent steps.
lr = LinearRegression()
lr.fit(X_train, y_train, iterations=10000)

Cost after iteration 0: 0.7587412587412588
Cost after iteration 1000: 0.1216660994770786
Cost after iteration 2000: 0.06352272667352717
Cost after iteration 3000: 0.05614621995200161
Cost after iteration 4000: 0.055137494865008976
Cost after iteration 5000: 0.05498705244215874
Cost after iteration 6000: 0.0549569744554097
Cost after iteration 7000: 0.05494623377779784
Cost after iteration 8000: 0.054940207136875543
Cost after iteration 9000: 0.05493625661967097


In [9]:
# Persist the trained parameters to the working directory.
lr.save_model(filename="model.pkl")

In [10]:
class RegressionMetrics:
    """Static helpers that score regression predictions against true targets."""

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences between ``y_true`` and ``y_pred``."""
        # Use the y_true argument; the previous version read the notebook
        # global ``y_test`` and ignored what the caller passed in.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def root_mean_squared_error(y_true, y_pred):
        """Return the square root of the mean squared error."""
        return RegressionMetrics.mean_squared_error(y_true, y_pred) ** 0.5

    @staticmethod
    def r_squared(y_true, y_pred):
        """Return ``1 - SS_res / SS_tot``, the coefficient of determination.

        ``SS_res`` is the sum of squared residuals and ``SS_tot`` the sum of
        squared deviations of ``y_true`` from its mean. The ratio is undefined
        when ``y_true`` is constant (``SS_tot`` is zero).
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        residual_sum = np.sum((y_true - y_pred) ** 2)
        total_sum = np.sum((y_true - np.mean(y_true)) ** 2)
        return 1 - residual_sum / total_sum
        

In [12]:
# Restore the saved parameters into a fresh model instance.
model = LinearRegression.load_model(filename="model.pkl")

In [13]:
# Score the reloaded model on the held-out test split.
y_pred = model.predict(X_test)
mse_value = RegressionMetrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_value = RegressionMetrics.root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r_squared_value = RegressionMetrics.r_squared(y_true=y_test, y_pred=y_pred)

# One print call, one metric per line.
print(
    f"Mean Squared Error (MSE): {mse_value}",
    f"Root Mean Squared Error (RMSE): {rmse_value}",
    f"R-squared (Coefficient of Determination): {r_squared_value}",
    sep="\n",
)


Mean Squared Error (MSE): 0.12635358327095003
Root Mean Squared Error (RMSE): 0.3554624920732848
R-squared (Coefficient of Determination): 0.7819955781592763
