In [1]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import pickle

In [2]:
df =pd.read_csv("wine_dataset.csv")
df=df.dropna()
x=["total_phenols","flavanoids","od280/od315_of_diluted_wines"]
X=df[x].values
y=df['target'].values

In [3]:
def train_test_split(X, y, random_state=42, test_size=0.2):
    """
    Splits the data into training and testing sets.

    Parameters:
        X (numpy.ndarray): Features array of shape (n_samples, n_features).
        y (numpy.ndarray): Target array of shape (n_samples,).
        random_state (int): Seed for the random number generator. Default is 42.
        test_size (float): Proportion of samples to include in the test set. Default is 0.2.

    Returns:
        Tuple[numpy.ndarray]: A tuple containing X_train, X_test, y_train, y_test.
    """

    # Fill the details based on this

    # Get number of samples
    n_samples=len(X)

    # Set the seed for the random number generator
    np.random.seed(random_state)

    # Shuffle the indices
    #shuffled_indices = np.random.permutation(n_samples)# fill here
    #alternative shuffling method
    indices = np.arange(n_samples) 
    np.random.shuffle(indices)
    
    # Determine the size of the test set
    test_size = 0.2 # fill here

    # Split the indices into test and train
    test_indices = indices[:int(n_samples*test_size)]# fill here
    train_indices = indices[int(n_samples*test_size):]# fill here

    # Split the features and target arrays into test and train
    X_train, X_test = X[train_indices] , X[test_indices]
    y_train, y_test = y[train_indices],y[test_indices]# fill here

    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [4]:
#Standardizing data
'''
def standardize_data(X_train, X_test):
    """
    Standardizes the input data using mean and standard deviation.

    Parameters:
        X_train (numpy.ndarray): Training data.
        X_test (numpy.ndarray): Testing data.

    Returns:
        Tuple of standardized training and testing data.
    """

    # Calculate the mean and standard deviation using the training data
    mean = np.sum(X_train)/len(X_train)# fill here
    std = (np.sum((X_train-mean)**2)/len(X_train))**0.5
    
    # Standardize the data
    X_train = (X_train-mean)/std# fill here
    X_test = (X_test-mean)/std # fill here
    
    return X_train, X_test

X_train, X_test = standardize_data(X_train, X_test)
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#scaler = MinMaxScaler(feature_range=(0,1))


In [31]:
class LinearRegression:
    """
    Logistic Regression model.

    Parameters:
        learning_rate (float): Learning rate for the model.

    Methods:
        initialize_parameter(): Initializes the parameters of the model.
        sigmoid(z): Computes the sigmoid activation function for given input z.
        forward(X): Computes forward propagation for given input X.
        compute_cost(predictions): Computes the cost function for given predictions.
        compute_gradient(predictions): Computes the gradients for the model using given predictions.
        fit(X, y, iterations, plot_cost): Trains the model on given input X and labels y for specified iterations.
        predict(X): Predicts the labels for given input X.
    """

    def __init__(self, learning_rate=0.0001):
        np.random.seed(1)
        self.learning_rate = learning_rate

    def initialize_parameter(self):
        """
        Initializes the parameters of the model.
        """
        self.W = np.zeros(self.X.shape[1])
        self.b = 0.0


    def forward(self, X):
        """
        Computes forward propagation for given input X.

        Parameters:
            X (numpy.ndarray): Input array.

        Returns:
            numpy.ndarray: Output array.
        """
#         print(X.shape, self.W.shape)
        Z = np.matmul(X, self.W) + self.b
        return Z

    def compute_cost(self, predictions):
        """
        Computes the cost function for given predictions.

        Parameters:
            predictions (numpy.ndarray): Predictions of the model.

        Returns:
            float: Cost of the model.
        """
        m=len(y)
        cost = (1/(2*m))*np.sum((predictions-self.y)**2)
        return cost

    def compute_gradient(self, predictions):
        """
        Computes the gradients for the model using given predictions.

        Parameters:
            predictions (numpy.ndarray): Predictions of the model.
        """
        # get training shape
        m = self.X.shape[0]
        # compute gradients
        #fill here
        self.dW = (1/m)*np.sum(np.dot((predictions-self.y),self.X))
        self.db = (1/m)*np.sum(predictions-self.y)
        # scale gradients
        # fill here
        
        
    def fit(self, X, y, iterations, plot_cost=True):
        """
        Trains the model on given input X and labels y for specified iterations.

        Parameters:
            X (numpy.ndarray): Input features array of shape (n_samples, n )
            y (numpy.ndarray): Labels array of shape (n_samples, 1)
            iterations (int): Number of iterations for training.
            plot_cost (bool): Whether to plot cost over iterations or not.

        Returns:
            None.
        """
        self.X = X
        self.y = y

        self.initialize_parameter()

        costs = []
        for i in range(iterations):
            # forward propagation
            predictions = self.forward(self.X)

            # compute cost
            cost = self.compute_cost(predictions)
            costs.append(cost)

            # compute gradients
            self.compute_gradient(predictions)

            # update parameters
            self.W = self.W - self.learning_rate * self.dW
            self.b = self.b - self.learning_rate * self.db

            # print cost every 100 iterations
            if i % 10000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))

        if plot_cost:
            plot = px.scatter(y=costs,title="Cost vs Iteration")
            plot.show()

    def predict(self, X):
        """
        Predicts the labels for given input X.

        Parameters:
            X (numpy.ndarray): Input features array.

        Returns:
            numpy.ndarray: Predicted labels.
        """
        
        # Write the code to predict the labels
        return self.forward(X)
    
    def save_model(self, filename=None):
        """
        Save the trained model to a file using pickle.

        Parameters:
            filename (str): The name of the file to save the model to.
        """
        model_data = {
            'learning_rate': self.learning_rate,
            'W': self.W,
            'b': self.b
        }

        with open(filename, 'wb') as file:
            pickle.dump(model_data, file)

    @classmethod
    def load_model(cls, filename):
        """
        Load a trained model from a file using pickle.

        Parameters:
            filename (str): The name of the file to load the model from.

        Returns:
            LogisticRegression: An instance of the LogisticRegression class with loaded parameters.
        """
        with open(filename, 'rb') as file:
            model_data = pickle.load(file)

        # Create a new instance of the class and initialize it with the loaded parameters
        loaded_model = cls(model_data['learning_rate'])
        loaded_model.W = model_data['W']
        loaded_model.b = model_data['b']

        return loaded_model

In [33]:
iterations=100000
lr = LinearRegression()
lr.fit(X_train, y_train,iterations)

Cost after iteration 0: 0.6095505617977528
Cost after iteration 10000: 0.12012136716948422
Cost after iteration 20000: 0.07698162186588399
Cost after iteration 30000: 0.07114388061318178
Cost after iteration 40000: 0.07035390725383942
Cost after iteration 50000: 0.07024700667680847
Cost after iteration 60000: 0.07023254070370392
Cost after iteration 70000: 0.07023058314291544
Cost after iteration 80000: 0.07023031824236484
Cost after iteration 90000: 0.0702302823955589


In [34]:
lr.save_model('winedine.pkl')

In [35]:
model=lr.load_model('winedine.pkl')

In [36]:
class RegressionMetrics:
    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """
        Calculate the Mean Squared Error (MSE).

        Args:
            y_true (numpy.ndarray): The true target values.
            y_pred (numpy.ndarray): The predicted target values.

        Returns:
            float: The Mean Squared Error.
        """
        n=len(y_true)
        mse = float((1/n)*np.sum((y_true-y_pred)**2))
        return mse
    @staticmethod
    def root_mean_squared_error(y_true, y_pred):
        """
        Calculate the Root Mean Squared Error (RMSE).

        Args:
            y_true (numpy.ndarray): The true target values.
            y_pred (numpy.ndarray): The predicted target values.

        Returns:
            float: The Root Mean Squared Error.
        """
        n=len(y_true)
        mse = float((1/n)*np.sum((y_true-y_pred)**2))
        rmse= (mse)**0.5
        return float(rmse)
    @staticmethod
    def r_squared(y_true, y_pred):
        y_avg=np.sum(y_true)
        ssr= np.sum((y_true-y_pred)**2)
        sst=np.sum((y_true-y_avg)**2)
        rsq = 1 - (ssr/sst)
        return rsq

In [37]:
y_pred = model.predict(X_test)
mse_value = RegressionMetrics.mean_squared_error(y_test, y_pred)
rmse_value = RegressionMetrics.root_mean_squared_error(y_test, y_pred)
r_squared_value = RegressionMetrics.r_squared(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse_value}")
print(f"Root Mean Squared Error (RMSE): {rmse_value}")
print(f"R-squared (Coefficient of Determination): {r_squared_value}")

Mean Squared Error (MSE): 0.13753202682663124
Root Mean Squared Error (RMSE): 0.3708531068046097
R-squared (Coefficient of Determination): 0.9998381758576302
