In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import plotly.graph_objects as go

### Example data

In [2]:
# x1 = np.array(list(range(10))).reshape(-1, 1)
# x2 = np.array(list(range(0, 100, 10))).reshape(-1, 1)
# w1_true = 2
# w2_true = 5

# X = np.concatenate([x1, x2], axis=1).astype(float)
# W_true = np.array([w1_true, w2_true]).reshape(-1, 1)

# y_true = np.matmul(X, W_true)
# N = len(y_true)

# Linear Regression

Cost = 1/N * $\Sigma_{i=1}^m (y\_pred^{(i)} - y\_true^{(i)})^2$

In [19]:
class CLinearRegression:
    def __init__(self):
        self.W = None
        self.isNormalize = False
        
    
    def normalize(self, X):
        for i in range(X.shape[1]):
            mean = np.mean(X[:, i])
            X[:, i] = (X[:, i] - mean) / np.std(X[:, i])
        return X
    
    def rmse_error(self, y_true, y_pred):
        error = y_pred - y_true
        M = len(y_true)
        return (1/(2*M)) * np.sqrt(np.sum(error ** 2))
    
    # gradient_descent
    def fit(self, X, y_true, lr=0.001, iterations=1000, normalize=False):
        '''
        Parameters
        ---------------
        X : array
            Input array of size m X n, where m is the total number of training samples, and n is the total
            number of input features.
        
        y_true : array
            True Values for each sample.
        
        W : array
            The weights/coefficients, corresponding to each input feature, to be trained using Gradient Descent.
        
        lr : float
            lr is Learning Rate, the rate at which we decrease our weights.
            
        normalize : boolean
            Normalize the input samples, X, or not.
        '''
        
        M = X.shape[0]
        losses = []
        
        # Adding 1 dimension for the intercept W0
        W = np.random.randn(X.shape[1]+1)
        if normalize:
            X = self.normalize(X)
            self.isNormalize = True
#             y_true = (y_true - np.mean(y_true)) / np.std(y_true)
            
        # Storing our predictions
        y_pred = np.zeros(len(y_true))
        
        # Adding 1's for the intercept
        X = np.concatenate([np.ones(len(X)).reshape(-1, 1), X], axis=1)
        
        # Number of Gradient Descent steps(iterations) to take
        for i in range(iterations):
            
#             Predictions for each sample
            y_pred = np.matmul(X, W)
            error = y_pred - y_true

#             Cost to be minimized
            loss = self.rmse_error(y_pred, y_true)
            losses.append(loss)
#             if i % 100 == 0:
#                 print(f'Cost ar iteration {i} is {round(loss, 3)}')
            
            # Derivative of the cost with respect to all weights in W
            weight_ch = np.matmul(X.T, error) / M
            W -= lr * weight_ch

        self.W = W
        fig = go.Figure(go.Scatter(x=list(range(iterations)), y=losses))
#         fig.update_xaxes(title= 'Iteration')
#         fig.update_yaxes(title= 'Loss')
        fig.update_layout({'xaxis': {'title': 'Iteration'}, 'yaxis': {'title': 'Loss'}, 
                          'title': {'text' :'Loss after each Iteration', 'x': 0.5}, 
                           'template': 'simple_white'})
        return self, fig
    
    def predict(self, X_test):
        return np.matmul(X_test, self.W[1:]) + self.W[0]
    
    def r2_score(self, X, y_true):
        if self.isNormalize:
            X_copy = X.copy()
            X_copy = self.normalize(X_copy)
        y_preds = self.predict(X_copy).flatten()
        return metrics.r2_score(y_true, y_preds)

### Loading the KC house dataset

In [20]:
df = pd.read_csv('kc_house_data.csv')
df = df.drop(columns=['date', 'id']) # Dropping date column as it seems unnecessary
df_train = df.drop(columns=['price']) # Extracting input features
input_features = df_train.columns.values # Extracting input feature names
X = df_train.values # Extracting input feature values

target_feature = 'price'
y_true = df[target_feature].values.flatten() # Extracting target values

X_copy = X.copy() # Making a copy of input feature values to be used for training.

## Train-Test split

In [21]:
train_ratio = 0.8
N = len(X)
train_len = int(train_ratio * N)

X_train = X[ :train_len]
y_train_true = y_true[:train_len]

X_test = X[train_len: ]
y_test_true = y_true[train_len: ]

X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

In [22]:
clr = CLinearRegression()
clr, loss_fig = clr.fit(X_train_copy, y_train_true.copy(), lr=0.001, iterations=5000, normalize=True)

In [23]:
loss_fig.show()

In [24]:
features_impact_dict = {}
for i, feature in enumerate(input_features):
    features_impact_dict[feature] = clr.W[i+1] / max(clr.W[1:])

In [25]:
fig = go.Figure(go.Bar(x=input_features, y = list(features_impact_dict.values())))
fig.update_layout({'title': {'text': f'Correlation and Impact of input features on {target_feature}', 'x': 0.5}, 
                  'xaxis': {'title': 'Input Feature'}, 'yaxis': {'title': 'Impact Score', 'dtick': 0.2}})

In [29]:
clr_test_r2 = clr.r2_score(X_test, y_test_true)
print(f'R2 score for test dataset with Custom Linear Regression is : {round(clr_test_r2, 3)}')

R2 score for test dataset with Custom Linear Regression is : 0.677


# Comparing with sklearn's Linear Regression Class

## SKLearn Linear Regression

In [33]:
lin_reg_sk = LinearRegression(normalize=False).fit(X_copy, y_true)
lin_reg_sk.coef_

array([-3.57665414e+04,  4.11442785e+04,  1.10442865e+02,  1.28597869e-01,
        6.68955012e+03,  5.82960458e+05,  5.28709424e+04,  2.63856491e+04,
        9.58904452e+04,  7.07852244e+01,  3.96576400e+01, -2.62022321e+03,
        1.98125837e+01, -5.82419866e+02,  6.02748226e+05, -2.14729828e+05,
        2.16814005e+01, -3.82641850e-01])

In [34]:
skl_test_r2 = lin_reg_sk.score(X_test, y_test_true)
print(f'R2 score for test dataset with SKLearn\'s Linear Regression is : {round(skl_test_r2, 3)}')

R2 score for test dataset with SKLearn's Linear Regression is : 0.698
