## Imports

In [82]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_percentage_error
from lightgbm import LGBMRegressor

import matplotlib.pyplot as plt

## Main Code

In [87]:

# Load the diabetes dataset
X, y = load_diabetes(return_X_y=True)

df = pd.DataFrame(data= np.c_[X, y], columns= ["Feature_"+str(i) for i in range(X.shape[1])] + ["Target"])
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit linear regression model to the training data
reg = LinearRegression().fit(X_train, y_train)
lightGBM=LGBMRegressor().fit(X_train, y_train)



# Make a prediction on the test data
predictions = reg.predict(X_test)

# predictions = lightGBM.predict(X_test)

In [76]:
df.head()

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [78]:
data = pd.DataFrame({'y_test': y_test, 'predictions': predictions})

In [77]:
data.head()

Unnamed: 0,y_test,predictions
0,84.0,250.782942
1,57.0,84.569732
2,39.0,96.478183
3,292.0,196.569272
4,138.0,72.833045


Lets code funcions to calculate metrics

In [79]:
def mse(predictions,target):
    
    '''
    mse = (target-predicions)**2/N
    '''
    
    mse =((target-predictions)**2).sum()/len(target)
    return mse


def mae(predictions,target):
    
    '''
    mae = |target-predicions|/N
    '''
    
    mae =(np.abs(target-predictions)).sum()/len(target)
    return mae


def rmse(predictions,target):
    
    '''
    rmse = ((target-predicions)**2/N)**0.5
    '''
    
    rmse =(((target-predictions)**2).sum()/len(target))**0.5
    return rmse


def r2(predictions,target):
    
    '''
    R² = 1 - (Σ(y_i - ŷ_i)^2 / Σ(y_i - y_mean)^2)
    '''
    
    r2 =1-((((target-predictions)**2)).sum()/((target-target.mean())**2).sum())
    return r2

def mape(predictions,target):
    
    '''
    MAPE= (1/n) * Σ|(y_i - ŷ_i) / y_i|
    '''
    
    mape =(((np.abs(target-predictions))/target).sum()/len(target))
    return mape



In [80]:
print('mse',mse(data['predictions'],data['y_test']))
print('mae',mae(data['predictions'],data['y_test']))
print('rmse',rmse(data['predictions'],data['y_test']))
print('r2',r2(data['predictions'],data['y_test']))
print('mape',mape(data['predictions'],data['y_test']))


mse 2946.4267285313267
mae 42.55350042182533
rmse 54.28099785865517
r2 0.5203306294080065
mape 0.35308293415817177


Lets compare with results from sklearn

In [74]:


# Mean Squared Error
mse = mean_squared_error(data['y_test'], data['predictions'])
print("Mean Squared Error:", mse)

# Mean Absolute Error
mae = mean_absolute_error(data['y_test'], data['predictions'])
print("Mean Absolute Error:", mae)


# Root Mean Squared Error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R-squared
r2 = r2_score(data['y_test'], data['predictions'])
print("R-squared:", r2)

# Mean Absolute Percentage Error
mape = mean_absolute_percentage_error(data['y_test'], data['predictions'])
print("Mean Absolute Percentage Error:", mape)

Mean Squared Error: 4111.807101484951
Mean Absolute Error: 50.49368425995852
Root Mean Squared Error: 64.12337406503927
R-squared: 0.39181622705952135
Mean Absolute Percentage Error: 0.47468189611275374


## Conclusion


Below are simple explanations for r2 and mape

**The R-squared (R²)** 
metric is a measure of how well the independent variables in a regression model are able to predict the dependent variable. It is a value between 0 and 1, where a value of 1 indicates that the model perfectly explains the variation in the dependent variable, and a value of 0 indicates that the model explains none of the variation in the dependent variable.

In simpler terms, R-squared measures the proportion of the variance in the dependent variable that is predictable from the independent variable(s). It tells us how well the model fits the data.

For example, if an R-squared value of 0.8 is obtained, it means that 80% of the variance in the dependent variable is explained by the independent variable(s).

It is important to note that a high R-squared value does not necessarily indicate that a model is a good fit for the data. It only indicates that the model explains a large portion of the variance in the dependent variable. Other factors, such as the presence of outliers or the ability of the model to make accurate predictions on new unseen data, should also be considered when evaluating the performance of a model.

R2 is from 0 to 1. Closer to 1its better

**The Mean Absolute Percentage Error (MAPE)** is a metric used to evaluate the accuracy of a model's predictions. It measures the average percentage difference between the predicted values and the actual values.

The formula for MAPE is:
MAPE = (1/n) * Σ|(y_i - ŷ_i) / y_i|

Where y_i is the actual value and ŷ_i is the predicted value for the i-th sample, and n is the total number of samples.

In simpler terms, MAPE is the average of the absolute percentage differences between the predicted values and the actual values. It is a measure of how close the predictions are to the actual values, with a lower value indicating a better fit.

For example, if a model has a MAPE of 10%, it means that the average difference between the predicted values and the actual values is 10% of the actual value.

It is important to note that MAPE is sensitive to outliers, if there are very large values in the dataset it will have a significant impact on the value of MAPE, so it should be used with care and it is not recommended to use it when the dataset has very large values.