## Bechmarking MLRWRapper vs Sklearn

In [37]:
import pandas as pd
from sklearn.linear_model import LinearRegression #for benchmarking
from MLR.mlr_wrapper import MLRWrapper

### Data exploration

In [11]:
data = pd.read_csv("USDA_Clean.csv",index_col=0)

In [12]:
data

Unnamed: 0,ID,Description,Calories,Protein,TotalFat,Carbohydrate,Sodium,Cholesterol,Sugar,Calcium,Iron,Potassium,VitaminC,VitaminE,VitaminD,HighSodium,HighCals,HighSugar,HighProtein,HighFat
1,1001,"BUTTER,WITH SALT",717,0.85,81.11,0.06,714,215,0.06,24,0.02,24,0.0,2.32,1.5,1,1,0,0,1
2,1002,"BUTTER,WHIPPED,WITH SALT",717,0.85,81.11,0.06,827,219,0.06,24,0.16,26,0.0,2.32,1.5,1,1,0,0,1
3,1003,"BUTTER OIL,ANHYDROUS",876,0.28,99.48,0.00,2,256,0.00,4,0.00,5,0.0,2.80,1.8,0,1,0,0,1
4,1004,"CHEESE,BLUE",353,21.40,28.74,2.34,1395,75,0.50,528,0.31,256,0.0,0.25,0.5,1,1,0,1,1
5,1005,"CHEESE,BRICK",371,23.24,29.68,2.79,560,94,0.51,674,0.43,136,0.0,0.26,0.5,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7053,80200,"FROG LEGS,RAW",73,16.40,0.30,0.00,58,50,0.00,18,1.50,285,0.0,1.00,0.2,0,0,0,1,0
7054,83110,"MACKEREL,SALTED",305,18.50,25.10,0.00,4450,95,0.00,66,1.40,520,0.0,2.38,25.2,1,1,0,1,1
7055,90240,"SCALLOP,(BAY&SEA),CKD,STMD",111,20.54,0.84,5.41,667,41,0.00,10,0.58,314,0.0,0.00,0.0,1,0,0,1,0
7056,90560,"SNAIL,RAW",90,16.10,1.40,2.00,70,50,0.00,10,3.50,382,0.0,5.00,0.0,0,0,0,1,0


### For our **Multiple Linear Regression Model** we'll be using the formula:

$Calories = {\beta}0 + {\beta}1{\cdot}Protein + {\beta}2{\cdot}TotalFat + {\beta}3{\cdot}Carbohydrate + {\beta}4{\cdot}Sodium + {\beta}5{\cdot}Cholesterol$

Therefore we'll be using only these columns from our dataset

In [14]:
data = data[['Calories','Protein','TotalFat',"Carbohydrate","Sodium", "Cholesterol"]]

In [15]:
data

Unnamed: 0,Calories,Protein,TotalFat,Carbohydrate,Sodium,Cholesterol
1,717,0.85,81.11,0.06,714,215
2,717,0.85,81.11,0.06,827,219
3,876,0.28,99.48,0.00,2,256
4,353,21.40,28.74,2.34,1395,75
5,371,23.24,29.68,2.79,560,94
...,...,...,...,...,...,...
7053,73,16.40,0.30,0.00,58,50
7054,305,18.50,25.10,0.00,4450,95
7055,111,20.54,0.84,5.41,667,41
7056,90,16.10,1.40,2.00,70,50


#### Our target col is **Calories** 

We then fit the model

In [17]:
model = MLRWrapper(data, target_col='Calories')
model.fit()

In [21]:
for summary in model.get_model_summary():
    display(summary)

'Calories = 3.9883 + 3.9892 * Protein + 8.7717 * TotalFat + 3.7432 * Carbohydrate + 0.00033830208682474294 * Sodium + 0.011013810996956724 * Cholesterol'

Unnamed: 0,coeffs,P Value
b0,3.988275,1.867166e-16
Protein,3.989199,0.0
TotalFat,8.771698,0.0
Carbohydrate,3.7432,0.0
Sodium,0.000338,0.1223386
Cholesterol,0.011014,3.050157e-08


Unnamed: 0,Adjusted Rsquared,Rsquared,Mean Absolute Error,Mean Squared Error,F Test,RSS,TSS
Values,0.987652,0.987661,8.495767,358.135274,100922.221369,2257685.0,182976700.0


### Testing model on **Sklearn** 

In [54]:
Y = data['Calories']
X = data.drop(columns=['Calories'])

In [55]:
model = LinearRegression()
model.fit(X, Y)
print(f"Model coeffs: {model.coef_}")
print(f"Model intercept: {model.intercept_}")

Model coeffs: [3.98919944e+00 8.77169801e+00 3.74320006e+00 3.38302087e-04
 1.10138110e-02]
Model intercept: 3.988275261317881


 ### Benchmarking **mlr_cpp** vs **sklearn**


In [62]:
import time
from sklearn.metrics import r2_score

In [101]:
def benchmark_models(data, target='Calories'):
    Y = data[target]
    X = data.drop(columns=[target])

    # mlr_cpp
    cpp_model = MLRWrapper(data, target_col=target)
    t1 = time.time()
    cpp_model.fit()
    t2 = time.time()
    cpp_time = t2 - t1
    cpp_preds = cpp_model.predict(X)

    # sklearn
    sk_model = LinearRegression()
    t3 = time.time()
    sk_model.fit(X, Y)
    t4 = time.time()
    sk_time = t4 - t3
    sk_preds = sk_model.predict(X)
    
    print("\n--- Coefficients ---")
    print("mlr_cpp:", cpp_model.get_coefficients().flatten())
    print("sklearn:", sk_model.coef_)

    print("\n--- Intercepts ---")
    print("mlr_cpp:", cpp_model.get_coefficients().flatten()[0])
    print("sklearn:", sk_model.intercept_)

    print("\n--- Runtimes ---")
    print(f"mlr_cpp: {cpp_time:.6f}s")
    print(f"sklearn: {sk_time:.6f}s")

    print("\n--- R² Scores ---")
    print(f"mlr_cpp: {r2_score(Y, cpp_preds):.6f}")
    print(f"sklearn: {r2_score(Y, sk_preds):.6f}")



In [104]:
benchmark_models(data)


--- Coefficients ---
mlr_cpp: [3.98827526e+00 3.98919944e+00 8.77169801e+00 3.74320006e+00
 3.38302087e-04 1.10138110e-02]
sklearn: [3.98919944e+00 8.77169801e+00 3.74320006e+00 3.38302087e-04
 1.10138110e-02]

--- Intercepts ---
mlr_cpp: 3.9882752613173915
sklearn: 3.988275261317881

--- Runtimes ---
mlr_cpp: 0.000344s
sklearn: 0.004404s

--- R² Scores ---
mlr_cpp: 0.987661
sklearn: 0.987661
