# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("tongue_data_new.csv")

In [4]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   number   298 non-null    int64  
 1   mean_r   298 non-null    float64
 2   mean_g   298 non-null    float64
 3   mean_b   298 non-null    float64
 4   mean_rg  298 non-null    float64
 5   HHR      298 non-null    float64
 6   Ent      298 non-null    float64
 7   B        298 non-null    float64
 8   G1       298 non-null    float64
 9   G2       298 non-null    float64
 10  G3       298 non-null    float64
 11  G4       298 non-null    float64
 12  G5       298 non-null    float64
 13  label    298 non-null    float64
dtypes: float64(13), int64(1)
memory usage: 32.7 KB


In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [6]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

# Algorithms

In [7]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [8]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [9]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 844
[LightGBM] [Info] Number of data points in the train set: 208, number of used features: 12
[LightGBM] [Info] Start training from score 11.690865
Learning rate set to 0.031947
0:	learn: 1.9842070	total: 164ms	remaining: 2m 43s
1:	learn: 1.9763012	total: 175ms	remaining: 1m 27s
2:	learn: 1.9698987	total: 183ms	remaining: 1m
3:	learn: 1.9616068	total: 192ms	remaining: 47.9s
4:	learn: 1.9540165	total: 205ms	remaining: 40.8s
5:	learn: 1.9480182	total: 219ms	remaining: 36.3s
6:	learn: 1.9382080	total: 226ms	remaining: 32s
7:	learn: 1.9294655	total: 235ms	remaining: 29.2s
8:	learn: 1.9234140	total: 243ms	remaining: 26.7s
9:	learn: 1.9159449	total: 254ms	remaining: 25.1s
10:	learn: 1.9102915	total: 263ms	remaining: 23.6s
11:	learn: 1.9026121	total: 271ms	remaining: 22.3s
12:	learn: 1.8950701	total: 285

# Results

In [10]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared       MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       1.506193   0.108202  3.538434       2.033549     7.167147  2.777626e-02
SVM Regression          1.529904   0.046081  3.784917       2.044769     4.236744  1.202272e-01
RandomForest            0.608567   0.851657  0.588588       2.156451     0.004740  9.976328e-01
Gradient Boost          0.608737   0.855196  0.574547       2.332101     1.871232  3.923442e-01
knn                     1.350385   0.270322  2.895181       2.077270     1.584588  4.528049e-01
LGBM                    0.685960   0.803795  0.778494       2.110978    62.828695  2.274690e-14
CatBoost                0.141669   0.992484  0.029820       2.251112     0.594913  7.427050e-01
Kernel Ridge Regressor  1.562679  -0.002565  3.977929       1.954454     7.683716  2.145371e-02
Elastic Net             1.517683   0.092126  3.602221       2.038960     5.739347  5.671745e-02
Bayesian Ridge   

# LEAVE ONE OUT

In [11]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [12]:
cv = LeaveOneOut()

In [13]:
X = complete_data.drop(columns=['number','label'])
y = complete_data['label']

## XGB Regression

In [14]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [15]:
y_true = y
y_pred = y + scores

In [16]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.784505593856709
RMSE 1.3358538819259795
R-squared -0.3460846194275238


## CatBoost Regression

In [26]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [27]:
y_true = y
y_pred = y + scores

In [28]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.7255936086217087
RMSE 1.3136185171585046
R-squared -0.21717358028650313


## RandomForest

In [29]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [30]:
y_true = y
y_pred = y + scores

In [31]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.6387147651006706
RMSE 1.2801229492125632
R-squared -0.10268384541276654


## LGBM

In [20]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [21]:
y_true = y
y_pred = y + scores

In [22]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.772154727248226
RMSE 1.3312230193503365
R-squared -0.2763449289924045


## Linear Regression

In [17]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [18]:
y_true = y
y_pred = y + scores

In [19]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.588442186548473
RMSE 1.2603341567014967
R-squared -0.01566835007952805


# PyCaret

In [23]:
import pycaret

In [24]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(298, 14)"
4,Transformed data shape,"(298, 13)"
5,Transformed train set shape,"(208, 13)"
6,Transformed test set shape,"(90, 13)"
7,Ignore features,1
8,Numeric features,12


In [25]:
best_r = compare_models(sort = 'R2', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,1.5487,3.7691,1.9004,-0.0926,0.1611,0.1492,0.016
lasso,Lasso Regression,1.5529,3.7994,1.9068,-0.0978,0.1615,0.1496,0.015
llar,Lasso Least Angle Regression,1.5529,3.7994,1.9068,-0.0978,0.1615,0.1496,0.014
br,Bayesian Ridge,1.5674,3.8728,1.9222,-0.1118,0.1628,0.1511,0.017
dummy,Dummy Regressor,1.5613,3.8886,1.9263,-0.1134,0.163,0.1503,0.017
ridge,Ridge Regression,1.55,3.8122,1.9162,-0.1145,0.1626,0.1491,0.016
omp,Orthogonal Matching Pursuit,1.5647,3.8967,1.9266,-0.1154,0.1631,0.1508,0.017
ada,AdaBoost Regressor,1.6033,3.9726,1.946,-0.1418,0.1656,0.1563,0.059
lr,Linear Regression,1.5692,3.9139,1.9402,-0.1452,0.1655,0.1509,0.412
knn,K Neighbors Regressor,1.5918,4.1136,1.9859,-0.1994,0.1686,0.1523,0.018
