# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("right_nail_data_new.csv")

In [4]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   number   287 non-null    int64  
 1   mean_r   287 non-null    float64
 2   mean_g   287 non-null    float64
 3   mean_b   287 non-null    float64
 4   mean_rg  287 non-null    float64
 5   HHR      287 non-null    float64
 6   Ent      287 non-null    float64
 7   B        287 non-null    float64
 8   G1       287 non-null    float64
 9   G2       287 non-null    float64
 10  G3       287 non-null    float64
 11  G4       287 non-null    float64
 12  G5       287 non-null    float64
 13  label    287 non-null    float64
dtypes: float64(13), int64(1)
memory usage: 31.5 KB


In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [6]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

# Algorithms

In [7]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [8]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [9]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 802
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 12
[LightGBM] [Info] Start training from score 11.623000
Learning rate set to 0.03175
0:	learn: 1.9932998	total: 150ms	remaining: 2m 30s
1:	learn: 1.9849732	total: 154ms	remaining: 1m 17s
2:	learn: 1.9772435	total: 158ms	remaining: 52.5s
3:	learn: 1.9701794	total: 163ms	remaining: 40.5s
4:	learn: 1.9609944	total: 167ms	remaining: 33.2s
5:	learn: 1.9533093	total: 172ms	remaining: 28.4s
6:	learn: 1.9476612	total: 176ms	remaining: 24.9s
7:	learn: 1.9419004	total: 179ms	remaining: 22.2s
8:	learn: 1.9332993	total: 183ms	remaining: 20.1s
9:	learn: 1.9241047	total: 187ms	remaining: 18.6s
10:	learn: 1.9145635	total: 192ms	remaining: 17.3s
11:	learn: 1.9

# Results

In [10]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       1.534366   0.085091  3.668026e+00       2.294034     3.984512  1.363874e-01
SVM Regression          1.502027   0.078941  3.692683e+00       2.296930     8.506589  1.421732e-02
RandomForest            0.609265   0.855810  5.780817e-01       2.171962     4.799099  9.075885e-02
Gradient Boost          0.513475   0.897246  4.119599e-01       2.152871     0.523454  7.697211e-01
knn                     1.338500   0.295362  2.825014e+00       1.970789     1.317767  5.174288e-01
LGBM                    0.682455   0.796133  8.173376e-01       2.114353    29.787529  3.401891e-07
CatBoost                0.135971   0.993309  2.682434e-02       2.034615     0.105040  9.488355e-01
Kernel Ridge Regressor  1.581657   0.030114  3.888439e+00       2.217633     1.264072  5.315086e-01
Elastic Net             1.559371   0.064539  3.750424e+00       2.281910     

# LEAVE ONE OUT

In [11]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [12]:
cv = LeaveOneOut()

In [13]:
X = complete_data.drop(columns=['number','label'])
y = complete_data['label']

## XGB Regression

In [14]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [15]:
y_true = y
y_pred = y + scores

In [16]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.6369827124299905
RMSE 1.2794462522630603
R-squared -0.10878761321151642


## CatBoost Regression

In [26]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [27]:
y_true = y
y_pred = y + scores

In [28]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.5725013995644808
RMSE 1.253994178441224
R-squared -0.004856923073837782


## RandomForest

In [29]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [30]:
y_true = y
y_pred = y + scores

In [31]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.5842787456445997
RMSE 1.2586813519094495
R-squared 0.002247044790696817


## LGBM

In [20]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [21]:
y_true = y
y_pred = y + scores

In [22]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.6379923468320652
RMSE 1.2798407505748772
R-squared -0.037078222019918394


## Linear Regression

In [17]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [18]:
y_true = y
y_pred = y + scores

In [19]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.5719599645996793
RMSE 1.253778275692987
R-squared 0.020773056665322698


# PyCaret

In [23]:
import pycaret

In [24]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(287, 14)"
4,Transformed data shape,"(287, 13)"
5,Transformed train set shape,"(200, 13)"
6,Transformed test set shape,"(87, 13)"
7,Ignore features,1
8,Numeric features,12


In [25]:
best_r = compare_models(sort = 'R2', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,1.58,3.8204,1.9368,-0.0525,0.1608,0.1465,0.017
lasso,Lasso Regression,1.5911,3.882,1.9536,-0.0719,0.1622,0.1477,0.013
llar,Lasso Least Angle Regression,1.5911,3.882,1.9536,-0.0719,0.1622,0.1477,0.018
ridge,Ridge Regression,1.5883,3.9102,1.9569,-0.0724,0.1623,0.1472,0.015
lr,Linear Regression,1.5884,3.9313,1.9628,-0.0793,0.1627,0.1472,0.418
dummy,Dummy Regressor,1.6078,3.9299,1.9673,-0.088,0.1633,0.1495,0.02
br,Bayesian Ridge,1.6043,3.9318,1.9677,-0.0891,0.1634,0.1491,0.017
omp,Orthogonal Matching Pursuit,1.6049,3.9403,1.9698,-0.0924,0.1637,0.1492,0.016
et,Extra Trees Regressor,1.6402,4.1108,2.0039,-0.1267,0.1656,0.1507,0.102
rf,Random Forest Regressor,1.6358,4.1498,2.016,-0.1387,0.1671,0.1514,0.12
