# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("right_nail_data.csv")

In [4]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [5]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

# Algorithms

In [6]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [7]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [8]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 825
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 12
[LightGBM] [Info] Start training from score 11.659524
Learning rate set to 0.031996
0:	learn: 1.9327418	total: 137ms	remaining: 2m 17s
1:	learn: 1.9251705	total: 144ms	remaining: 1m 11s
2:	learn: 1.9174860	total: 150ms	remaining: 49.7s
3:	learn: 1.9126919	total: 155ms	remaining: 38.5s
4:	learn: 1.9041585	total: 162ms	remaining: 32.3s
5:	learn: 1.8996892	total: 168ms	remaining: 27.8s
6:	learn: 1.8947983	total: 173ms	remaining: 24.6s
7:	learn: 1.8878394	total: 179ms	remaining: 22.2s
8:	learn: 1.8820894	total: 184ms	remaining: 20.3s
9:	learn: 1.8749274	total: 189ms	remaining: 18.7s
10:	learn: 1.8692634	total: 193ms	remaining: 17.4s
11:	learn: 1.

# Results

In [9]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared       MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       1.516980   0.086608  3.433686       1.948809     0.274879  8.715870e-01
SVM Regression          1.550516   0.030914  3.643052       1.959585     0.528145  7.679178e-01
RandomForest            0.610457   0.838171  0.608360       1.932304     0.305284  8.584368e-01
Gradient Boost          0.574206   0.860754  0.523462       2.053880     1.584054  4.529258e-01
knn                     1.382571   0.222936  2.921190       1.872536     0.222578  8.946802e-01
LGBM                    0.651437   0.817441  0.686287       1.897582     0.699361  7.049133e-01
CatBoost                0.162256   0.989141  0.040821       2.013639     1.852478  3.960404e-01
Kernel Ridge Regressor  1.653301  -0.077497  4.050600       2.022329     1.220977  5.430854e-01
Elastic Net             1.563162   0.044507  3.591951       1.987414     0.828301  6.609016e-01
Bayesian Ridge   

# LEAVE ONE OUT

In [10]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [11]:
cv = LeaveOneOut()

In [12]:
X = complete_data.drop(columns=['number','label'])
y = complete_data['label']

## XGB Regression

In [13]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [14]:
y_true = y
y_pred = y + scores

In [15]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.7466036052703855
RMSE 1.3215913155247296
R-squared -0.21144345503621031


## CatBoost Regression

In [28]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

KeyboardInterrupt: 

In [None]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## RandomForest

In [None]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [None]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## LGBM

In [25]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [26]:
y_true = y
y_pred = y + scores

In [27]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.661578860165762
RMSE 1.289022443623757
R-squared -0.18342093152764272


## Linear Regression

In [22]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [23]:
y_true = y
y_pred = y + scores

In [24]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.5920758205832493
RMSE 1.2617748692152848
R-squared 0.019444425902074625


# PyCaret

In [19]:
import pycaret

In [20]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(300, 14)"
4,Transformed data shape,"(300, 13)"
5,Transformed train set shape,"(210, 13)"
6,Transformed test set shape,"(90, 13)"
7,Ignore features,1
8,Numeric features,12


In [21]:
best_r = compare_models(sort = 'R2', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,1.6093,3.9879,1.9576,-0.0744,0.1652,0.1538,0.021
lasso,Lasso Regression,1.6087,4.0321,1.9667,-0.0823,0.1659,0.1539,0.022
llar,Lasso Least Angle Regression,1.6087,4.0321,1.9667,-0.0823,0.1659,0.1539,0.02
lr,Linear Regression,1.6109,3.9812,1.9587,-0.0879,0.1653,0.1535,2.774
omp,Orthogonal Matching Pursuit,1.6141,4.1196,1.986,-0.0969,0.1674,0.1547,0.022
br,Bayesian Ridge,1.6225,4.121,1.9862,-0.0988,0.1674,0.1553,0.022
ridge,Ridge Regression,1.6186,4.0152,1.9716,-0.1032,0.1662,0.1541,0.02
dummy,Dummy Regressor,1.6132,4.1953,2.0055,-0.1226,0.1688,0.1547,0.019
ada,AdaBoost Regressor,1.6657,4.4421,2.0612,-0.1987,0.1741,0.1601,0.078
huber,Huber Regressor,1.6835,4.3058,2.047,-0.2033,0.1712,0.1581,0.034
