# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("left_palm_data.csv")

In [4]:
# from sklearn.model_selection import train_test_split

# train_data, _ = train_test_split(complete_data, test_size=0.1)

In [5]:
test_data = pd.read_csv("left_palm_cutout_data.csv")

In [6]:
train_data = complete_data[~complete_data["number"].isin(test_data["number"])]

In [7]:
list1 = set(train_data["number"].tolist())
list2 = set(test_data["number"].tolist())

if (list1 & list2):
    print(list1 & list2)
else:
    print("No common elements")

No common elements


In [8]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

# Algorithms

In [9]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [10]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [11]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 270, number of used features: 12
[LightGBM] [Info] Start training from score 11.508889
Learning rate set to 0.033292
0:	learn: 1.9354017	total: 149ms	remaining: 2m 28s
1:	learn: 1.9295535	total: 154ms	remaining: 1m 16s
2:	learn: 1.9221771	total: 159ms	remaining: 52.7s
3:	learn: 1.9148027	total: 164ms	remaining: 40.7s
4:	learn: 1.9084378	total: 168ms	remaining: 33.5s
5:	learn: 1.9010845	total: 173ms	remaining: 28.7s
6:	learn: 1.8950915	total: 178ms	remaining: 25.2s
7:	learn: 1.8906525	total: 183ms	remaining: 22.7s
8:	learn: 1.8852365	total: 188ms	remaining: 20.7s
9:	learn: 1.8792939	total: 192ms	remaining: 19s
10:	learn: 1.8737280	total: 197ms	remaining: 17.7s
11:	learn: 1.8674430	total: 202ms	remaining: 16.7s
12:	learn: 1.8617140	total:

# Results

In [12]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       1.482034   0.105275  3.375959e+00       1.881707     0.827768  6.610776e-01
SVM Regression          1.533037   0.023316  3.685203e+00       1.920632     3.255104  1.964098e-01
RandomForest            0.598622   0.852537  5.564045e-01       1.949008     1.113971  5.729336e-01
Gradient Boost          0.720515   0.784852  8.117904e-01       1.991758     3.695114  1.576218e-01
knn                     1.289852   0.255300  2.809889e+00       2.011932    36.453840  1.213806e-08
LGBM                    0.591531   0.839648  6.050355e-01       1.788489    58.347397  2.138079e-13
CatBoost                0.177409   0.986821  4.972674e-02       1.797522     3.414648  1.813505e-01
Kernel Ridge Regressor  1.524615   0.050115  3.584089e+00       1.865282     0.462932  7.933696e-01
Elastic Net             1.503089   0.084091  3.455890e+00       1.882476     

# LEAVE ONE OUT

In [47]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [48]:
cv = LeaveOneOut()

In [49]:
X = test_data.drop(columns=['number','label'])
y = test_data['label']

## XGB Regression

In [50]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [51]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## CatBoost Regression

In [None]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [None]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## RandomForest

In [None]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [None]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## LGBM

In [56]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [57]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

## Linear Regression

In [53]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [54]:
y_true = y
y_pred = y + scores

In [None]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

# PyCaret

In [13]:
import pycaret

In [14]:
from pycaret.regression import *
s = setup(train_data, target='label', ignore_features=['number'], test_data=test_data, preprocess=False, index=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(285, 14)"
4,Transformed data shape,"(285, 13)"
5,Transformed train set shape,"(270, 13)"
6,Transformed test set shape,"(15, 13)"
7,Ignore features,1
8,Numeric features,12


In [15]:
best_r = compare_models(sort = 'R2', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,1.5379,3.6241,1.8894,-0.0192,0.1581,0.1437,0.015
lasso,Lasso Regression,1.5424,3.662,1.8976,-0.0219,0.1589,0.1442,0.016
llar,Lasso Least Angle Regression,1.5424,3.662,1.8976,-0.0219,0.1589,0.1442,0.015
br,Bayesian Ridge,1.5527,3.7085,1.9083,-0.0364,0.1596,0.1453,0.015
omp,Orthogonal Matching Pursuit,1.5562,3.7542,1.9181,-0.039,0.1604,0.1456,0.014
dummy,Dummy Regressor,1.5708,3.8031,1.9327,-0.0529,0.1617,0.1468,0.012
ridge,Ridge Regression,1.5597,3.7513,1.9269,-0.0814,0.1618,0.1454,0.015
huber,Huber Regressor,1.5936,3.9047,1.9628,-0.108,0.1635,0.1471,0.025
lr,Linear Regression,1.5815,3.8639,1.9563,-0.1158,0.1642,0.1471,1.269
ada,AdaBoost Regressor,1.6006,3.9747,1.9793,-0.139,0.1661,0.1515,0.055
