# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("right_nail_data_final.csv")

In [4]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   number   287 non-null    int64  
 1   mean_r   287 non-null    float64
 2   mean_g   287 non-null    float64
 3   mean_b   287 non-null    float64
 4   mean_rg  287 non-null    float64
 5   HHR      287 non-null    float64
 6   Ent      287 non-null    float64
 7   B        287 non-null    float64
 8   G1       287 non-null    float64
 9   G2       287 non-null    float64
 10  G3       287 non-null    float64
 11  G4       287 non-null    float64
 12  G5       287 non-null    float64
 13  age      287 non-null    int64  
 14  gender   287 non-null    int64  
 15  label    287 non-null    float64
 16  label_2  287 non-null    int64  
dtypes: float64(13), int64(4)
memory usage: 38.2 KB


In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [6]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label','label_2'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label','label_2'])
y_test = test_data['label']

# Algorithms

In [7]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [8]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [9]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 847
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 14
[LightGBM] [Info] Start training from score 11.761000
Learning rate set to 0.03175
0:	learn: 2.0161783	total: 134ms	remaining: 2m 14s
1:	learn: 1.9960900	total: 141ms	remaining: 1m 10s
2:	learn: 1.9817393	total: 150ms	remaining: 49.7s
3:	learn: 1.9632867	total: 157ms	remaining: 39.2s
4:	learn: 1.9448039	total: 167ms	remaining: 33.3s
5:	learn: 1.9271535	total: 180ms	remaining: 29.9s
6:	learn: 1.9085551	total: 189ms	remaining: 26.9s
7:	learn: 1.8971011	total: 192ms	remaining: 23.8s
8:	learn: 1.8797524	total: 203ms	remaining: 22.4s
9:	learn: 1.8693058	total: 204ms	remaining: 20.2s
10:	learn: 1.8546442	total: 215ms	remaining: 19.3s
11:	learn: 1.8391836	total: 222ms	remaining: 18.3s
12:	learn: 1.8240800	total:

# Results

In [10]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       1.309279   0.313610  2.850082e+00       2.041581    72.804713  1.551163e-16
SVM Regression          1.515881   0.093298  3.764878e+00       2.002492     6.745567  3.429405e-02
RandomForest            0.515685   0.896443  4.299969e-01       2.135658    22.196131  1.514159e-05
Gradient Boost          0.422595   0.932333  2.809741e-01       2.150800     0.935355  6.264554e-01
knn                     1.482300   0.215659  3.256802e+00       1.960343     1.183720  5.532971e-01
LGBM                    0.564759   0.867365  5.507359e-01       1.944754   127.198732  2.394122e-28
CatBoost                0.079831   0.997664  9.698899e-03       2.071505     0.075925  9.627491e-01
Kernel Ridge Regressor  1.467942   0.143741  3.555424e+00       2.041785    10.088228  6.447170e-03
Elastic Net             1.604891   0.080013  3.820041e+00       2.024803     

# To Classification

--> <10.5 --> Anemic

--> >=10.5 ---> Non-Anemic 

In [32]:
algorithm = CatBoostRegressor()

In [None]:
# Train model
algorithm.fit(X_train, y_train)

# Test predictions
y_test_pred = algorithm.predict(X_test)

In [34]:
actual_labels = (y_test<10.5).tolist()
pred_labels = (pd.Series(y_test_pred)<10.5).tolist()

In [None]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

# LEAVE ONE OUT

In [11]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [12]:
cv = LeaveOneOut()

In [13]:
X = complete_data.drop(columns=['number','label','label_2'])
y = complete_data['label']

## XGB Regression

In [14]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [15]:
y_true = y
y_pred = y + scores

In [16]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.3698635991857442
RMSE 1.1704117220814836
R-squared 0.23126879137698608


## CatBoost Regression

In [29]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [30]:
y_true = y
y_pred = y + scores

In [31]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2767278286589028
RMSE 1.1299238154224835
R-squared 0.3021297123141824


## RandomForest

In [26]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [27]:
y_true = y
y_pred = y + scores

In [28]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2907177700348427
RMSE 1.1360976058573677
R-squared 0.2859332898567074


## LGBM

In [20]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [21]:
y_true = y
y_pred = y + scores

In [22]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.30513419504137
RMSE 1.1424246999436636
R-squared 0.264286894343034


## Linear Regression

In [17]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [18]:
y_true = y
y_pred = y + scores

In [19]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.3287530183300142
RMSE 1.1527154975665133
R-squared 0.24122648096185295


# PyCaret

In [23]:
import pycaret

In [24]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number','label_2'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(287, 17)"
4,Transformed data shape,"(287, 15)"
5,Transformed train set shape,"(200, 15)"
6,Transformed test set shape,"(87, 15)"
7,Ignore features,2
8,Numeric features,14


In [25]:
best_r = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,1.298,2.7486,1.6222,0.2619,0.1376,0.1221,0.155
et,Extra Trees Regressor,1.3158,2.9238,1.6723,0.2146,0.141,0.1233,0.113
ridge,Ridge Regression,1.3289,3.0277,1.6963,0.1832,0.1435,0.1264,0.021
catboost,CatBoost Regressor,1.3303,2.9535,1.6895,0.2029,0.1425,0.1248,2.962
lr,Linear Regression,1.3306,3.0463,1.7004,0.1778,0.1438,0.1266,0.838
ada,AdaBoost Regressor,1.3532,2.9737,1.6872,0.2038,0.1435,0.1285,0.072
lightgbm,Light Gradient Boosting Machine,1.3593,3.1696,1.7494,0.1239,0.1487,0.1286,0.838
br,Bayesian Ridge,1.3654,3.0905,1.7231,0.1684,0.1452,0.1291,0.033
gbr,Gradient Boosting Regressor,1.384,3.1662,1.7454,0.1314,0.1478,0.1308,0.092
xgboost,Extreme Gradient Boosting,1.3905,3.1311,1.7423,0.1336,0.1465,0.1291,0.09
