# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_csv("left_eye_data_final.csv")

In [4]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   number   73 non-null     int64  
 1   mean_r   73 non-null     float64
 2   mean_g   73 non-null     float64
 3   mean_b   73 non-null     float64
 4   mean_rg  73 non-null     float64
 5   HHR      73 non-null     float64
 6   Ent      73 non-null     float64
 7   B        73 non-null     float64
 8   G1       73 non-null     float64
 9   G2       73 non-null     float64
 10  G3       73 non-null     float64
 11  G4       73 non-null     float64
 12  G5       73 non-null     float64
 13  age      73 non-null     int64  
 14  gender   73 non-null     int64  
 15  label    73 non-null     float64
 16  label_2  73 non-null     int64  
dtypes: float64(13), int64(4)
memory usage: 9.8 KB


In [128]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [129]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label','label_2'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label','label_2'])
y_test = test_data['label']

# Algorithms

In [130]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [131]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [132]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 225
[LightGBM] [Info] Number of data points in the train set: 51, number of used features: 14
[LightGBM] [Info] Start training from score 11.400000
Learning rate set to 0.025584
0:	learn: 1.8985495	total: 1.25ms	remaining: 1.25s
1:	learn: 1.8836366	total: 2.7ms	remaining: 1.34s
2:	learn: 1.8686289	total: 4.36ms	remaining: 1.45s
3:	learn: 1.8537045	total: 5.63ms	remaining: 1.4s
4:	learn: 1.8416792	total: 7.32ms	remaining: 1.46s
5:	learn: 1.8286584	total: 8.98ms	remaining: 1.49s
6:	learn: 1.8134655	total: 10.6ms	remaining: 1.51s
7:	learn: 1.8002252	total: 12.4ms	remaining: 1.53s
8:	learn: 1.7883307	total: 14.1ms	remaining: 1.55s
9:	learn: 1.7768659	total: 16.4ms	remaining: 1.62s
10:	learn: 1.7653714	total: 18.1ms	remaining: 1.63s
11:	learn: 1.7544384	total: 19.8ms	remaining: 1.63s
12:	learn: 1.743021

# Results

In [133]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera  JB P-value
Linear Regression       1.094329   0.466720  1.948668e+00       2.478027     3.128449    0.209250
SVM Regression          1.397268   0.086268  3.338885e+00       2.134515     1.170526    0.556959
RandomForest            0.520078   0.881595  4.326671e-01       2.423818     0.357010    0.836520
Gradient Boost          0.085272   0.996861  1.147055e-02       2.680790     0.256976    0.879424
knn                     1.249020   0.343187  2.400071e+00       2.301694     2.153477    0.340705
LGBM                    1.075410   0.492040  1.856145e+00       2.359217     1.475643    0.478154
CatBoost                0.018344   0.999855  5.291362e-04       2.280142     0.286125    0.866700
Kernel Ridge Regressor  1.349990   0.232274  2.805361e+00       2.121162     0.691722    0.707611
Elastic Net             1.367772   0.162824  3.059139e+00       2.246705     2.542104    0.2805

# To Classification

--> <10.5 --> Anemic

--> >=10.5 ---> Non-Anemic 

In [134]:
algorithm = GradientBoostingRegressor()

In [135]:
# Train model
algorithm.fit(X_train, y_train)

# Test predictions
y_test_pred = algorithm.predict(X_test)

In [136]:
actual_labels = (y_test<10.5).tolist()
pred_labels = (pd.Series(y_test_pred)<10.5).tolist()

In [None]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

# LEAVE ONE OUT

In [33]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [34]:
cv = LeaveOneOut()

In [35]:
X = complete_data.drop(columns=['number','label','label_2'])
y = complete_data['label']

## XGB Regression

In [36]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [37]:
y_true = y
y_pred = y + scores

In [38]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.7054776100263203
RMSE 1.3059393592454132
R-squared -0.23398984280381185


## CatBoost Regression

In [60]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [61]:
y_true = y
y_pred = y + scores

In [62]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.256329765310565
RMSE 1.120861171292219
R-squared 0.2908871697166032


## RandomForest

In [57]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [58]:
y_true = y
y_pred = y + scores

In [59]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2905616438356162
RMSE 1.136028892165871
R-squared 0.21226101897218674


## LGBM

In [44]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [45]:
y_true = y
y_pred = y + scores

In [46]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.3045049104624071
RMSE 1.1421492505195663
R-squared 0.2525452351959194


In [47]:
actual_labels = (y_true<10.5).tolist()
pred_labels = (y_pred<10.5).tolist()

In [48]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

Accuracy = 0.7671232876712328
Precision = 0.5526315789473685
Sensitivity = 1.0
Specificity = 0.6730769230769231


## Linear Regression

In [39]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [40]:
y_true = y
y_pred = y + scores

In [41]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2996084931052803
RMSE 1.1400037250400896
R-squared 0.20811772627392944


In [42]:
actual_labels = (y_true<10.5).tolist()
pred_labels = (y_pred<10.5).tolist()

In [43]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

Accuracy = 0.8082191780821918
Precision = 0.6
Sensitivity = 1.0
Specificity = 0.7307692307692307


# PyCaret

In [49]:
import pycaret

In [50]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number','label_2'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(73, 17)"
4,Transformed data shape,"(73, 15)"
5,Transformed train set shape,"(51, 15)"
6,Transformed test set shape,"(22, 15)"
7,Ignore features,2
8,Numeric features,14


In [51]:
best_r = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,1.2918,2.5258,1.5135,-0.8803,0.123,0.1164,0.04
ada,AdaBoost Regressor,1.4,3.0968,1.657,-1.3271,0.1362,0.1284,0.068
catboost,CatBoost Regressor,1.4062,2.9234,1.5994,-1.1086,0.13,0.1286,0.897
knn,K Neighbors Regressor,1.4257,3.2132,1.6604,-1.0313,0.1344,0.1308,0.022
dummy,Dummy Regressor,1.4605,3.4954,1.724,-0.8337,0.1381,0.1332,0.011
et,Extra Trees Regressor,1.4614,3.551,1.7949,-2.1082,0.1506,0.1344,0.09
rf,Random Forest Regressor,1.4733,3.3113,1.7184,-1.5471,0.1402,0.1351,0.093
br,Bayesian Ridge,1.5364,3.6764,1.805,-1.1871,0.1449,0.1395,0.015
en,Elastic Net,1.5394,3.8786,1.8736,-2.2763,0.1561,0.1397,0.013
lasso,Lasso Regression,1.5484,3.8435,1.8766,-2.0419,0.1547,0.1404,0.016


In [52]:
best_r

In [53]:
predict = predict_model(best_r)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,1.4202,3.3967,1.843,0.2251,0.1568,0.1374


In [54]:
predict.columns.values

array(['mean_r', 'mean_g', 'mean_b', 'mean_rg', 'HHR', 'Ent', 'B', 'G1',
       'G2', 'G3', 'G4', 'G5', 'age', 'gender', 'label',
       'prediction_label'], dtype=object)

In [55]:
actual_labels = (predict['label']<10.5).tolist()
pred_labels = (predict['prediction_label']<10.5).tolist()

In [56]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

Accuracy = 0.5909090909090909
Precision = 0.3333333333333333
Sensitivity = 0.125
Specificity = 0.8571428571428571
