# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
import glob
import matplotlib.pyplot as plt
import math
import cv2


%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

# Importing Data

In [3]:
complete_data = pd.read_excel("phase1_data_with_AgeGender.xlsx",0)

In [4]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   number   95 non-null     int64  
 1   mean_r   95 non-null     float64
 2   mean_g   95 non-null     float64
 3   mean_b   95 non-null     float64
 4   mean_rg  95 non-null     float64
 5   HHR      95 non-null     float64
 6   Ent      95 non-null     float64
 7   B        95 non-null     float64
 8   G1       95 non-null     float64
 9   G2       95 non-null     float64
 10  G3       95 non-null     float64
 11  G4       95 non-null     float64
 12  G5       95 non-null     float64
 13  gender   95 non-null     int64  
 14  age      95 non-null     int64  
 15  label    95 non-null     float64
dtypes: float64(13), int64(3)
memory usage: 12.0 KB


In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(complete_data, test_size=0.3)

In [6]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

# Algorithms

In [7]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [8]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

# Training and Testing

In [9]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 293
[LightGBM] [Info] Number of data points in the train set: 66, number of used features: 14
[LightGBM] [Info] Start training from score 11.562121
Learning rate set to 0.026648
0:	learn: 1.9031076	total: 103ms	remaining: 1m 43s
1:	learn: 1.8860192	total: 106ms	remaining: 52.8s
2:	learn: 1.8690448	total: 108ms	remaining: 35.9s
3:	learn: 1.8549770	total: 109ms	remaining: 27.2s
4:	learn: 1.8370942	total: 111ms	remaining: 22.1s
5:	learn: 1.8193806	total: 112ms	remaining: 18.6s
6:	learn: 1.7991468	total: 114ms	remaining: 16.1s
7:	learn: 1.7845196	total: 116ms	remaining: 14.3s
8:	learn: 1.7669294	total: 117ms	remaining: 12.9s
9:	learn: 1.7494948	total: 119ms	remaining: 11.8s
10:	learn: 1.7342712	total: 121ms	remaining: 10.9s
11:	learn: 1.72

# Results

In [10]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                             MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       0.909753   0.662183  1.249590e+00       2.260567     2.060537  3.569110e-01
SVM Regression          1.501809   0.011193  3.657617e+00       2.182071     0.455974  7.961348e-01
RandomForest            0.410606   0.929395  2.611692e-01       2.235866     1.474358  4.784616e-01
Gradient Boost          0.055837   0.998655  4.975994e-03       1.836202     1.098264  5.774507e-01
knn                     1.422424   0.244822  2.793418e+00       2.373471     2.768109  2.505606e-01
LGBM                    0.795414   0.758534  8.931876e-01       2.271688     3.960313  1.380476e-01
CatBoost                0.009427   0.999960  1.476228e-04       2.242454     0.560323  7.556618e-01
Kernel Ridge Regressor  0.912953   0.650521  1.292729e+00       2.191991     1.473842  4.785853e-01
Elastic Net             1.145498   0.461520  1.991847e+00       2.016095     

# To Classification

--> <10.5 --> Anemic

--> >=10.5 ---> Non-Anemic 

In [11]:
algorithm = CatBoostRegressor()

In [12]:
# Train model
algorithm.fit(X_train, y_train)

# Test predictions
y_test_pred = algorithm.predict(X_test)

Learning rate set to 0.026648
0:	learn: 1.9031076	total: 881us	remaining: 881ms
1:	learn: 1.8860192	total: 2.44ms	remaining: 1.22s
2:	learn: 1.8690448	total: 4.61ms	remaining: 1.53s
3:	learn: 1.8549770	total: 5.78ms	remaining: 1.44s
4:	learn: 1.8370942	total: 7.73ms	remaining: 1.54s
5:	learn: 1.8193806	total: 8.38ms	remaining: 1.39s
6:	learn: 1.7991468	total: 9.82ms	remaining: 1.39s
7:	learn: 1.7845196	total: 11.7ms	remaining: 1.45s
8:	learn: 1.7669294	total: 13.6ms	remaining: 1.49s
9:	learn: 1.7494948	total: 15.9ms	remaining: 1.58s
10:	learn: 1.7342712	total: 17.8ms	remaining: 1.6s
11:	learn: 1.7207773	total: 19.8ms	remaining: 1.63s
12:	learn: 1.7050406	total: 21.7ms	remaining: 1.64s
13:	learn: 1.6850778	total: 23ms	remaining: 1.62s
14:	learn: 1.6687475	total: 24.8ms	remaining: 1.63s
15:	learn: 1.6510773	total: 25.7ms	remaining: 1.58s
16:	learn: 1.6392048	total: 27.4ms	remaining: 1.59s
17:	learn: 1.6272184	total: 29.6ms	remaining: 1.62s
18:	learn: 1.6131555	total: 31.6ms	remaining: 1.

In [13]:
y_test_pred

array([11.77230965, 10.0735224 , 13.26694197, 11.70058297,  9.94529777,
       13.23613826, 12.1648111 , 12.76443877, 10.24526854, 11.96044141,
       12.72074858, 10.90463441, 11.04257926, 12.67272851, 11.09733359,
       12.77289353, 11.94898254, 12.83933748, 11.32610711, 12.69291221,
       13.24277032, 10.95561785, 11.66633462, 11.30432168, 10.18986574,
       11.10399759,  9.945453  , 11.35827196, 11.66621464])

In [14]:
actual_labels = (y_test<10.5).tolist()
pred_labels = (pd.Series(y_test_pred)<10.5).tolist()

In [15]:
TP=TN=FN=FP = 0
for i in range(len(actual_labels)):
    if(actual_labels[i]==True and pred_labels[i]==True):
        TP +=1
    if(actual_labels[i]==False and pred_labels[i]==False):
        TN +=1
    if(actual_labels[i]==True and pred_labels[i]==False):
        FN +=1
    if(actual_labels[i]==False and pred_labels[i]==True):
        FP +=1

print(f"Accuracy = {(TP+TN)/(TP+TN+FP+FN)}")
print(f"Precision = {(TP)/(TP+FP)}")
print(f"Sensitivity = {(TP)/(TP+FN)}")
print(f"Specificity = {(TN)/(TN+FP)}")

Accuracy = 0.7586206896551724
Precision = 1.0
Sensitivity = 0.4166666666666667
Specificity = 1.0


# LEAVE ONE OUT

In [7]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

In [8]:
cv = LeaveOneOut()

In [9]:
X = complete_data.drop(columns=['number','label'])
y = complete_data['label']

## XGB Regression

In [14]:
model = XGBRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [15]:
y_true = y
y_pred = y + scores

In [16]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.464939956665039
RMSE 1.2103470397638187
R-squared 0.24794043849618141


## CatBoost Regression

In [29]:
model = CatBoostRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [30]:
y_true = y
y_pred = y + scores

In [31]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2810226991910867
RMSE 1.1318227331128698
R-squared 0.42524365600251957


## RandomForest

In [26]:
model = RandomForestRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [27]:
y_true = y
y_pred = y + scores

In [28]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.3148842105263154
RMSE 1.146684006396843
R-squared 0.40392435637655866


## LGBM

In [20]:
model = LGBMRegressor()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [21]:
y_true = y
y_pred = y + scores

In [22]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.3488156290562205
RMSE 1.1613852199232693
R-squared 0.36334490898862537


## Linear Regression

In [10]:
model = LinearRegression()

#use LOOCV to evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

In [11]:
y_true = y
y_pred = y + scores

In [12]:
print(f"MAE {mean(absolute(scores))}")
print(f"RMSE {sqrt(mean(absolute(scores)))}")
print(f"R-squared {r2_score(y_true,y_pred)}")

MAE 1.2493069861138308
RMSE 1.1177240205497199
R-squared 0.4017154080044516


In [21]:
df = pd.DataFrame([y_true,y_pred],index=["y_true","y_pred"])

In [23]:
df.to_csv('linear_regression_95.csv',index = False)

# PyCaret

In [23]:
import pycaret

In [24]:
from pycaret.regression import *
s = setup(complete_data, target='label', ignore_features=['number'], preprocess=False, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(95, 16)"
4,Transformed data shape,"(95, 15)"
5,Transformed train set shape,"(66, 15)"
6,Transformed test set shape,"(29, 15)"
7,Ignore features,1
8,Numeric features,14


In [25]:
best_r = compare_models(sort = 'R2', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,1.1721,1.9673,1.347,0.3585,0.1097,0.1057,0.011
llar,Lasso Least Angle Regression,1.1722,1.9675,1.347,0.3585,0.1097,0.1057,0.013
en,Elastic Net,1.1548,1.9009,1.3191,0.3579,0.1074,0.104,0.018
br,Bayesian Ridge,1.163,1.9443,1.3349,0.3405,0.1088,0.1048,0.014
lr,Linear Regression,1.1935,2.0004,1.355,0.2751,0.1103,0.1077,0.337
huber,Huber Regressor,1.257,2.2072,1.4417,0.2297,0.1159,0.1112,0.022
ridge,Ridge Regression,1.19,2.0325,1.3596,0.2074,0.11,0.1067,0.013
et,Extra Trees Regressor,1.2487,2.2905,1.4665,0.194,0.1232,0.115,0.074
catboost,CatBoost Regressor,1.2818,2.4555,1.5185,0.1837,0.1262,0.1182,0.861
rf,Random Forest Regressor,1.2686,2.3471,1.4733,0.1764,0.1221,0.1171,0.095
