In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fingernail-all/FingerNail_Features.xlsx
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709974143530.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1710051134288.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709800917321.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709976901316.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709723206678.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709795312517.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709720647227.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1710059890678.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709636926027.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1710059288351.jpg
/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked/1709985772004.jpg
/kaggle/input/fingernail-all/Fing

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
# Read Excel file and select needed columns
df = pd.read_excel('/kaggle/input/fingernail-all/FingerNail_Features.xlsx')
df = df[['Image_Name', 'Hb Value']]

In [4]:
# Split data into train (approx. 70%), validation (approx. 15%), and test (15%)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.176, random_state=42)

In [5]:
# Function to load and preprocess images
def load_image_and_label(image_file, label):
    image = tf.io.read_file(image_file)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])  # Resize for ResNet50
    image = image / 255.0  # Normalize to [0, 1]
    return image, label

# Function to create TensorFlow datasets
def create_tf_dataset(df, images_folder, batch_size=16, shuffle=True):
    image_paths = [os.path.join(images_folder, img) for img in df['Image_Name'].values]
    labels = df['Hb Value'].values.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    dataset = dataset.map(load_image_and_label, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Define your image folder path and create datasets
images_folder = "/kaggle/input/fingernail-all/Fingernail Masked/Fingernail Masked"
train_ds = create_tf_dataset(train_df, images_folder, batch_size=16, shuffle=True)
val_ds   = create_tf_dataset(val_df,   images_folder, batch_size=16, shuffle=False)
test_ds  = create_tf_dataset(test_df,  images_folder, batch_size=16, shuffle=False)

In [6]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load ResNet50 base (without top) with pretrained ImageNet weights
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model layers initially
for layer in base_model.layers:
    layer.trainable = False

# Build the regression head on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu', name='feature_dense')(x)  # Named layer for later feature extraction
x = Dropout(0.5)(x)
predictions = Dense(1, activation='linear')(x)  # Linear activation for regression

# Construct the full model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss='mean_absolute_error', metrics=['mae'])

# Train the new head (freeze base model)
history = model.fit(train_ds, validation_data=val_ds, epochs=50)


Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 511ms/step - loss: 8.2822 - mae: 8.2822 - val_loss: 2.0270 - val_mae: 2.0270
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 2.1869 - mae: 2.1869 - val_loss: 2.0367 - val_mae: 2.0367
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 1.7122 - mae: 1.7122 - val_loss: 1.5336 - val_mae: 1.5336
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 1.7640 - mae: 1.7640 - val_loss: 1.6118 - val_mae: 1.6118
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 1.7965 - mae: 1.7965 - val_loss: 1.6003 - val_mae: 1.6003
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 1.7265 - mae: 1.7265 - val_loss: 1.5829 - val_mae: 1.5829
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss:

In [7]:
# Unfreeze the last 20 layers of the base model for fine tuning
for layer in base_model.layers[-20:]:
    layer.trainable = True

# Re-compile the model with a lower learning rate for fine tuning
model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_absolute_error', metrics=['mae'])

# Fine tune the model further
history_ft = model.fit(train_ds, validation_data=val_ds, epochs=100)

# Optionally, evaluate the fine tuned model on the test dataset
test_mae = model.evaluate(test_ds)
print("Test MAE after fine tuning:", test_mae)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 561ms/step - loss: 2.0972 - mae: 2.0972 - val_loss: 1.8238 - val_mae: 1.8238
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 1.7447 - mae: 1.7447 - val_loss: 1.8411 - val_mae: 1.8411
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 1.5854 - mae: 1.5854 - val_loss: 1.6909 - val_mae: 1.6909
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 1.4840 - mae: 1.4840 - val_loss: 1.5108 - val_mae: 1.5108
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 1.5021 - mae: 1.5021 - val_loss: 1.5910 - val_mae: 1.5910
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 1.3613 - mae: 1.3613 - val_loss: 1.9844 - val_mae: 1.9844
Epoch 7/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step 

In [8]:
# Create a feature extractor model that outputs the activations of the "feature_dense" layer
feature_extractor = Model(inputs=model.input, outputs=model.get_layer("feature_dense").output)

# Define a helper function to extract features from a TF dataset
def extract_features(dataset, extractor_model):
    features_list = []
    labels_list = []
    for batch_images, batch_labels in dataset:
        features = extractor_model.predict(batch_images)
        features_list.append(features)
        labels_list.append(batch_labels.numpy())
    features_array = np.concatenate(features_list, axis=0)
    labels_array = np.concatenate(labels_list, axis=0)
    return features_array, labels_array

# Extract features and labels from each dataset
train_features, train_labels = extract_features(train_ds, feature_extractor)
val_features, val_labels     = extract_features(val_ds, feature_extractor)
test_features, test_labels   = extract_features(test_ds, feature_extractor)

print("Train features shape:", train_features.shape)
print("Validation features shape:", val_features.shape)
print("Test features shape:", test_features.shape)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms

In [9]:
!pip install pymrmr



In [10]:
import pandas as pd
import numpy as np
import pymrmr

# Assume 'train_features' and 'train_labels' are from the feature extractor
# Create a DataFrame with columns f0, f1, ..., and the target "Hb Value"
train_feat_df = pd.DataFrame(train_features, columns=[f"f{i}" for i in range(train_features.shape[1])])
train_feat_df['Hb Value'] = train_labels

# Discretize each feature column into 10 quantile bins.
for col in train_feat_df.columns[:-1]:  # Exclude the target column
    # qcut will bin the continuous values into 10 categories.
    train_feat_df[col] = pd.qcut(train_feat_df[col], q=10, duplicates='drop').cat.codes

# Discretize the target as well.
train_feat_df['Hb Value'] = pd.qcut(train_feat_df['Hb Value'], q=10, duplicates='drop').cat.codes


In [11]:
# Define the number of top features you wish to select
num_top_features = 100

# Run mRMR to select features
selected_features = pymrmr.mRMR(train_feat_df, 'MIQ', num_top_features)
print("Selected features:", selected_features)

# Convert feature names (e.g., "f23") to column indices
selected_indices = []

for feat in selected_features:
    # Ensure the feature name starts with 'f' and is followed by digits
    if feat.startswith('f') and feat[1:].isdigit():
        selected_indices.append(int(feat[1:]))
    else:
        print(f"Skipping non-standard feature: {feat}")

print("Selected feature indices:", selected_indices)


Selected features: ['f301', 'f112', 'f822', 'f561', 'f300', 'f722', 'f408', 'f291', 'f384', 'f623', 'f956', 'f275', 'f716', 'f262', 'f800', 'f723', 'f466', 'f699', 'f985', 'f786', 'f340', 'f404', 'f1011', 'f547', 'f731', 'f735', 'f1006', 'f867', 'f220', 'f496', 'f524', 'f792', 'f691', 'f273', 'f298', 'f854', 'f143', 'f109', 'f572', 'f355', 'f750', 'f567', 'f899', 'Hb Value', 'f117', 'f202', 'f804', 'f648', 'f410', 'f303', 'f345', 'f249', 'f845', 'f915', 'f20', 'f721', 'f19', 'f312', 'f1000', 'f618', 'f624', 'f900', 'f930', 'f11', 'f59', 'f42', 'f1023', 'f377', 'f841', 'f888', 'f997', 'f909', 'f27', 'f990', 'f130', 'f586', 'f724', 'f366', 'f152', 'f80', 'f141', 'f855', 'f761', 'f686', 'f264', 'f688', 'f153', 'f856', 'f951', 'f829', 'f754', 'f656', 'f51', 'f614', 'f375', 'f441', 'f550', 'f292', 'f43', 'f576']
Skipping non-standard feature: Hb Value
Selected feature indices: [301, 112, 822, 561, 300, 722, 408, 291, 384, 623, 956, 275, 716, 262, 800, 723, 466, 699, 985, 786, 340, 404, 1011

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Slice the continuous features using the selected indices
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Train the regression model on the training set
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, train_labels)

# Predict on the validation set
val_preds = regressor.predict(X_val)
val_mae = mean_absolute_error(val_labels, val_preds)
print("Validation MAE:", val_mae)

# Predict on the test set
test_preds = regressor.predict(X_test)
test_mae = mean_absolute_error(test_labels, test_preds)
print("Test MAE:", test_mae)


Validation MAE: 1.7732221975023783
Test MAE: 1.6717619604534568


In [13]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Slice the continuous features using the selected indices from mRMR
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Initialize XGBoost regressor with typical hyperparameters
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',  # For regression tasks
    n_estimators=200,               # Number of trees
    learning_rate=0.05,             # Step size shrinkage
    max_depth=6,                    # Maximum depth of a tree
    random_state=42
)

# Train the XGBoost model, using the validation set for early stopping
xgb_regressor.fit(
    X_train, 
    train_labels, 
    eval_set=[(X_val, val_labels)],
    early_stopping_rounds=10,
    verbose=True
)

# Make predictions on the validation and test sets
val_preds = xgb_regressor.predict(X_val)
test_preds = xgb_regressor.predict(X_test)

# Calculate Mean Absolute Error (MAE) for validation and test sets
val_mae = mean_absolute_error(val_labels, val_preds)
test_mae = mean_absolute_error(test_labels, test_preds)

print("Validation MAE:", val_mae)
print("Test MAE:", test_mae)


[0]	validation_0-rmse:2.03410
[1]	validation_0-rmse:2.01665
[2]	validation_0-rmse:1.99995
[3]	validation_0-rmse:1.98870
[4]	validation_0-rmse:1.97871
[5]	validation_0-rmse:1.97515
[6]	validation_0-rmse:1.97350
[7]	validation_0-rmse:1.96844
[8]	validation_0-rmse:1.96626
[9]	validation_0-rmse:1.96401
[10]	validation_0-rmse:1.96243
[11]	validation_0-rmse:1.96306
[12]	validation_0-rmse:1.96534
[13]	validation_0-rmse:1.96701




[14]	validation_0-rmse:1.96870
[15]	validation_0-rmse:1.97047
[16]	validation_0-rmse:1.97411
[17]	validation_0-rmse:1.97819
[18]	validation_0-rmse:1.98174
[19]	validation_0-rmse:1.98532
Validation MAE: 1.5638613
Test MAE: 1.5827963


In [14]:
!pip install ace_tools



In [15]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

# List of regressors to try
regressors = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

# Slice the continuous features using the selected indices
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Dictionary to store MAE results
results = {}

# Iterate through each model
for name, model in regressors.items():
    print(f"Training {name}...")
    model.fit(X_train, train_labels)
    
    # Validation Predictions
    val_preds = model.predict(X_val)
    val_mae = mean_absolute_error(val_labels, val_preds)
    
    # Test Predictions
    test_preds = model.predict(X_test)
    test_mae = mean_absolute_error(test_labels, test_preds)
    
    results[name] = {"Validation MAE": val_mae, "Test MAE": test_mae}
    print(f"{name} - Validation MAE: {val_mae:.4f} | Test MAE: {test_mae:.4f}")

# Display results
import pandas as pd
results_df = pd.DataFrame(results).T
import ace_tools as tools; tools.display_dataframe_to_user(name="Regressor Model Performance", dataframe=results_df)


Training RandomForest...
RandomForest - Validation MAE: 1.7732 | Test MAE: 1.6718
Training GradientBoosting...
GradientBoosting - Validation MAE: 1.7722 | Test MAE: 1.6678
Training AdaBoost...
AdaBoost - Validation MAE: 1.8088 | Test MAE: 1.6810
Training LinearRegression...
LinearRegression - Validation MAE: 1.8534 | Test MAE: 1.6978
Training Ridge...
Ridge - Validation MAE: 1.8254 | Test MAE: 1.6672
Training Lasso...
Lasso - Validation MAE: 1.6306 | Test MAE: 1.6607
Training SVR...
SVR - Validation MAE: 1.7506 | Test MAE: 1.6390
Training DecisionTree...
DecisionTree - Validation MAE: 1.8841 | Test MAE: 1.7698
Training KNN...
KNN - Validation MAE: 1.7714 | Test MAE: 1.6162
Training XGBoost...
XGBoost - Validation MAE: 1.7310 | Test MAE: 1.7306
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8484
[LightGBM] [Info] Number of d

ModuleNotFoundError: No module named 'ace_tools'