In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/palm-all/Palm_Features.xlsx
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709808780105.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709795126494.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709809527125.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709619358067.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709638101111.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709703559788.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709884321143.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709886794732.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709790324469.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709977033943.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709969146179.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709711010517.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709887836179.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1709810564770.jpg
/kaggle/input/palm-all/Palm Masked/Palm Masked/1

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
# Read Excel file and select needed columns
df = pd.read_excel('/kaggle/input/palm-all/Palm_Features.xlsx')
df = df[['Image_Name', 'Hb Value']]

In [4]:
# Split data into train (approx. 70%), validation (approx. 15%), and test (15%)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.176, random_state=42)

In [5]:
# Function to load and preprocess images
def load_image_and_label(image_file, label):
    image = tf.io.read_file(image_file)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])  # Resize for ResNet50
    image = image / 255.0  # Normalize to [0, 1]
    return image, label

# Function to create TensorFlow datasets
def create_tf_dataset(df, images_folder, batch_size=16, shuffle=True):
    image_paths = [os.path.join(images_folder, img) for img in df['Image_Name'].values]
    labels = df['Hb Value'].values.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    dataset = dataset.map(load_image_and_label, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Define your image folder path and create datasets
images_folder = "/kaggle/input/palm-all/Palm Masked/Palm Masked"
train_ds = create_tf_dataset(train_df, images_folder, batch_size=16, shuffle=True)
val_ds   = create_tf_dataset(val_df,   images_folder, batch_size=16, shuffle=False)
test_ds  = create_tf_dataset(test_df,  images_folder, batch_size=16, shuffle=False)

In [6]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load ResNet50 base (without top) with pretrained ImageNet weights
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model layers initially
for layer in base_model.layers:
    layer.trainable = False

# Build the regression head on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu', name='feature_dense')(x)  # Named layer for later feature extraction
x = Dropout(0.5)(x)
predictions = Dense(1, activation='linear')(x)  # Linear activation for regression

# Construct the full model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss='mean_absolute_error', metrics=['mae'])

# Train the new head (freeze base model)
history = model.fit(train_ds, validation_data=val_ds, epochs=50)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 504ms/step - loss: 9.6106 - mae: 9.6106 - val_loss: 3.7207 - val_mae: 3.7207
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 2.4221 - mae: 2.4221 - val_loss: 1.7293 - val_mae: 1.7293
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 1.9160 - mae: 1.9160 - val_loss: 1.4259 - val_mae: 1.4259
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 1.6805 - mae: 1.6805 - val_loss: 1.4295 - val_mae: 1.4295
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 1.7990 - mae: 1.7990 - val_loss: 1.4277 - val_mae: 1.4277
Epoch 6/50
[1m

In [7]:
# Unfreeze the last 20 layers of the base model for fine tuning
for layer in base_model.layers[-20:]:
    layer.trainable = True

# Re-compile the model with a lower learning rate for fine tuning
model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_absolute_error', metrics=['mae'])

# Fine tune the model further
history_ft = model.fit(train_ds, validation_data=val_ds, epochs=100)

# Optionally, evaluate the fine tuned model on the test dataset
test_mae = model.evaluate(test_ds)
print("Test MAE after fine tuning:", test_mae)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 531ms/step - loss: 3.3182 - mae: 3.3182 - val_loss: 1.5379 - val_mae: 1.5379
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 1.4969 - mae: 1.4969 - val_loss: 1.4393 - val_mae: 1.4393
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 1.5687 - mae: 1.5687 - val_loss: 1.4524 - val_mae: 1.4524
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 1.5048 - mae: 1.5048 - val_loss: 1.8566 - val_mae: 1.8566
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 1.4276 - mae: 1.4276 - val_loss: 2.6911 - val_mae: 2.6911
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 1.2763 - mae: 1.2763 - val_loss: 3.5555 - val_mae: 3.5555
Epoch 7/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step 

In [8]:
# Create a feature extractor model that outputs the activations of the "feature_dense" layer
feature_extractor = Model(inputs=model.input, outputs=model.get_layer("feature_dense").output)

# Define a helper function to extract features from a TF dataset
def extract_features(dataset, extractor_model):
    features_list = []
    labels_list = []
    for batch_images, batch_labels in dataset:
        features = extractor_model.predict(batch_images)
        features_list.append(features)
        labels_list.append(batch_labels.numpy())
    features_array = np.concatenate(features_list, axis=0)
    labels_array = np.concatenate(labels_list, axis=0)
    return features_array, labels_array

# Extract features and labels from each dataset
train_features, train_labels = extract_features(train_ds, feature_extractor)
val_features, val_labels     = extract_features(val_ds, feature_extractor)
test_features, test_labels   = extract_features(test_ds, feature_extractor)

print("Train features shape:", train_features.shape)
print("Validation features shape:", val_features.shape)
print("Test features shape:", test_features.shape)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms

In [9]:
!pip install pymrmr

Collecting pymrmr
  Downloading pymrmr-0.1.11.tar.gz (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.5/69.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pymrmr
  Building wheel for pymrmr (setup.py) ... [?25l[?25hdone
  Created wheel for pymrmr: filename=pymrmr-0.1.11-cp310-cp310-linux_x86_64.whl size=390769 sha256=76a10bee92bfe443a786c7d7871d1a9de7430946609b4c60b3ef706f6d19c1b8
  Stored in directory: /root/.cache/pip/wheels/46/ae/55/4a2479c5f0de7eb363fe970cb18e4a750e03e4e63b1b5c2005
Successfully built pymrmr
Installing collected packages: pymrmr
Successfully installed pymrmr-0.1.11


In [10]:
import pandas as pd
import numpy as np
import pymrmr

# Assume 'train_features' and 'train_labels' are from the feature extractor
# Create a DataFrame with columns f0, f1, ..., and the target "Hb Value"
train_feat_df = pd.DataFrame(train_features, columns=[f"f{i}" for i in range(train_features.shape[1])])
train_feat_df['Hb Value'] = train_labels

# Discretize each feature column into 10 quantile bins.
for col in train_feat_df.columns[:-1]:  # Exclude the target column
    # qcut will bin the continuous values into 10 categories.
    train_feat_df[col] = pd.qcut(train_feat_df[col], q=10, duplicates='drop').cat.codes

# Discretize the target as well.
train_feat_df['Hb Value'] = pd.qcut(train_feat_df['Hb Value'], q=10, duplicates='drop').cat.codes


In [11]:
# Define the number of top features you wish to select
num_top_features = 100

# Run mRMR to select features
selected_features = pymrmr.mRMR(train_feat_df, 'MIQ', num_top_features)
print("Selected features:", selected_features)

# Convert feature names (e.g., "f23") to column indices
selected_indices = []

for feat in selected_features:
    # Ensure the feature name starts with 'f' and is followed by digits
    if feat.startswith('f') and feat[1:].isdigit():
        selected_indices.append(int(feat[1:]))
    else:
        print(f"Skipping non-standard feature: {feat}")

print("Selected feature indices:", selected_indices)


Selected features: ['f1019', 'f432', 'f469', 'f287', 'f856', 'f928', 'f774', 'f181', 'f325', 'f891', 'f523', 'f803', 'f591', 'f529', 'f851', 'f274', 'f350', 'f632', 'f415', 'f70', 'f598', 'f347', 'f697', 'f109', 'f294', 'f526', 'f814', 'f346', 'f291', 'f933', 'f88', 'f288', 'f980', 'f258', 'f369', 'f463', 'f284', 'f134', 'f372', 'f755', 'f236', 'f866', 'f655', 'f776', 'f551', 'f301', 'f815', 'f454', 'f1008', 'f1007', 'f61', 'f903', 'f43', 'f1013', 'f731', 'f841', 'f462', 'f419', 'f337', 'f977', 'f705', 'f253', 'f379', 'f837', 'f978', 'f939', 'f378', 'f336', 'f563', 'f72', 'f94', 'f116', 'f669', 'f459', 'f890', 'f859', 'f63', 'f657', 'f656', 'f635', 'f677', 'f196', 'f581', 'f138', 'f53', 'f354', 'f200', 'f622', 'f854', 'f217', 'f773', 'f796', 'f173', 'f11', 'f542', 'f45', 'f424', 'f690', 'f548', 'f627']
Selected feature indices: [1019, 432, 469, 287, 856, 928, 774, 181, 325, 891, 523, 803, 591, 529, 851, 274, 350, 632, 415, 70, 598, 347, 697, 109, 294, 526, 814, 346, 291, 933, 88, 288, 

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Slice the continuous features using the selected indices
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Train the regression model on the training set
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, train_labels)

# Predict on the validation set
val_preds = regressor.predict(X_val)
val_mae = mean_absolute_error(val_labels, val_preds)
print("Validation MAE:", val_mae)

# Predict on the test set
test_preds = regressor.predict(X_test)
test_mae = mean_absolute_error(test_labels, test_preds)
print("Test MAE:", test_mae)


Validation MAE: 1.4889365054312205
Test MAE: 1.6774762352686081


In [13]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Slice the continuous features using the selected indices from mRMR
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Initialize XGBoost regressor with typical hyperparameters
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',  # For regression tasks
    n_estimators=200,               # Number of trees
    learning_rate=0.05,             # Step size shrinkage
    max_depth=6,                    # Maximum depth of a tree
    random_state=42
)

# Train the XGBoost model, using the validation set for early stopping
xgb_regressor.fit(
    X_train, 
    train_labels, 
    eval_set=[(X_val, val_labels)],
    early_stopping_rounds=10,
    verbose=True
)

# Make predictions on the validation and test sets
val_preds = xgb_regressor.predict(X_val)
test_preds = xgb_regressor.predict(X_test)

# Calculate Mean Absolute Error (MAE) for validation and test sets
val_mae = mean_absolute_error(val_labels, val_preds)
test_mae = mean_absolute_error(test_labels, test_preds)

print("Validation MAE:", val_mae)
print("Test MAE:", test_mae)


[0]	validation_0-rmse:1.72331
[1]	validation_0-rmse:1.71413
[2]	validation_0-rmse:1.71001
[3]	validation_0-rmse:1.70452
[4]	validation_0-rmse:1.70244
[5]	validation_0-rmse:1.70053
[6]	validation_0-rmse:1.70366
[7]	validation_0-rmse:1.70115
[8]	validation_0-rmse:1.70066
[9]	validation_0-rmse:1.69961
[10]	validation_0-rmse:1.70113
[11]	validation_0-rmse:1.70257
[12]	validation_0-rmse:1.70084
[13]	validation_0-rmse:1.70491
[14]	validation_0-rmse:1.70815




[15]	validation_0-rmse:1.71002
[16]	validation_0-rmse:1.71146
[17]	validation_0-rmse:1.71643
[18]	validation_0-rmse:1.72010
Validation MAE: 1.4485395
Test MAE: 1.6692183


In [14]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [15]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

# List of regressors to try
regressors = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

# Slice the continuous features using the selected indices
X_train = train_features[:, selected_indices]
X_val   = val_features[:, selected_indices]
X_test  = test_features[:, selected_indices]

# Dictionary to store MAE results
results = {}

# Iterate through each model
for name, model in regressors.items():
    print(f"Training {name}...")
    model.fit(X_train, train_labels)
    
    # Validation Predictions
    val_preds = model.predict(X_val)
    val_mae = mean_absolute_error(val_labels, val_preds)
    
    # Test Predictions
    test_preds = model.predict(X_test)
    test_mae = mean_absolute_error(test_labels, test_preds)
    
    results[name] = {"Validation MAE": val_mae, "Test MAE": test_mae}
    print(f"{name} - Validation MAE: {val_mae:.4f} | Test MAE: {test_mae:.4f}")

# Display results
import pandas as pd
results_df = pd.DataFrame(results).T
import ace_tools as tools; tools.display_dataframe_to_user(name="Regressor Model Performance", dataframe=results_df)


Training RandomForest...
RandomForest - Validation MAE: 1.4889 | Test MAE: 1.6775
Training GradientBoosting...
GradientBoosting - Validation MAE: 1.5165 | Test MAE: 1.6581
Training AdaBoost...
AdaBoost - Validation MAE: 1.4912 | Test MAE: 1.7139
Training LinearRegression...
LinearRegression - Validation MAE: 1.5522 | Test MAE: 1.6106
Training Ridge...
Ridge - Validation MAE: 1.5024 | Test MAE: 1.6202
Training Lasso...
Lasso - Validation MAE: 1.4666 | Test MAE: 1.7984
Training SVR...
SVR - Validation MAE: 1.5038 | Test MAE: 1.6224
Training DecisionTree...
DecisionTree - Validation MAE: 1.5397 | Test MAE: 1.7889
Training KNN...
KNN - Validation MAE: 1.5368 | Test MAE: 1.6184
Training XGBoost...
XGBoost - Validation MAE: 1.4881 | Test MAE: 1.7074
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8201
[LightGBM] [Info] Number of d

ModuleNotFoundError: No module named 'ace_tools'