In [17]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load feature CSV and label CSV
features_df = pd.read_csv("face_features_40000_clean.csv")  # Replace with your file name
labels_df = pd.read_csv("/media/kathir/Apps and Games/prml project/illinois_doc_dataset/label.csv", sep = ';')      # Replace with your file name
labels_df['BMI'] = (labels_df['weight'] / labels_df['height']**2) * 703
labels_df= labels_df[['id', 'BMI']]
labels_df['id'] = labels_df['id'] + ".jpg"
# Merge on 'id' column
data = pd.merge(features_df, labels_df, on="id")

# Separate features and target
X = data.drop(columns=["id", "BMI"])  # Drop ID and target columns from features
y = data["BMI"]                      # Target variable

features_df.head()

Unnamed: 0.1,Unnamed: 0,id,front_feature_0,front_feature_1,front_feature_2,front_feature_3,front_feature_4,front_feature_5,front_feature_6,front_feature_7,...,side_feature_502,side_feature_503,side_feature_504,side_feature_505,side_feature_506,side_feature_507,side_feature_508,side_feature_509,side_feature_510,side_feature_511
0,0,A00147.jpg,0.02758,0.073103,0.034759,-0.012455,0.006706,0.103218,-0.028415,-0.027455,...,-0.005314,0.061346,-0.022165,-0.085434,-0.064719,-0.010489,0.067716,-0.039384,-0.080477,0.054623
1,1,A00360.jpg,0.048539,-0.075715,0.040924,0.047758,-0.059584,0.080212,0.013373,-0.003698,...,0.011348,-0.015338,-0.008936,0.056286,0.04114,0.092563,0.028029,-0.02518,-0.07203,0.011378
2,2,A00367.jpg,0.004419,0.022524,0.009931,-7.8e-05,0.028975,-0.013031,0.067987,-0.045147,...,0.019864,0.100233,-0.042817,0.012824,-0.039492,0.036514,0.010991,-0.079349,-0.033673,0.023258
3,3,A01072.jpg,0.057595,0.013416,-0.036035,0.038022,0.000146,0.000112,-0.008118,0.000618,...,-0.021967,-0.01182,0.006411,-0.00209,-0.026827,0.07794,-0.038535,0.007477,0.042835,-0.032234
4,4,A01077.jpg,-0.018371,0.024289,-0.093164,-0.026326,0.006232,-0.022597,0.041738,-0.012751,...,-0.001163,0.079152,-0.053166,0.063613,-0.012252,0.05064,-0.011309,-0.039858,0.022977,-0.025185


In [18]:
labels_df.head()

Unnamed: 0,id,BMI
0,A00147.jpg,28.971931
1,A00220.jpg,20.447551
2,A00360.jpg,24.658895
3,A00367.jpg,33.224344
4,A01054.jpg,25.996436


In [19]:
# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Apply PCA
n_components = 100  # Adjust to retain sufficient variance
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Explained variance by {n_components} components: {sum(pca.explained_variance_ratio_):.2f}")

# Step 4: Train XGBoost Regressor
xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42
)

# Cross-validation to estimate performance
cv_scores = cross_val_score(xgb_model, X_train_pca, y_train, cv=5, scoring='r2')
print(f"Cross-validated R^2 score: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

# Train on full training set
xgb_model.fit(X_train_pca, y_train)

# Step 5: Evaluate on the test set
y_pred = xgb_model.predict(X_test_pca)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Test R^2 score: {r2:.2f}")
print(f"Test RMSE: {rmse:.2f}")


Explained variance by 100 components: 1.00
Cross-validated R^2 score: 0.24 ± 0.01
Test R^2 score: 0.26
Test RMSE: 4.39
