In [2]:
!pip install pandas numpy scikit-learn transformers lightgbm codecarbon

Collecting codecarbon
  Downloading codecarbon-2.8.3-py3-none-any.whl.metadata (8.7 kB)
Collecting arrow (from codecarbon)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting types-python-dateutil>=2.8.10 (from arrow->codecarbon)
  Downloading types_python_dateutil-2.9.0.20241206-py3-none-any.whl.metadata (2.1 kB)
Collecting httpx<0.28.0,>=0.21.3 (from fief-client[cli]->codecarbon)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jwcrypto<2.0.0,>=1.4 (from fief-client[cli]->codecarbon)
  Downloading jwcrypto-1.5.6-py3-none-any.whl.metadata (3.1 kB)
Collecting yaspin (from fief-clie

### **Logistic Regression + LightGBM 2 - Best Model**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import VotingClassifier

# Fetch the NLTK corpora that the text-cleaning step relies on.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Keep the cell output readable: quiet CodeCarbon's logger and hide FutureWarnings.
logging.getLogger("codecarbon").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=FutureWarning)

# Remove a CodeCarbon lock file at this path, if one is present, before tracking.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start the emissions tracker; only error-level tracker messages are shown.
tracker = EmissionsTracker(allow_multiple_runs=True, log_level="error")
tracker.start()

# Read the labelled dataset from the Excel workbook.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Shared resources used by clean_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw text value for TF-IDF vectorization.

    The value is lowercased, every character other than an ASCII letter or
    whitespace is dropped, whitespace runs are collapsed to single spaces,
    English stopwords are removed and the remaining words are lemmatized.

    Args:
        text: Raw cell value. Missing values (None/NaN) yield an empty
            string; other non-string values are converted with str().

    Returns:
        The cleaned, space-separated string.
    """
    # pd.read_excel represents empty cells as NaN (a float), which has no
    # .lower(); map missing values to "" instead of raising AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    # NOTE(review): punctuation (including apostrophes) is already gone here, so a
    # contraction such as "don't" is compared as "dont" - confirm this is intended.
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Clean every text entry; the cleaned text replaces the raw column.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text.
X = df.loc[:, "text_raw"]

# Targets: the columns at positions 1 through 33, selected by position and
# assumed to hold the binary attributes.
binary_columns = df.columns[slice(1, 34)]
y = df.loc[:, binary_columns]

# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# TF-IDF over word unigrams and bigrams: no analyzer is passed, so the
# vectorizer's default word analyzer applies and ngram_range counts words,
# not characters. The vocabulary is capped at 1000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one soft-voting ensemble (Logistic Regression + LightGBM) per binary attribute
results_ensemble = []  # rows of [attribute, accuracy, f1, recall, precision]
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training Ensemble (Logistic Regression + LightGBM) for attribute: {col}")

    y_train_col = y_train[col]

    if y_train_col.nunique() < 2:
        # Only one class in the training split: no classifier can be fitted, and
        # when that class is 0 the positive-class weight below would divide by
        # zero. Predict the single observed class instead of aborting the run.
        only_class = y_train_col.iloc[0]
        print(f"  Skipping fit for {col}: training data contains only class {only_class}")
        y_pred = np.full(len(y_test), only_class)
    else:
        # Up-weight the positive class by (training rows / positive rows)
        class_weights = {0: 1, 1: len(y_train_col) / y_train_col.sum()}

        # Define Logistic Regression model
        lr_model = LogisticRegression(
            class_weight="balanced",  # Handle imbalanced classes
            max_iter=500,           # Reduced iterations
            n_jobs=-1,               # Use all cores
            random_state=42
        )

        # Define LightGBM model
        lgb_model = lgb.LGBMClassifier(
            n_estimators=100,  # Reduced number of trees
            learning_rate=0.1,  # Slightly higher learning rate for faster convergence
            max_depth=3,        # Reduced depth to prevent overfitting
            random_state=42,
            n_jobs=-1,          # Use all available CPU cores
            class_weight=class_weights,  # Handle class imbalance
            verbosity=-1,  # Suppress LightGBM warnings
            subsample=0.8,  # Row subsampling fraction
            subsample_freq=1,  # Required: LightGBM ignores subsample while this is 0 (the default)
            colsample_bytree=0.8  # Feature subsampling to reduce computation
        )

        # Create Voting Classifier (ensemble of Logistic Regression and LightGBM)
        ensemble_model = VotingClassifier(
            estimators=[
                ('lr', lr_model),
                ('lgb', lgb_model)
            ],
            voting='soft',  # Use soft voting for probabilistic predictions
            n_jobs=-1       # Use all cores
        )

        # Train the ensemble model
        ensemble_model.fit(X_train_tfidf, y_train_col)

        # Predict on test set
        y_pred = ensemble_model.predict(X_test_tfidf)

    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics; zero_division=0 returns the same 0 score as the default
    # for attributes with no predicted/true positives, without the warning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_ensemble.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute scores collected by the training loop.
metric_columns = ["Attribute", "Accuracy", "F1", "Recall", "Precision"]
results_ensemble_df = pd.DataFrame(results_ensemble, columns=metric_columns)

# Column means over all attributes, taken before the summary row is appended.
avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble = (
    results_ensemble_df[metric].mean() for metric in metric_columns[1:]
)

# Append the means as an extra row labelled "Average".
results_ensemble_df.loc["Average"] = [
    "Average",
    avg_accuracy_ensemble,
    avg_f1_ensemble,
    avg_recall_ensemble,
    avg_precision_ensemble,
]

# Show the table.
print("Ensemble (Logistic Regression + LightGBM) Results:")
print(results_ensemble_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes).
y_pred_all = np.transpose(np.array(y_pred_all))

# Multi-label report as a dict, which includes the aggregate averages.
classification_report_result = classification_report(
    y_test, y_pred_all, output_dict=True, target_names=binary_columns
)

# Pull out the four aggregate entries.
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print precision / recall / F1 for each aggregate.
for heading, avg in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {avg['precision']:.2f}, Recall: {avg['recall']:.2f}, F1: {avg['f1-score']:.2f}")

# Stop the tracker; fall back to 0.0 when it returns no measurement.
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Ensemble (Logistic Regression + LightGBM) for attribute: dark_pigmentation
Training Ensemble (Logistic Regression + LightGBM) for attribute: acne
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye_contour
Training Ensemble (Logistic Regression + LightGBM) for attribute: homogeneity
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_firmness
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_radiance
Training Ensemble (Logistic Regression + LightGBM) for attribute: pores
Training Ensemble (Logistic Regression + LightGBM) for attribute: fine_lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: wrinkles_fine-lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye-wrinkles
Training Ensemble (Logistic Regression + LightGBM) for attribute: undereye-bags
Training Ensemble (Logistic Regression + LightGBM) for attribute: generic
Training Ensemble (Logistic Regression + LightGBM) for attrib

### **LightGBM**

In [None]:
!pip install pandas numpy scikit-learn transformers lightgbm codecarbon
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings

# Keep the cell output readable: quiet CodeCarbon's logger and hide FutureWarnings.
logging.getLogger("codecarbon").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=FutureWarning)

# Remove a CodeCarbon lock file at this path, if one is present, before tracking.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start the emissions tracker; only error-level tracker messages are shown.
tracker = EmissionsTracker(allow_multiple_runs=True, log_level="error")
tracker.start()

# Read the labelled dataset from the Excel workbook.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Clean the 'text_raw' column
def clean_text(text):
    """Normalize one raw text value for TF-IDF vectorization.

    The value is lowercased, every character other than an ASCII letter or
    whitespace is dropped, and whitespace runs are collapsed to single spaces.

    Args:
        text: Raw cell value. Missing values (None/NaN) yield an empty
            string; other non-string values are converted with str().

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # pd.read_excel represents empty cells as NaN (a float), which has no
    # .lower(); map missing values to "" instead of raising AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # Drop digits, punctuation and any other non-letter characters.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Collapse whitespace runs and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean every text entry; the cleaned text replaces the raw column.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text.
X = df.loc[:, "text_raw"]

# Targets: the columns at positions 1 through 33, selected by position and
# assumed to hold the binary attributes.
binary_columns = df.columns[slice(1, 34)]
y = df.loc[:, binary_columns]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF features with the vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train LightGBM model for each binary attribute with optimized hyperparameters
results_lgbm = []  # rows of [attribute, accuracy, f1, recall, precision]
y_pred_all = []  # To store predictions for all attributes

# Share of the training rows held out as the early-stopping validation set.
VAL_FRACTION = 0.1

for col in binary_columns:
    print(f"Training LightGBM for attribute: {col}")

    y_train_col = y_train[col]
    class_counts = y_train_col.value_counts()

    if len(class_counts) < 2:
        # Only one class in the training split: no classifier can be fitted, and
        # when that class is 0 the positive-class weight below would divide by
        # zero. Predict the single observed class instead of aborting the run.
        only_class = y_train_col.iloc[0]
        print(f"  Skipping fit for {col}: training data contains only class {only_class}")
        y_pred = np.full(len(y_test), only_class)
    else:
        # Up-weight the positive class by (training rows / positive rows)
        class_weights = {0: 1, 1: len(y_train_col) / y_train_col.sum()}

        # Train LightGBM model with optimized hyperparameters
        model = lgb.LGBMClassifier(
            n_estimators=150,  # Reduced number of trees
            learning_rate=0.1,  # Slightly higher learning rate for faster convergence
            max_depth=5,        # Reduced depth to prevent overfitting
            random_state=42,
            n_jobs=-1,          # Use all available CPU cores
            class_weight=class_weights,  # Handle class imbalance
            verbosity=-1,  # Suppress LightGBM warnings
            subsample=0.8,  # Row subsampling fraction
            subsample_freq=1,  # Required: LightGBM ignores subsample while this is 0 (the default)
            colsample_bytree=0.8  # Feature subsampling to reduce computation
        )

        if class_counts.min() >= 2:
            # Early stopping must not look at the test set: choosing the number
            # of trees on the rows that are scored afterwards inflates the
            # reported metrics. Use a stratified slice of the training data.
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_train_tfidf,
                y_train_col,
                test_size=VAL_FRACTION,
                random_state=42,
                stratify=y_train_col,
            )

            # Use early stopping with callbacks
            callbacks = [
                lgb.early_stopping(stopping_rounds=10, verbose=False),  # Early stopping
                lgb.log_evaluation(period=0)  # Disable logging
            ]

            model.fit(
                X_fit,
                y_fit,
                eval_set=[(X_val, y_val)],
                callbacks=callbacks
            )
        else:
            # A class with a single training row cannot be stratified; train on
            # all training rows without early stopping.
            model.fit(X_train_tfidf, y_train_col)

        # Predict on test set
        y_pred = model.predict(X_test_tfidf)

    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics; zero_division=0 returns the same 0 score as the default
    # for attributes with no predicted/true positives, without the warning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_lgbm.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute scores collected by the training loop.
metric_columns = ["Attribute", "Accuracy", "F1", "Recall", "Precision"]
results_lgbm_df = pd.DataFrame(results_lgbm, columns=metric_columns)

# Column means over all attributes, taken before the summary row is appended.
avg_accuracy_lgbm, avg_f1_lgbm, avg_recall_lgbm, avg_precision_lgbm = (
    results_lgbm_df[metric].mean() for metric in metric_columns[1:]
)

# Append the means as an extra row labelled "Average".
results_lgbm_df.loc["Average"] = [
    "Average",
    avg_accuracy_lgbm,
    avg_f1_lgbm,
    avg_recall_lgbm,
    avg_precision_lgbm,
]

# Show the table.
print("LightGBM Results:")
print(results_lgbm_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes).
y_pred_all = np.transpose(np.array(y_pred_all))

# Multi-label report as a dict, which includes the aggregate averages.
classification_report_result = classification_report(
    y_test, y_pred_all, output_dict=True, target_names=binary_columns
)

# Pull out the four aggregate entries.
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print precision / recall / F1 for each aggregate.
for heading, avg in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {avg['precision']:.2f}, Recall: {avg['recall']:.2f}, F1: {avg['f1-score']:.2f}")

# Stop the tracker; fall back to 0.0 when it returns no measurement.
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

Collecting codecarbon
  Downloading codecarbon-2.8.3-py3-none-any.whl.metadata (8.7 kB)
Collecting arrow (from codecarbon)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting types-python-dateutil>=2.8.10 (from arrow->codecarbon)
  Downloading types_python_dateutil-2.9.0.20241206-py3-none-any.whl.metadata (2.1 kB)
Collecting httpx<0.28.0,>=0.21.3 (from fief-client[cli]->codecarbon)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jwcrypto<2.0.0,>=1.4 (from fief-client[cli]->codecarbon)
  Downloading jwcrypto-1.5.6-py3-none-any.whl.metadata (3.1 kB)
Collecting yaspin (from fief-clie



Training LightGBM for attribute: dark_pigmentation
Training LightGBM for attribute: acne
Training LightGBM for attribute: eye_contour
Training LightGBM for attribute: homogeneity
Training LightGBM for attribute: lack_firmness
Training LightGBM for attribute: lack_radiance
Training LightGBM for attribute: pores
Training LightGBM for attribute: fine_lines
Training LightGBM for attribute: wrinkles_fine-lines
Training LightGBM for attribute: eye-wrinkles
Training LightGBM for attribute: undereye-bags
Training LightGBM for attribute: generic
Training LightGBM for attribute: 18-34
Training LightGBM for attribute: 35-54
Training LightGBM for attribute: 55-99
Training LightGBM for attribute: dry
Training LightGBM for attribute: normal
Training LightGBM for attribute: oily
Training LightGBM for attribute: combination
Training LightGBM for attribute: sensitivity-high
Training LightGBM for attribute: sensitivity-low
Training LightGBM for attribute: no_sensitivity
Training LightGBM for attribute: 

### **Logistic Regression + LightGBM 1**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import VotingClassifier

# Fetch the NLTK corpora that the text-cleaning step relies on.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Keep the cell output readable: quiet CodeCarbon's logger and hide FutureWarnings.
logging.getLogger("codecarbon").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=FutureWarning)

# Remove a CodeCarbon lock file at this path, if one is present, before tracking.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start the emissions tracker; only error-level tracker messages are shown.
tracker = EmissionsTracker(allow_multiple_runs=True, log_level="error")
tracker.start()

# Read the labelled dataset from the Excel workbook.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Shared resources used by clean_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw text value for TF-IDF vectorization.

    The value is lowercased, every character other than an ASCII letter or
    whitespace is dropped, whitespace runs are collapsed to single spaces,
    English stopwords are removed and the remaining words are lemmatized.

    Args:
        text: Raw cell value. Missing values (None/NaN) yield an empty
            string; other non-string values are converted with str().

    Returns:
        The cleaned, space-separated string.
    """
    # pd.read_excel represents empty cells as NaN (a float), which has no
    # .lower(); map missing values to "" instead of raising AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    # NOTE(review): punctuation (including apostrophes) is already gone here, so a
    # contraction such as "don't" is compared as "dont" - confirm this is intended.
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Clean every text entry; the cleaned text replaces the raw column.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text.
X = df.loc[:, "text_raw"]

# Targets: the columns at positions 1 through 33, selected by position and
# assumed to hold the binary attributes.
binary_columns = df.columns[slice(1, 34)]
y = df.loc[:, binary_columns]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF over word unigrams and bigrams: no analyzer is passed, so the
# vectorizer's default word analyzer applies and ngram_range counts words,
# not characters. The vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one soft-voting ensemble (Logistic Regression + LightGBM) per binary attribute
results_ensemble = []  # rows of [attribute, accuracy, f1, recall, precision]
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training Ensemble (Logistic Regression + LightGBM) for attribute: {col}")

    y_train_col = y_train[col]

    if y_train_col.nunique() < 2:
        # Only one class in the training split: no classifier can be fitted, and
        # when that class is 0 the positive-class weight below would divide by
        # zero. Predict the single observed class instead of aborting the run.
        only_class = y_train_col.iloc[0]
        print(f"  Skipping fit for {col}: training data contains only class {only_class}")
        y_pred = np.full(len(y_test), only_class)
    else:
        # Up-weight the positive class by (training rows / positive rows)
        class_weights = {0: 1, 1: len(y_train_col) / y_train_col.sum()}

        # Define Logistic Regression model
        lr_model = LogisticRegression(
            class_weight="balanced",  # Handle imbalanced classes
            max_iter=1000,           # Increase iterations for convergence
            n_jobs=-1,               # Use all cores
            random_state=42
        )

        # Define LightGBM model
        lgb_model = lgb.LGBMClassifier(
            n_estimators=150,  # Reduced number of trees
            learning_rate=0.1,  # Slightly higher learning rate for faster convergence
            max_depth=5,        # Reduced depth to prevent overfitting
            random_state=42,
            n_jobs=-1,          # Use all available CPU cores
            class_weight=class_weights,  # Handle class imbalance
            verbosity=-1,  # Suppress LightGBM warnings
            subsample=0.8,  # Row subsampling fraction
            subsample_freq=1,  # Required: LightGBM ignores subsample while this is 0 (the default)
            colsample_bytree=0.8  # Feature subsampling to reduce computation
        )

        # Create Voting Classifier (ensemble of Logistic Regression and LightGBM)
        ensemble_model = VotingClassifier(
            estimators=[
                ('lr', lr_model),
                ('lgb', lgb_model)
            ],
            voting='soft',  # Use soft voting for probabilistic predictions
            n_jobs=-1       # Use all cores
        )

        # Train the ensemble model
        ensemble_model.fit(X_train_tfidf, y_train_col)

        # Predict on test set
        y_pred = ensemble_model.predict(X_test_tfidf)

    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics; zero_division=0 returns the same 0 score as the default
    # for attributes with no predicted/true positives, without the warning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_ensemble.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute scores collected by the training loop.
metric_columns = ["Attribute", "Accuracy", "F1", "Recall", "Precision"]
results_ensemble_df = pd.DataFrame(results_ensemble, columns=metric_columns)

# Column means over all attributes, taken before the summary row is appended.
avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble = (
    results_ensemble_df[metric].mean() for metric in metric_columns[1:]
)

# Append the means as an extra row labelled "Average".
results_ensemble_df.loc["Average"] = [
    "Average",
    avg_accuracy_ensemble,
    avg_f1_ensemble,
    avg_recall_ensemble,
    avg_precision_ensemble,
]

# Show the table.
print("Ensemble (Logistic Regression + LightGBM) Results:")
print(results_ensemble_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes).
y_pred_all = np.transpose(np.array(y_pred_all))

# Multi-label report as a dict, which includes the aggregate averages.
classification_report_result = classification_report(
    y_test, y_pred_all, output_dict=True, target_names=binary_columns
)

# Pull out the four aggregate entries.
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print precision / recall / F1 for each aggregate.
for heading, avg in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {avg['precision']:.2f}, Recall: {avg['recall']:.2f}, F1: {avg['f1-score']:.2f}")

# Stop the tracker; fall back to 0.0 when it returns no measurement.
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Ensemble (Logistic Regression + LightGBM) for attribute: dark_pigmentation
Training Ensemble (Logistic Regression + LightGBM) for attribute: acne
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye_contour
Training Ensemble (Logistic Regression + LightGBM) for attribute: homogeneity
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_firmness
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_radiance
Training Ensemble (Logistic Regression + LightGBM) for attribute: pores
Training Ensemble (Logistic Regression + LightGBM) for attribute: fine_lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: wrinkles_fine-lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye-wrinkles
Training Ensemble (Logistic Regression + LightGBM) for attribute: undereye-bags
Training Ensemble (Logistic Regression + LightGBM) for attribute: generic
Training Ensemble (Logistic Regression + LightGBM) for attrib

### **Logistic Regression + LightGBM + Random Forest + XGBoost**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Fetch the NLTK corpora that the text-cleaning step relies on.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Keep the cell output readable: quiet CodeCarbon's logger and hide FutureWarnings.
logging.getLogger("codecarbon").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=FutureWarning)

# Remove a CodeCarbon lock file at this path, if one is present, before tracking.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start the emissions tracker; only error-level tracker messages are shown.
tracker = EmissionsTracker(allow_multiple_runs=True, log_level="error")
tracker.start()

# Read the labelled dataset from the Excel workbook.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Shared resources used by clean_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw text value for TF-IDF vectorization.

    The value is lowercased, every character other than an ASCII letter or
    whitespace is dropped, whitespace runs are collapsed to single spaces,
    English stopwords are removed and the remaining words are lemmatized.

    Args:
        text: Raw cell value. Missing values (None/NaN) yield an empty
            string; other non-string values are converted with str().

    Returns:
        The cleaned, space-separated string.
    """
    # pd.read_excel represents empty cells as NaN (a float), which has no
    # .lower(); map missing values to "" instead of raising AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    # NOTE(review): punctuation (including apostrophes) is already gone here, so a
    # contraction such as "don't" is compared as "dont" - confirm this is intended.
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Clean every text entry; the cleaned text replaces the raw column.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text.
X = df.loc[:, "text_raw"]

# Targets: the columns at positions 1 through 33, selected by position and
# assumed to hold the binary attributes.
binary_columns = df.columns[slice(1, 34)]
y = df.loc[:, binary_columns]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF over word unigrams and bigrams: no analyzer is passed, so the
# vectorizer's default word analyzer applies and ngram_range counts words,
# not characters. The vocabulary is capped at 500 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one soft-voting ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost)
# per binary attribute
results_ensemble = []  # rows of [attribute, accuracy, f1, recall, precision]
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: {col}")

    y_train_col = y_train[col]

    if y_train_col.nunique() < 2:
        # Only one class in the training split: no classifier can be fitted, and
        # when that class is 0 the positive-class weight below would divide by
        # zero. Predict the single observed class instead of aborting the run.
        only_class = y_train_col.iloc[0]
        print(f"  Skipping fit for {col}: training data contains only class {only_class}")
        y_pred = np.full(len(y_test), only_class)
    else:
        # Positive-class weight (training rows / positive rows), computed once
        # and shared by LightGBM, Random Forest and XGBoost.
        # NOTE(review): XGBoost's documentation suggests
        # sum(negative) / sum(positive) for scale_pos_weight; the original
        # ratio is kept here so results stay comparable - confirm.
        pos_weight = len(y_train_col) / y_train_col.sum()
        class_weights = {0: 1, 1: pos_weight}

        # Define Logistic Regression model
        lr_model = LogisticRegression(
            class_weight="balanced",  # Handle imbalanced classes
            max_iter=200,           # Reduced iterations
            n_jobs=-1,               # Use all cores
            random_state=42
        )

        # Define LightGBM model
        lgb_model = lgb.LGBMClassifier(
            n_estimators=50,  # Reduced number of trees
            learning_rate=0.05,  # Lower learning rate for better generalization
            max_depth=2,        # Reduced depth to prevent overfitting
            random_state=42,
            n_jobs=-1,          # Use all available CPU cores
            class_weight=class_weights,  # Handle class imbalance
            verbosity=-1,  # Suppress LightGBM warnings
            subsample=0.8,  # Row subsampling fraction
            subsample_freq=1,  # Required: LightGBM ignores subsample while this is 0 (the default)
            colsample_bytree=0.8  # Feature subsampling to reduce computation
        )

        # Define Random Forest model
        rf_model = RandomForestClassifier(
            n_estimators=50,  # Reduced number of trees
            max_depth=2,      # Reduced depth to prevent overfitting
            class_weight=class_weights,  # Handle class imbalance
            random_state=42,
            n_jobs=-1         # Use all available CPU cores
        )

        # Define XGBoost model
        xgb_model = XGBClassifier(
            n_estimators=50,  # Reduced number of trees
            learning_rate=0.05,  # Lower learning rate for better generalization
            max_depth=2,      # Reduced depth to prevent overfitting
            random_state=42,
            n_jobs=-1,        # Use all available CPU cores
            scale_pos_weight=pos_weight,  # Handle class imbalance
            subsample=0.8,    # Subsample to reduce computation
            colsample_bytree=0.8  # Feature subsampling to reduce computation
        )

        # Create Voting Classifier (ensemble of Logistic Regression, LightGBM, Random Forest, and XGBoost)
        ensemble_model = VotingClassifier(
            estimators=[
                ('lr', lr_model),
                ('lgb', lgb_model),
                ('rf', rf_model),
                ('xgb', xgb_model)
            ],
            voting='soft',  # Use soft voting for probabilistic predictions
            n_jobs=-1       # Use all cores
        )

        # Train the ensemble model
        ensemble_model.fit(X_train_tfidf, y_train_col)

        # Predict on test set
        y_pred = ensemble_model.predict(X_test_tfidf)

    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics; zero_division=0 returns the same 0 score as the default
    # for attributes with no predicted/true positives, without the warning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_ensemble.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute scores collected by the training loop.
metric_columns = ["Attribute", "Accuracy", "F1", "Recall", "Precision"]
results_ensemble_df = pd.DataFrame(results_ensemble, columns=metric_columns)

# Column means over all attributes, taken before the summary row is appended.
avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble = (
    results_ensemble_df[metric].mean() for metric in metric_columns[1:]
)

# Append the means as an extra row labelled "Average".
results_ensemble_df.loc["Average"] = [
    "Average",
    avg_accuracy_ensemble,
    avg_f1_ensemble,
    avg_recall_ensemble,
    avg_precision_ensemble,
]

# Show the table.
print("Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) Results:")
print(results_ensemble_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes).
y_pred_all = np.transpose(np.array(y_pred_all))

# Multi-label report as a dict, which includes the aggregate averages.
classification_report_result = classification_report(
    y_test, y_pred_all, output_dict=True, target_names=binary_columns
)

# Pull out the four aggregate entries.
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print precision / recall / F1 for each aggregate.
for heading, avg in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {avg['precision']:.2f}, Recall: {avg['recall']:.2f}, F1: {avg['f1-score']:.2f}")

# Stop the tracker; fall back to 0.0 when it returns no measurement.
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: dark_pigmentation
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: acne
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: eye_contour
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: homogeneity
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: lack_firmness
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: lack_radiance
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: pores
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: fine_lines
Training Ensemble (Logistic Regression + LightGBM + Random Forest + XGBoost) for attribute: wrinkles_fine-lines
Training Ensemble (Logistic Regression + LightGBM + Random F

### ***Logistic Regression***

In [None]:

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from codecarbon import EmissionsTracker
import logging
import os
import warnings

# Ignore every FutureWarning raised from here on. The filter is not limited to
# scikit-learn and stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Only WARNING and above from the "codecarbon" logger are emitted
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Delete the lock file if it exists
# NOTE(review): presumably left behind by an earlier tracker run in this
# runtime — confirm that removing it by hand is safe here.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Initialize CodeCarbon tracker (allow multiple runs)
# Tracking starts before the data is loaded, so the Excel read below falls
# inside the measured window that ends at tracker.stop().
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
# NOTE(review): hard-coded absolute path; looks like a Colab upload location — confirm.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Clean the 'text_raw' column
def clean_text(text):
    """Normalise one raw text cell to lowercase, letters-only, single-spaced text.

    Args:
        text: Raw cell value. ``None`` and NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string: lowercased, every character other than ASCII
        letters and whitespace removed, whitespace runs collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Empty spreadsheet cells are loaded as NaN floats; calling .lower() on
    # them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Overwrites the raw column with its cleaned form; re-running this line without
# reloading df would clean already-cleaned text.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Features: 'text_raw' column
X = df["text_raw"]

# Labels: Binary attributes (columns 1 to 33)
# Positional slice: the 33 columns at positions 1..33, whatever their names.
binary_columns = df.columns[1:34]  # Assuming columns 1 to 33 are binary attributes
y = df[binary_columns]

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF
# The vocabulary is fitted on the training texts only; the test texts are only
# transformed with it.
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit one independent binary Logistic Regression per label column and score
# each one on the held-out split.
results_lr, y_pred_all = [], []  # per-attribute metric rows / prediction vectors

for i, col in enumerate(binary_columns):
    print(f"Training Logistic Regression for attribute: {col}")

    # Balanced class weights, a raised iteration cap and a fixed seed;
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    ).fit(X_train_tfidf, y_train[col])

    # Hard 0/1 predictions for this attribute on the test matrix
    y_true = y_test[col]
    y_pred = model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)

    # Accuracy first, then F1 / recall / precision for the positive class
    accuracy = accuracy_score(y_true, y_pred)
    f1, recall, precision = (
        scorer(y_true, y_pred, average='binary')
        for scorer in (f1_score, recall_score, precision_score)
    )

    results_lr.append([col, accuracy, f1, recall, precision])

# Per-attribute scores as a table
results_lr_df = pd.DataFrame(results_lr, columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"])

# Unweighted column means across attributes, taken before the summary row exists
avg_accuracy_lr, avg_f1_lr, avg_recall_lr, avg_precision_lr = (
    results_lr_df[metric_name].mean()
    for metric_name in ("Accuracy", "F1", "Recall", "Precision")
)

# Summary row, stored under the index label "Average"
results_lr_df.loc["Average"] = ["Average", avg_accuracy_lr, avg_f1_lr, avg_recall_lr, avg_precision_lr]

print("Logistic Regression Results:")
print(results_lr_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes)
y_pred_all = np.array(y_pred_all).T

# Multi-label report returned as a nested dict instead of text
classification_report_result = classification_report(
    y_test, y_pred_all, target_names=binary_columns, output_dict=True
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, scores in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1: {scores['f1-score']:.2f}")

# End the CodeCarbon measurement; a None result is reported as zero
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")



Training Logistic Regression for attribute: dark_pigmentation
Training Logistic Regression for attribute: acne
Training Logistic Regression for attribute: eye_contour
Training Logistic Regression for attribute: homogeneity
Training Logistic Regression for attribute: lack_firmness
Training Logistic Regression for attribute: lack_radiance
Training Logistic Regression for attribute: pores
Training Logistic Regression for attribute: fine_lines
Training Logistic Regression for attribute: wrinkles_fine-lines
Training Logistic Regression for attribute: eye-wrinkles
Training Logistic Regression for attribute: undereye-bags
Training Logistic Regression for attribute: generic
Training Logistic Regression for attribute: 18-34
Training Logistic Regression for attribute: 35-54
Training Logistic Regression for attribute: 55-99
Training Logistic Regression for attribute: dry
Training Logistic Regression for attribute: normal
Training Logistic Regression for attribute: oily
Training Logistic Regressio

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### **LightGBM 2**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings

# Ignore every FutureWarning raised from here on. The filter is not limited to
# scikit-learn and stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Only WARNING and above from the "codecarbon" logger are emitted
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Delete the lock file if it exists
# NOTE(review): presumably left behind by an earlier tracker run in this
# runtime — confirm that removing it by hand is safe here.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Initialize CodeCarbon tracker (allow multiple runs)
# Tracking starts before the data is loaded, so the Excel read below falls
# inside the measured window that ends at tracker.stop().
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
# NOTE(review): hard-coded absolute path; looks like a Colab upload location — confirm.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Clean the 'text_raw' column
def clean_text(text):
    """Normalise one raw text cell to lowercase, letters-only, single-spaced text.

    Args:
        text: Raw cell value. ``None`` and NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string: lowercased, every character other than ASCII
        letters and whitespace removed, whitespace runs collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Empty spreadsheet cells are loaded as NaN floats; calling .lower() on
    # them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Overwrites the raw column with its cleaned form; re-running this line without
# reloading df would clean already-cleaned text.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Features: 'text_raw' column
X = df["text_raw"]

# Labels: Binary attributes (columns 1 to 33)
# Positional slice: the 33 columns at positions 1..33, whatever their names.
binary_columns = df.columns[1:34]  # Assuming columns 1 to 33 are binary attributes
y = df[binary_columns]

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF with fewer features
# The vocabulary is fitted on the training texts only; the test texts are only
# transformed with it.
vectorizer = TfidfVectorizer(max_features=3000)  # Reduced to 3000 features for efficiency
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one LightGBM model per binary attribute and score it on the test split.
#
# Early stopping is driven by a validation slice carved out of the TRAINING
# data. Using the test split as eval_set would let the test labels choose the
# number of boosting rounds and bias every metric reported below.
results_lgbm = []
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training LightGBM for attribute: {col}")

    # 90/10 split of the training rows: fit on the first part, early-stop on the second
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_tfidf, y_train[col], test_size=0.1, random_state=42
    )

    # Up-weight the positive class by (rows / positives). Fall back to 1.0 when
    # the fitting slice holds no positives, to avoid dividing by zero.
    n_positive = int(y_fit.sum())
    positive_weight = len(y_fit) / n_positive if n_positive else 1.0
    class_weights = {0: 1, 1: positive_weight}

    # Train LightGBM model with optimized hyperparameters
    model = lgb.LGBMClassifier(
        n_estimators=150,  # Upper bound on trees; early stopping may use fewer
        learning_rate=0.1,  # Slightly higher learning rate for faster convergence
        max_depth=5,        # Reduced depth to prevent overfitting
        random_state=42,
        n_jobs=-1,          # Use all available CPU cores
        class_weight=class_weights,  # Handle class imbalance
        verbosity=-1,  # Suppress LightGBM warnings
        subsample=0.8,  # Row subsampling ...
        subsample_freq=1,  # ... which LightGBM only applies when the frequency is > 0
        colsample_bytree=0.8  # Feature subsampling to reduce computation
    )

    # Use early stopping with callbacks
    callbacks = [
        lgb.early_stopping(stopping_rounds=10, verbose=False),  # Early stopping
        lgb.log_evaluation(period=0)  # Disable logging
    ]

    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val)],
        callbacks=callbacks
    )

    # Predict on the untouched test set
    y_pred = model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary')
    recall = recall_score(y_test[col], y_pred, average='binary')
    precision = precision_score(y_test[col], y_pred, average='binary')

    results_lgbm.append([col, accuracy, f1, recall, precision])

# Per-attribute scores as a table
results_lgbm_df = pd.DataFrame(results_lgbm, columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"])

# Unweighted column means across attributes, taken before the summary row exists
avg_accuracy_lgbm, avg_f1_lgbm, avg_recall_lgbm, avg_precision_lgbm = (
    results_lgbm_df[metric_name].mean()
    for metric_name in ("Accuracy", "F1", "Recall", "Precision")
)

# Summary row, stored under the index label "Average"
results_lgbm_df.loc["Average"] = ["Average", avg_accuracy_lgbm, avg_f1_lgbm, avg_recall_lgbm, avg_precision_lgbm]

print("LightGBM Results:")
print(results_lgbm_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes)
y_pred_all = np.array(y_pred_all).T

# Multi-label report returned as a nested dict instead of text
classification_report_result = classification_report(
    y_test, y_pred_all, target_names=binary_columns, output_dict=True
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, scores in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1: {scores['f1-score']:.2f}")

# End the CodeCarbon measurement; a None result is reported as zero
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")



Training LightGBM for attribute: dark_pigmentation
Training LightGBM for attribute: acne
Training LightGBM for attribute: eye_contour
Training LightGBM for attribute: homogeneity
Training LightGBM for attribute: lack_firmness
Training LightGBM for attribute: lack_radiance
Training LightGBM for attribute: pores
Training LightGBM for attribute: fine_lines
Training LightGBM for attribute: wrinkles_fine-lines
Training LightGBM for attribute: eye-wrinkles
Training LightGBM for attribute: undereye-bags
Training LightGBM for attribute: generic
Training LightGBM for attribute: 18-34
Training LightGBM for attribute: 35-54
Training LightGBM for attribute: 55-99
Training LightGBM for attribute: dry
Training LightGBM for attribute: normal
Training LightGBM for attribute: oily
Training LightGBM for attribute: combination
Training LightGBM for attribute: sensitivity-high
Training LightGBM for attribute: sensitivity-low
Training LightGBM for attribute: no_sensitivity
Training LightGBM for attribute: 

### **SVM**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from codecarbon import EmissionsTracker
import logging
import os
import warnings

# Suppress warnings and logs
# The warning filter drops every FutureWarning (from any library) for the rest
# of the kernel session; the "codecarbon" logger is limited to WARNING and above.
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Remove CodeCarbon lock file if it exists
# NOTE(review): presumably left behind by an earlier tracker run in this
# runtime — confirm that removing it by hand is safe here.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start CodeCarbon tracker
# Tracking starts before the data is loaded, so the Excel read below falls
# inside the measured window that ends at tracker.stop().
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)
tracker.start()

# Load dataset
# NOTE(review): hard-coded absolute path; looks like a Colab upload location — confirm.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

def clean_text(text):
    """Normalise one raw text cell to lowercase, letters-only, single-spaced text.

    Args:
        text: Raw cell value. ``None`` and NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string: lowercased, every character other than ASCII
        letters and whitespace removed, whitespace runs collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Empty spreadsheet cells are loaded as NaN floats; calling .lower() on
    # them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Overwrites the raw column with its cleaned form; re-running this line without
# reloading df would clean already-cleaned text.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Define features and labels
# Labels are a positional slice: the 33 columns at positions 1..33.
X = df["text_raw"]
binary_columns = df.columns[1:34]
y = df[binary_columns]

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization with the vocabulary capped by max_features
# The vocabulary is fitted on the training texts only; the test texts are only
# transformed with it.
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for efficiency
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one linear-kernel SVM per binary attribute and score it on the test split
results_svm = []
y_pred_all = []

for col in binary_columns:
    print(f"Training SVM for attribute: {col}")

    # Train SVM model
    # probability=True is deliberately not set: only predict() is used below,
    # and enabling it makes SVC run an extra internal cross-validation to
    # calibrate probabilities, multiplying training time (and the tracked
    # emissions) without affecting the predicted labels.
    model = SVC(kernel="linear", class_weight="balanced")
    model.fit(X_train_tfidf, y_train[col])

    # Predict and store results
    y_pred = model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)

    # Compute metrics (F1 / recall / precision refer to the positive class)
    results_svm.append([
        col,
        accuracy_score(y_test[col], y_pred),
        f1_score(y_test[col], y_pred, average='binary'),
        recall_score(y_test[col], y_pred, average='binary'),
        precision_score(y_test[col], y_pred, average='binary')
    ])

# Per-attribute scores as a table
results_svm_df = pd.DataFrame(results_svm, columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"])

# Means of the numeric columns, appended as a summary row labelled "Average"
avg_metrics = results_svm_df.mean(numeric_only=True)
results_svm_df.loc["Average"] = ["Average", *avg_metrics.tolist()]

print("Optimized SVM Results:")
print(results_svm_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes)
y_pred_all = np.array(y_pred_all).T

# Multi-label report returned as a nested dict instead of text
classification_report_result = classification_report(
    y_test, y_pred_all, target_names=binary_columns, output_dict=True
)

# Only the macro and weighted aggregates are printed for this model
for avg_type in ["macro avg", "weighted avg"]:
    print(f"\n{avg_type.capitalize()}:")
    scores = classification_report_result[avg_type]
    print(f"Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1: {scores['f1-score']:.2f}")

# End the CodeCarbon measurement; any falsy result is reported as zero
emissions = tracker.stop()
if not emissions:
    emissions = 0.0
print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")




Training SVM for attribute: dark_pigmentation
Training SVM for attribute: acne
Training SVM for attribute: eye_contour
Training SVM for attribute: homogeneity
Training SVM for attribute: lack_firmness
Training SVM for attribute: lack_radiance
Training SVM for attribute: pores
Training SVM for attribute: fine_lines
Training SVM for attribute: wrinkles_fine-lines
Training SVM for attribute: eye-wrinkles
Training SVM for attribute: undereye-bags
Training SVM for attribute: generic
Training SVM for attribute: 18-34
Training SVM for attribute: 35-54
Training SVM for attribute: 55-99
Training SVM for attribute: dry
Training SVM for attribute: normal
Training SVM for attribute: oily
Training SVM for attribute: combination
Training SVM for attribute: sensitivity-high
Training SVM for attribute: sensitivity-low
Training SVM for attribute: no_sensitivity
Training SVM for attribute: male
Training SVM for attribute: female
Training SVM for attribute: cleanse
Training SVM for attribute: prepare
Tra

### **Gradient Boosting**

In [4]:


import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources (stopword lists and WordNet, used by clean_text)
nltk.download('stopwords')
nltk.download('wordnet')

# Ignore every FutureWarning raised from here on. The filter is not limited to
# scikit-learn and stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Only WARNING and above from the "codecarbon" logger are emitted
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Delete the lock file if it exists
# NOTE(review): presumably left behind by an earlier tracker run in this
# runtime — confirm that removing it by hand is safe here.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Initialize CodeCarbon tracker (allow multiple runs)
# Tracking starts before the data is loaded, so the Excel read below falls
# inside the measured window that ends at tracker.stop().
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
# NOTE(review): hard-coded absolute path; looks like a Colab upload location — confirm.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Enhanced Text Cleaning
# Both objects are module-level state read by clean_text on every call; the
# stopword list is held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lowercase, strip non-letters, drop stopwords and lemmatize one text cell.

    Reads the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: Raw cell value. ``None`` and NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining words, each passed through ``lemmatizer.lemmatize`` and
        joined by single spaces.
    """
    # Empty spreadsheet cells are loaded as NaN floats; calling .lower() on
    # them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Overwrites the raw column with its cleaned form; re-running this line without
# reloading df would clean already-cleaned text.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Features: 'text_raw' column
X = df["text_raw"]

# Labels: Binary attributes (columns 1 to 33)
# Positional slice: the 33 columns at positions 1..33, whatever their names.
binary_columns = df.columns[1:34]  # Assuming columns 1 to 33 are binary attributes
y = df[binary_columns]

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF over word unigrams and bigrams.
# No `analyzer` argument is passed, so these are NOT character n-grams:
# scikit-learn's default analyzer works on words.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Word 1- and 2-grams, vocabulary capped at 5000
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Handle imbalanced data using SMOTE, applied to each label separately.
# Resampling happens inside the training loop so that only one oversampled copy
# of the training matrix is alive at a time, instead of one per attribute.
smote = SMOTE(random_state=42)

# Train Gradient Boosting model for each binary attribute
results_gb = []
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training Gradient Boosting for attribute: {col}")

    # SMOTE synthesises points between a minority sample and its k_neighbors
    # nearest minority neighbours, so it needs more than k_neighbors minority
    # rows. For rarer labels, train on the original data instead of crashing.
    label_counts = y_train[col].value_counts()
    if len(label_counts) > 1 and label_counts.min() > smote.k_neighbors:
        X_res, y_res = smote.fit_resample(X_train_tfidf, y_train[col])
    else:
        X_res, y_res = X_train_tfidf, y_train[col]

    # Train Gradient Boosting model on resampled data
    model = GradientBoostingClassifier(
        n_estimators=200,  # Increase number of trees
        learning_rate=0.05,  # Lower learning rate for better generalization
        max_depth=5,        # Limit depth to prevent overfitting
        random_state=42
    )

    model.fit(X_res, y_res)

    # Predict on test set (never resampled)
    y_pred = model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary')
    recall = recall_score(y_test[col], y_pred, average='binary')
    precision = precision_score(y_test[col], y_pred, average='binary')

    results_gb.append([col, accuracy, f1, recall, precision])

# Per-attribute scores as a table
results_gb_df = pd.DataFrame(results_gb, columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"])

# Unweighted column means across attributes, taken before the summary row exists
avg_accuracy_gb, avg_f1_gb, avg_recall_gb, avg_precision_gb = (
    results_gb_df[metric_name].mean()
    for metric_name in ("Accuracy", "F1", "Recall", "Precision")
)

# Summary row, stored under the index label "Average"
results_gb_df.loc["Average"] = ["Average", avg_accuracy_gb, avg_f1_gb, avg_recall_gb, avg_precision_gb]

print("Gradient Boosting Results:")
print(results_gb_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes)
y_pred_all = np.array(y_pred_all).T

# Multi-label report returned as a nested dict instead of text
classification_report_result = classification_report(
    y_test, y_pred_all, target_names=binary_columns, output_dict=True
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, scores in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1: {scores['f1-score']:.2f}")

# End the CodeCarbon measurement; a None result is reported as zero
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Gradient Boosting for attribute: dark_pigmentation
Training Gradient Boosting for attribute: acne
Training Gradient Boosting for attribute: eye_contour
Training Gradient Boosting for attribute: homogeneity
Training Gradient Boosting for attribute: lack_firmness
Training Gradient Boosting for attribute: lack_radiance
Training Gradient Boosting for attribute: pores
Training Gradient Boosting for attribute: fine_lines
Training Gradient Boosting for attribute: wrinkles_fine-lines
Training Gradient Boosting for attribute: eye-wrinkles
Training Gradient Boosting for attribute: undereye-bags
Training Gradient Boosting for attribute: generic
Training Gradient Boosting for attribute: 18-34
Training Gradient Boosting for attribute: 35-54
Training Gradient Boosting for attribute: 55-99
Training Gradient Boosting for attribute: dry
Training Gradient Boosting for attribute: normal
Training Gradient Boosting for attribute: oily
Training Gradient Boosting for attribute: combination
Training 

### **Logistic Regression + LightGBM 3**

In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import VotingClassifier

# Download NLTK resources (stopword lists and WordNet, used by clean_text)
nltk.download('stopwords')
nltk.download('wordnet')

# Ignore every FutureWarning raised from here on. The filter is not limited to
# scikit-learn and stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Only WARNING and above from the "codecarbon" logger are emitted
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Delete the lock file if it exists
# NOTE(review): presumably left behind by an earlier tracker run in this
# runtime — confirm that removing it by hand is safe here.
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Initialize CodeCarbon tracker (allow multiple runs)
# Tracking starts before the data is loaded, so the Excel read below falls
# inside the measured window that ends at tracker.stop().
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
# NOTE(review): hard-coded absolute path; looks like a Colab upload location — confirm.
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Enhanced Text Cleaning
# Both objects are module-level state read by clean_text on every call; the
# stopword list is held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lowercase, strip non-letters, drop stopwords and lemmatize one text cell.

    Reads the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: Raw cell value. ``None`` and NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining words, each passed through ``lemmatizer.lemmatize`` and
        joined by single spaces.
    """
    # Empty spreadsheet cells are loaded as NaN floats; calling .lower() on
    # them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Overwrites the raw column with its cleaned form; re-running this line without
# reloading df would clean already-cleaned text.
df["text_raw"] = df["text_raw"].apply(clean_text)

# Features: 'text_raw' column
X = df["text_raw"]

# Labels: Binary attributes (columns 1 to 33)
# Positional slice: the 33 columns at positions 1..33, whatever their names.
binary_columns = df.columns[1:34]  # Assuming columns 1 to 33 are binary attributes
y = df[binary_columns]

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF over word unigrams and bigrams.
# No `analyzer` argument is passed, so these are NOT character n-grams:
# scikit-learn's default analyzer works on words.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Word 1- and 2-grams, vocabulary capped at 5000
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a soft-voting Logistic Regression + LightGBM ensemble for each binary
# attribute and score it on the test split.
results_ensemble = []
y_pred_all = []  # To store predictions for all attributes

for col in binary_columns:
    print(f"Training Ensemble (Logistic Regression + LightGBM) for attribute: {col}")

    # Up-weight the positive class by (rows / positives) for LightGBM. Fall back
    # to 1.0 when the label has no positive training rows, to avoid dividing
    # by zero.
    n_positive = int(y_train[col].sum())
    positive_weight = len(y_train[col]) / n_positive if n_positive else 1.0
    class_weights = {0: 1, 1: positive_weight}

    # Define Logistic Regression model
    lr_model = LogisticRegression(
        class_weight="balanced",  # Handle imbalanced classes
        max_iter=1000,           # Increase iterations for convergence
        n_jobs=-1,               # Use all cores
        random_state=42
    )

    # Define LightGBM model
    lgb_model = lgb.LGBMClassifier(
        n_estimators=150,  # Reduced number of trees
        learning_rate=0.1,  # Slightly higher learning rate for faster convergence
        max_depth=5,        # Reduced depth to prevent overfitting
        random_state=42,
        n_jobs=-1,          # Use all available CPU cores
        class_weight=class_weights,  # Handle class imbalance
        verbosity=-1,  # Suppress LightGBM warnings
        subsample=0.8,  # Row subsampling ...
        subsample_freq=1,  # ... which LightGBM only applies when the frequency is > 0
        colsample_bytree=0.8  # Feature subsampling to reduce computation
    )

    # Create Voting Classifier (ensemble of Logistic Regression and LightGBM)
    ensemble_model = VotingClassifier(
        estimators=[
            ('lr', lr_model),
            ('lgb', lgb_model)
        ],
        voting='soft',  # Average the two models' predicted probabilities
        n_jobs=-1       # Fit the member estimators in parallel
    )

    # Train the ensemble model
    ensemble_model.fit(X_train_tfidf, y_train[col])

    # Predict on test set
    y_pred = ensemble_model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)  # Store predictions

    # Calculate metrics
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary')
    recall = recall_score(y_test[col], y_pred, average='binary')
    precision = precision_score(y_test[col], y_pred, average='binary')

    results_ensemble.append([col, accuracy, f1, recall, precision])

# Per-attribute scores as a table
results_ensemble_df = pd.DataFrame(results_ensemble, columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"])

# Unweighted column means across attributes, taken before the summary row exists
avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble = (
    results_ensemble_df[metric_name].mean()
    for metric_name in ("Accuracy", "F1", "Recall", "Precision")
)

# Summary row, stored under the index label "Average"
results_ensemble_df.loc["Average"] = ["Average", avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble]

print("Ensemble (Logistic Regression + LightGBM) Results:")
print(results_ensemble_df)

# Stack the per-attribute prediction vectors into (num_samples, num_attributes)
y_pred_all = np.array(y_pred_all).T

# Multi-label report returned as a nested dict instead of text
classification_report_result = classification_report(
    y_test, y_pred_all, target_names=binary_columns, output_dict=True
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[key]
    for key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, scores in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1: {scores['f1-score']:.2f}")

# End the CodeCarbon measurement; a None result is reported as zero
emissions = tracker.stop()
emissions = 0.0 if emissions is None else emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Ensemble (Logistic Regression + LightGBM) for attribute: dark_pigmentation
Training Ensemble (Logistic Regression + LightGBM) for attribute: acne
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye_contour
Training Ensemble (Logistic Regression + LightGBM) for attribute: homogeneity
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_firmness
Training Ensemble (Logistic Regression + LightGBM) for attribute: lack_radiance
Training Ensemble (Logistic Regression + LightGBM) for attribute: pores
Training Ensemble (Logistic Regression + LightGBM) for attribute: fine_lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: wrinkles_fine-lines
Training Ensemble (Logistic Regression + LightGBM) for attribute: eye-wrinkles
Training Ensemble (Logistic Regression + LightGBM) for attribute: undereye-bags
Training Ensemble (Logistic Regression + LightGBM) for attribute: generic
Training Ensemble (Logistic Regression + LightGBM) for attrib

### **Logistic Regression + XGBoost**

In [6]:

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from codecarbon import EmissionsTracker
import logging
import os
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Make sure the NLTK corpora used by the cleaning step are available locally
for nltk_resource in ("stopwords", "wordnet"):
    nltk.download(nltk_resource)

# Hide FutureWarning noise (e.g. scikit-learn deprecation notices)
warnings.filterwarnings("ignore", category=FutureWarning)

# Raise the CodeCarbon logger threshold so routine messages stay out of the output
codecarbon_logger = logging.getLogger("codecarbon")
codecarbon_logger.setLevel(logging.WARNING)

# Clear a leftover CodeCarbon lock file, if there is one
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start emissions tracking; only error-level messages are logged and
# multiple runs are explicitly allowed
tracker = EmissionsTracker(
    log_level="error",
    allow_multiple_runs=True,
)
tracker.start()

# Read the labelled dataset from the Excel workbook
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Resources for the enhanced text cleaning: English stopwords and a WordNet lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one document for TF-IDF vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, whitespace runs are collapsed, stopwords
    (``stop_words``) are dropped and the remaining tokens are lemmatized
    with ``lemmatizer``.

    Args:
        text: Raw cell value. Non-string values (for example the NaN that
            pandas produces for an empty spreadsheet cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces ("" if none remain).
    """
    if not isinstance(text, str):
        # float NaN / None have no .lower(); an empty document is the safe stand-in
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords and lemmatize
    return " ".join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

# Normalize every document with clean_text (the column is overwritten in place)
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text
X = df["text_raw"]

# Targets: the label columns at positions 1..33 (assumed to be the binary attributes)
binary_columns = df.columns[1:34]
y = df[binary_columns]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over word 1- to 3-grams (the vectorizer's default analyzer works on words),
# limited to 10,000 features; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one soft-voting ensemble (Logistic Regression + XGBoost) per binary attribute
results_ensemble = []
y_pred_all = []  # One prediction vector per attribute, in binary_columns order

for col in binary_columns:
    print(f"Training Ensemble (Logistic Regression + XGBoost) for attribute: {col}")

    # Handle class imbalance using SMOTE (fitted on the training split only)
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train[col])

    # Class counts after resampling. A vectorized .sum() replaces the builtin sum(),
    # which walked the labels element by element (and was evaluated twice).
    n_pos = int(y_train_res.sum())
    n_neg = len(y_train_res) - n_pos

    # Logistic Regression; n_jobs is deliberately not passed because the
    # liblinear solver ignores it and only emits a warning.
    lr_model = LogisticRegression(
        class_weight="balanced",  # Handle imbalanced classes
        max_iter=1000,           # Increase iterations for convergence
        random_state=42,
        C=0.1,                   # Regularization parameter
        solver='liblinear'       # Solver for better performance on small datasets
    )

    # Gradient-boosted trees
    xgb_model = XGBClassifier(
        n_estimators=200,  # Increased number of trees
        learning_rate=0.05,  # Lower learning rate for better generalization
        max_depth=7,        # Increased depth for more complex models
        random_state=42,
        n_jobs=-1,          # Use all available CPU cores
        scale_pos_weight=n_neg / n_pos,  # Negative/positive ratio of the resampled labels
        subsample=0.9,  # Subsample to reduce computation
        colsample_bytree=0.9  # Feature subsampling to reduce computation
    )

    # Soft voting averages the two models' predicted probabilities
    ensemble_model = VotingClassifier(
        estimators=[
            ('lr', lr_model),
            ('xgb', xgb_model)
        ],
        voting='soft',
        n_jobs=-1       # Use all cores
    )

    # Train the ensemble model
    ensemble_model.fit(X_train_res, y_train_res)

    # Predict on the untouched test split
    y_pred = ensemble_model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)  # Store predictions

    # Per-attribute metrics. zero_division=0 yields the same score as the default
    # when a denominator is empty, but without an UndefinedMetricWarning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_ensemble.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute metrics
results_ensemble_df = pd.DataFrame(
    results_ensemble,
    columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"],
)

# Column means over all attributes, taken before the summary row is appended
avg_accuracy_ensemble, avg_f1_ensemble, avg_recall_ensemble, avg_precision_ensemble = (
    results_ensemble_df[metric].mean()
    for metric in ("Accuracy", "F1", "Recall", "Precision")
)

# Append the summary row under the index label "Average"
results_ensemble_df.loc["Average"] = [
    "Average",
    avg_accuracy_ensemble,
    avg_f1_ensemble,
    avg_recall_ensemble,
    avg_precision_ensemble,
]

# Show the table
print("Ensemble (Logistic Regression + XGBoost) Results:")
print(results_ensemble_df)

# Stack the per-attribute prediction vectors into shape (num_samples, num_attributes)
y_pred_all = np.asarray(y_pred_all).transpose()

# Multi-label classification report, returned as a nested dict
classification_report_result = classification_report(
    y_test,
    y_pred_all,
    target_names=binary_columns,
    output_dict=True,
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[avg_key]
    for avg_key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, stats in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {stats['precision']:.2f}, Recall: {stats['recall']:.2f}, F1: {stats['f1-score']:.2f}")

# Stop the CodeCarbon tracker; fall back to 0.0 when it reports nothing
measured_emissions = tracker.stop()
emissions = 0.0 if measured_emissions is None else measured_emissions

print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Ensemble (Logistic Regression + XGBoost) for attribute: dark_pigmentation
Training Ensemble (Logistic Regression + XGBoost) for attribute: acne
Training Ensemble (Logistic Regression + XGBoost) for attribute: eye_contour
Training Ensemble (Logistic Regression + XGBoost) for attribute: homogeneity
Training Ensemble (Logistic Regression + XGBoost) for attribute: lack_firmness
Training Ensemble (Logistic Regression + XGBoost) for attribute: lack_radiance
Training Ensemble (Logistic Regression + XGBoost) for attribute: pores
Training Ensemble (Logistic Regression + XGBoost) for attribute: fine_lines
Training Ensemble (Logistic Regression + XGBoost) for attribute: wrinkles_fine-lines
Training Ensemble (Logistic Regression + XGBoost) for attribute: eye-wrinkles
Training Ensemble (Logistic Regression + XGBoost) for attribute: undereye-bags
Training Ensemble (Logistic Regression + XGBoost) for attribute: generic
Training Ensemble (Logistic Regression + XGBoost) for attribute: 18-34
Tr

### **Random Forest**

In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from codecarbon import EmissionsTracker
import logging
import os
import warnings

# Keep the output quiet: hide FutureWarning and raise the CodeCarbon logger threshold
warnings.filterwarnings("ignore", category=FutureWarning)
codecarbon_logger = logging.getLogger("codecarbon")
codecarbon_logger.setLevel(logging.WARNING)

# Clear a leftover CodeCarbon lock file, if there is one
lock_file = "/tmp/.codecarbon.lock"
if os.path.exists(lock_file):
    os.remove(lock_file)

# Start emissions tracking (error-level logging only, multiple runs allowed)
tracker = EmissionsTracker(
    allow_multiple_runs=True,
    log_level="error",
)
tracker.start()

# Read the labelled dataset from the Excel workbook
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

def clean_text(text):
    """Normalize one document for TF-IDF vectorization.

    Lower-cases the text, removes every character that is not an ASCII
    letter or whitespace, collapses whitespace runs to a single space and
    trims both ends.

    Args:
        text: Raw cell value. Non-string values (for example the NaN that
            pandas produces for an empty spreadsheet cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned text ("" if nothing remains).
    """
    if not isinstance(text, str):
        # float NaN / None have no .lower(); an empty document is the safe stand-in
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalize every document with clean_text (the column is overwritten in place)
df["text_raw"] = df["text_raw"].apply(clean_text)

# Input text and the label columns at positions 1..33
X = df["text_raw"]
binary_columns = df.columns[1:34]
y = df[binary_columns]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF limited to 5,000 features; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train one Random Forest per binary attribute
results_rf = []
y_pred_all = []  # One prediction vector per attribute, in binary_columns order

for col in binary_columns:
    print(f"Training Random Forest for attribute: {col}")

    # Up-weight the positive class by N / n_pos. When a column has no positive
    # training example the ratio is undefined, so fall back to unweighted training
    # instead of dividing by zero.
    n_pos = int(y_train[col].sum())
    class_weights = {0: 1, 1: len(y_train) / n_pos} if n_pos else None

    model = RandomForestClassifier(
        n_estimators=150,  # Reduce number of trees for efficiency
        max_depth=10,  # Moderate depth to balance accuracy and emissions
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight=class_weights,
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train_tfidf, y_train[col])

    y_pred = model.predict(X_test_tfidf)
    y_pred_all.append(y_pred)

    # zero_division=0 yields the same score as the default when a denominator is
    # empty, but without an UndefinedMetricWarning.
    accuracy = accuracy_score(y_test[col], y_pred)
    f1 = f1_score(y_test[col], y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test[col], y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test[col], y_pred, average='binary', zero_division=0)

    results_rf.append([col, accuracy, f1, recall, precision])

# Tabulate the per-attribute metrics
results_rf_df = pd.DataFrame(
    results_rf,
    columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"],
)

# Mean of every numeric column, appended as a summary row labelled "Average"
avg_metrics = results_rf_df.mean(numeric_only=True)
results_rf_df.loc["Average"] = ["Average", *avg_metrics.tolist()]

print("Random Forest Results:")
print(results_rf_df)

# Stack the per-attribute prediction vectors into shape (num_samples, num_attributes)
y_pred_all = np.asarray(y_pred_all).transpose()

# Multi-label classification report, returned as a nested dict
classification_report_result = classification_report(
    y_test,
    y_pred_all,
    target_names=binary_columns,
    output_dict=True,
)

print("\nClassification Report Averages:")
for avg_type in ("micro avg", "macro avg", "weighted avg", "samples avg"):
    # A missing aggregate (or missing metric) is reported as 0
    avg = classification_report_result.get(avg_type, {})
    avg_p, avg_r, avg_f = (avg.get(metric, 0) for metric in ('precision', 'recall', 'f1-score'))
    print(f"{avg_type.capitalize()} - Precision: {avg_p:.2f}, Recall: {avg_r:.2f}, F1: {avg_f:.2f}")

# Stop the emissions tracker; any falsy result is reported as 0.0
emissions = tracker.stop()
if not emissions:
    emissions = 0.0
print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")




Training Random Forest for attribute: dark_pigmentation
Training Random Forest for attribute: acne
Training Random Forest for attribute: eye_contour
Training Random Forest for attribute: homogeneity
Training Random Forest for attribute: lack_firmness
Training Random Forest for attribute: lack_radiance
Training Random Forest for attribute: pores
Training Random Forest for attribute: fine_lines
Training Random Forest for attribute: wrinkles_fine-lines
Training Random Forest for attribute: eye-wrinkles
Training Random Forest for attribute: undereye-bags
Training Random Forest for attribute: generic
Training Random Forest for attribute: 18-34
Training Random Forest for attribute: 35-54
Training Random Forest for attribute: 55-99
Training Random Forest for attribute: dry
Training Random Forest for attribute: normal
Training Random Forest for attribute: oily
Training Random Forest for attribute: combination
Training Random Forest for attribute: sensitivity-high
Training Random Forest for att

### **RoBERTa + LightGBM**

In [4]:

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
import torch
from torch import nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import lightgbm as lgb
from sklearn.multioutput import MultiOutputClassifier
from codecarbon import EmissionsTracker
import logging

# Disable CodeCarbon logs
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Initialize CodeCarbon tracker (silent mode).
# allow_multiple_runs=True matches the tracker configuration of the other model
# cells in this notebook; without it the tracker may decline to measure when it
# believes another CodeCarbon instance is still active.
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Clean the 'text_raw' column
def clean_text(text):
    """Normalize one document before tokenization / vectorization.

    Lower-cases the text, removes every character that is not an ASCII
    letter or whitespace, collapses whitespace runs to a single space and
    trims both ends.

    Args:
        text: Raw cell value. Non-string values (for example the NaN that
            pandas produces for an empty spreadsheet cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned text ("" if nothing remains).
    """
    if not isinstance(text, str):
        # float NaN / None have no .lower(); an empty document is the safe stand-in
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalize every document with clean_text (the column is overwritten in place)
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text
X = df["text_raw"]

# Targets: the label columns at positions 1..33 (assumed to be the binary attributes)
binary_columns = df.columns[1:34]
y = df[binary_columns]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pre-trained tokenizer that matches the roberta-base checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_data(texts, labels):
    """Tokenize ``texts`` with the module-level ``tokenizer`` and attach ``labels``.

    Args:
        texts: Object exposing ``.tolist()`` that yields the documents
            (a pandas Series at the call sites below).
        labels: Object exposing ``.values.tolist()`` with one label row per
            document (a pandas DataFrame at the call sites below).

    Returns:
        The tokenizer output (truncated to at most 128 tokens and padded),
        with the label rows stored under the extra key ``"labels"``.
    """
    batch = tokenizer(
        texts.tolist(),
        max_length=128,
        padding=True,
        truncation=True,
    )
    batch["labels"] = labels.values.tolist()
    return batch

# Encode both splits for RoBERTa
train_encodings = tokenize_data(X_train, y_train)
test_encodings = tokenize_data(X_test, y_test)

# TF-IDF features (limited to 5,000) for LightGBM; vocabulary fitted on the training split only
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Create PyTorch Dataset
class SkinConditionDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-sample sequences.

    ``encodings`` maps field names to indexable sequences; it must contain
    the key ``"input_ids"``, whose length defines the dataset size.
    """

    def __init__(self, encodings):
        # Stored as given; entries are converted to tensors lazily per item
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples (length of the ``"input_ids"`` entry)."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one per field in ``encodings``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap both encoded splits as datasets
train_dataset, test_dataset = (
    SkinConditionDataset(split_encodings)
    for split_encodings in (train_encodings, test_encodings)
)

# Custom Trainer for Multi-Label Classification
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy on the logits (multi-label setup)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the BCE-with-logits loss for one batch.

        ``"labels"`` is removed from ``inputs`` before the forward pass and
        cast to float for the loss. Extra keyword arguments are accepted and
        ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
        loss = criterion(outputs.logits, targets.float())
        if return_outputs:
            return loss, outputs
        return loss

# roberta-base with a classification head that has one output per binary attribute
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(binary_columns),
)

# Training configuration
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # schedule and optimisation
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # run an evaluation pass after every epoch
    eval_strategy="epoch",
)

# Wire model, arguments and datasets into the multi-label trainer.
# NOTE(review): the test split is also used as the per-epoch evaluation set.
trainer = MultiLabelTrainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune RoBERTa
trainer.train()

# Score the test split: logits -> sigmoid -> 0/1 labels at a 0.5 cut-off
roberta_predictions = trainer.predict(test_dataset)
roberta_logits = torch.tensor(roberta_predictions.predictions)
roberta_preds = (torch.sigmoid(roberta_logits) > 0.5).int()

# Train LightGBM model (one binary classifier per attribute)
lgb_model = MultiOutputClassifier(lgb.LGBMClassifier())
lgb_model.fit(X_train_tfidf, y_train)

# Evaluate LightGBM model: hard labels plus positive-class probabilities
lgb_preds = lgb_model.predict(X_test_tfidf)
# predict_proba yields one (n_samples, n_classes) array per attribute; keep the
# column belonging to class 1 (all zeros if an attribute had no positive in training).
lgb_probs = np.column_stack([
    proba[:, list(estimator.classes_).index(1)] if 1 in estimator.classes_ else np.zeros(proba.shape[0])
    for estimator, proba in zip(lgb_model.estimators_, lgb_model.predict_proba(X_test_tfidf))
])

# RoBERTa positive-class probabilities, taken from the raw logits
roberta_probs = torch.sigmoid(torch.tensor(roberta_predictions.predictions)).numpy()

# Combine predictions (70% RoBERTa, 30% LightGBM)
# The blend works on probabilities. Blending the 0/1 labels with these weights
# and a 0.5 cut-off always reproduced RoBERTa's label (0.7 > 0.5 > 0.3), so
# LightGBM had no influence on the combined result.
roberta_weight = 0.7
lgb_weight = 0.3
combined_probs = roberta_weight * roberta_probs + lgb_weight * lgb_probs
combined_preds = (combined_probs > 0.5).astype(int)

# Per-attribute metrics for the combined predictions
results_combined = []
for i, col in enumerate(binary_columns):
    y_true_col, y_pred_col = y_test[col], combined_preds[:, i]
    accuracy = accuracy_score(y_true_col, y_pred_col)
    f1 = f1_score(y_true_col, y_pred_col, average='binary')
    recall = recall_score(y_true_col, y_pred_col, average='binary')
    precision = precision_score(y_true_col, y_pred_col, average='binary')
    results_combined.append([col, accuracy, f1, recall, precision])

# Tabulate the metrics
results_combined_df = pd.DataFrame(
    results_combined,
    columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"],
)

# Column means over all attributes, taken before the summary row is appended
avg_accuracy_combined, avg_f1_combined, avg_recall_combined, avg_precision_combined = (
    results_combined_df[metric].mean()
    for metric in ("Accuracy", "F1", "Recall", "Precision")
)

# Append the summary row under the index label "Average"
results_combined_df.loc["Average"] = [
    "Average",
    avg_accuracy_combined,
    avg_f1_combined,
    avg_recall_combined,
    avg_precision_combined,
]

# Show the table
print("Combined Results (RoBERTa + LightGBM):")
print(results_combined_df)

# Multi-label classification report, returned as a nested dict
classification_report_result = classification_report(
    y_test,
    combined_preds,
    target_names=binary_columns,
    output_dict=True,
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[avg_key]
    for avg_key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, stats in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {stats['precision']:.2f}, Recall: {stats['recall']:.2f}, F1: {stats['f1-score']:.2f}")

# Stop the CodeCarbon tracker and get the emissions
emissions = tracker.stop()
if emissions is None:
    # No measurement was returned; default to 0.0 so the ":.4f" format below
    # cannot raise a TypeError after the whole training run has finished.
    emissions = 0.0
print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4467,0.416291
2,0.3506,0.327864
3,0.2941,0.293253
4,0.2476,0.274837
5,0.2304,0.259873
6,0.2065,0.255068
7,0.1863,0.248585
8,0.1737,0.24533
9,0.1565,0.245061
10,0.1519,0.243642


[LightGBM] [Info] Number of positive: 592, number of negative: 4400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86181
[LightGBM] [Info] Number of data points in the train set: 4992, number of used features: 2117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118590 -> initscore=-2.005853
[LightGBM] [Info] Start training from score -2.005853
[LightGBM] [Info] Number of positive: 534, number of negative: 4458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86181
[LightGBM] [Info] Number of data points in the train set: 4992, number of used features: 2117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.106971 -> initscore=-2.122060
[Lig

### **RoBERTa model**

In [3]:


import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
import torch
from torch import nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from codecarbon import EmissionsTracker
import logging

# Disable CodeCarbon logs
logging.getLogger("codecarbon").setLevel(logging.WARNING)

# Initialize CodeCarbon tracker (silent mode).
# allow_multiple_runs=True matches the tracker configuration of the other model
# cells in this notebook; without it the tracker may decline to measure when it
# believes another CodeCarbon instance is still active.
tracker = EmissionsTracker(log_level="error", allow_multiple_runs=True)  # Suppress all logs except errors
tracker.start()

# Load the dataset
file_path = "/content/anthropic.claude-3-5-sonnet Full.xlsx"
df = pd.read_excel(file_path)

# Clean the 'text_raw' column
def clean_text(text):
    """Normalize one document before tokenization.

    Lower-cases the text, removes every character that is not an ASCII
    letter or whitespace, collapses whitespace runs to a single space and
    trims both ends.

    Args:
        text: Raw cell value. Non-string values (for example the NaN that
            pandas produces for an empty spreadsheet cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned text ("" if nothing remains).
    """
    if not isinstance(text, str):
        # float NaN / None have no .lower(); an empty document is the safe stand-in
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalize every document with clean_text (the column is overwritten in place)
df["text_raw"] = df["text_raw"].apply(clean_text)

# Model input: the cleaned text
X = df["text_raw"]

# Targets: the label columns at positions 1..33 (assumed to be the binary attributes)
binary_columns = df.columns[1:34]
y = df[binary_columns]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pre-trained tokenizer that matches the roberta-base checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_data(texts, labels):
    """Tokenize ``texts`` with the module-level ``tokenizer`` and attach ``labels``.

    Args:
        texts: Object exposing ``.tolist()`` that yields the documents
            (a pandas Series at the call sites below).
        labels: Object exposing ``.values.tolist()`` with one label row per
            document (a pandas DataFrame at the call sites below).

    Returns:
        The tokenizer output (truncated to at most 128 tokens and padded),
        with the label rows stored under the extra key ``"labels"``.
    """
    batch = tokenizer(
        texts.tolist(),
        max_length=128,
        padding=True,
        truncation=True,
    )
    batch["labels"] = labels.values.tolist()
    return batch

# Encode both splits for RoBERTa
train_encodings = tokenize_data(X_train, y_train)
test_encodings = tokenize_data(X_test, y_test)

# Create PyTorch Dataset
class SkinConditionDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-sample sequences.

    ``encodings`` maps field names to indexable sequences; it must contain
    the key ``"input_ids"``, whose length defines the dataset size.
    """

    def __init__(self, encodings):
        # Stored as given; entries are converted to tensors lazily per item
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples (length of the ``"input_ids"`` entry)."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one per field in ``encodings``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap both encoded splits as datasets
train_dataset, test_dataset = (
    SkinConditionDataset(split_encodings)
    for split_encodings in (train_encodings, test_encodings)
)

# Custom Trainer for Multi-Label Classification
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy on the logits (multi-label setup)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the BCE-with-logits loss for one batch.

        ``"labels"`` is removed from ``inputs`` before the forward pass and
        cast to float for the loss. Extra keyword arguments are accepted and
        ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
        loss = criterion(outputs.logits, targets.float())
        if return_outputs:
            return loss, outputs
        return loss

# roberta-base with a classification head that has one output per binary attribute
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(binary_columns),
)

# Training configuration
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # schedule and optimisation
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # run an evaluation pass after every epoch
    eval_strategy="epoch",
)

# Wire model, arguments and datasets into the multi-label trainer.
# NOTE(review): the test split is also used as the per-epoch evaluation set.
trainer = MultiLabelTrainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune RoBERTa
trainer.train()

# Score the test split: logits -> sigmoid -> 0/1 labels at a 0.5 cut-off
roberta_predictions = trainer.predict(test_dataset)
roberta_logits = torch.tensor(roberta_predictions.predictions)
roberta_preds = (torch.sigmoid(roberta_logits) > 0.5).int()

# Per-attribute metrics for the RoBERTa predictions
results_roberta = []
for i, col in enumerate(binary_columns):
    y_true_col, y_pred_col = y_test[col], roberta_preds[:, i]
    accuracy = accuracy_score(y_true_col, y_pred_col)
    f1 = f1_score(y_true_col, y_pred_col, average='binary')
    recall = recall_score(y_true_col, y_pred_col, average='binary')
    precision = precision_score(y_true_col, y_pred_col, average='binary')
    results_roberta.append([col, accuracy, f1, recall, precision])

# Tabulate the metrics
results_roberta_df = pd.DataFrame(
    results_roberta,
    columns=["Attribute", "Accuracy", "F1", "Recall", "Precision"],
)

# Column means over all attributes, taken before the summary row is appended
avg_accuracy_roberta, avg_f1_roberta, avg_recall_roberta, avg_precision_roberta = (
    results_roberta_df[metric].mean()
    for metric in ("Accuracy", "F1", "Recall", "Precision")
)

# Append the summary row under the index label "Average"
results_roberta_df.loc["Average"] = [
    "Average",
    avg_accuracy_roberta,
    avg_f1_roberta,
    avg_recall_roberta,
    avg_precision_roberta,
]

# Show the table
print("RoBERTa Results:")
print(results_roberta_df)

# Multi-label classification report, returned as a nested dict
classification_report_result = classification_report(
    y_test,
    roberta_preds,
    target_names=binary_columns,
    output_dict=True,
)

# Pull out the four aggregate rows of the report
micro_avg, macro_avg, weighted_avg, samples_avg = (
    classification_report_result[avg_key]
    for avg_key in ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
)

# Print each aggregate under its own heading
for heading, stats in (
    ("\nMicro Average:", micro_avg),
    ("\nMacro Average:", macro_avg),
    ("\nWeighted Average:", weighted_avg),
    ("\nSamples Average:", samples_avg),
):
    print(heading)
    print(f"Precision: {stats['precision']:.2f}, Recall: {stats['recall']:.2f}, F1: {stats['f1-score']:.2f}")

# Stop the CodeCarbon tracker and get the emissions
emissions = tracker.stop()
if emissions is None:
    # No measurement was returned; default to 0.0 so the ":.4f" format below
    # cannot raise a TypeError after the whole training run has finished.
    emissions = 0.0
print(f"\nTotal Carbon Emissions: {emissions:.4f} kg CO2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4263,0.394789
2,0.3362,0.316574
3,0.2881,0.279294
4,0.237,0.271155
5,0.2256,0.259835
6,0.1986,0.253134
7,0.1791,0.246173
8,0.1625,0.243497
9,0.1488,0.242966
10,0.1412,0.242257


RoBERTa Results:
                   Attribute  Accuracy        F1    Recall  Precision
0          dark_pigmentation  0.959135  0.832787  0.783951   0.888112
1                       acne  0.981571  0.909804  0.892308   0.928000
2                eye_contour  0.983974  0.928058  0.889655   0.969925
3                homogeneity  0.947917  0.798762  0.821656   0.777108
4              lack_firmness  0.927083  0.787879  0.757848   0.820388
5              lack_radiance  0.915064  0.875000  0.868852   0.881235
6                      pores  0.948718  0.883212  0.883212   0.883212
7                 fine_lines  0.931891  0.891443  0.906494   0.876884
8        wrinkles_fine-lines  0.936699  0.910734  0.897550   0.924312
9               eye-wrinkles  0.980769  0.902439  0.895161   0.909836
10             undereye-bags  0.977564  0.882353  0.937500   0.833333
11                   generic  0.794872  0.835476  0.883152   0.792683
12                     18-34  0.827724  0.852639  0.866295   0.839406
13 