# Ensemble Classifier

## 1. Setup and Imports


In [1]:
# Standard library
import json
import os
import sys

# Standard imports
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from tqdm import tqdm

# Model imports
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from torchvision import models, transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import joblib

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Local utilities
sys.path.append('..')
from utils.text_utils import text_pre_processing

## 2. Configuration

In [2]:
# Locations of the trained artefacts, relative to the notebook directory
MODEL_PATHS = dict(
    svm='../models/svc_classifier.pkl',
    tfidf='../models/tfidfvectorizer_vectorizer.pkl',
    vgg16='../models/vgg16_transfer_model.pth',
    bert='../models/bert',
)

# Locations of the input data
DATA_PATHS = dict(
    localization='../data/language_analysis/df_localization.csv',
    images='../data/processed/images/image_train_vgg16/val',
)

# Run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device: " + str(device))

Using device: cpu


## 3. Data Preparation

In [3]:
# Load the localization table. The path comes from DATA_PATHS so the
# configuration cell stays the single place where locations are declared.
df = pd.read_csv(DATA_PATHS['localization'])

# Use the DeepL translation when present, otherwise fall back to the merged text
df["text"] = np.where(df["deepL_translation"].notna(),
                      df["deepL_translation"],
                      df["merged_text"])

# Encode product type codes as contiguous integer labels; `le` is reused later
# to translate between original codes and encoded indices.
le = LabelEncoder()
df['prdtypecode_encoded'] = le.fit_transform(df['prdtypecode'])

# Stratified 80/20 split with a fixed seed; only the validation part is kept
_, df_val = train_test_split(df, random_state=42,
                             stratify=df['prdtypecode_encoded'],
                             test_size=0.2)
# Detach the validation frame from `df` so that columns added to it later do
# not write into (or warn about) a slice of the full table.
df_val = df_val.copy()

# Build image mapping (the key function we discovered)
def build_image_path_mapping(base_dir):
    """Index the ``.jpg`` files stored one level below ``base_dir``.

    Every sub-directory of ``base_dir`` (except an entry named ``.DS_Store``)
    is scanned for file names ending in ``.jpg``.

    Args:
        base_dir: Directory whose immediate sub-directories hold the images.

    Returns:
        Dict mapping each image file name to its path. When the same file
        name occurs in several sub-directories, the one listed last wins.
    """
    candidate_dirs = (
        os.path.join(base_dir, entry)
        for entry in os.listdir(base_dir)
        if entry != '.DS_Store'
    )
    return {
        file_name: os.path.join(folder, file_name)
        for folder in candidate_dirs
        if os.path.isdir(folder)
        for file_name in os.listdir(folder)
        if file_name.endswith('.jpg')
    }

# Index the processed validation images once; the directory is taken from
# DATA_PATHS instead of repeating the literal path here.
val_image_mapping = build_image_path_mapping(DATA_PATHS['images'])

def get_image_path(image_name, mapping=None):
    """Return the path of the processed (``_cpr``) version of an image.

    Args:
        image_name: Raw image file name, e.g.
            ``image_<imageid>_product_<productid>.jpg``.
        mapping: Optional ``{file name: path}`` dict to search. When omitted,
            the module-level ``val_image_mapping`` is used.

    Returns:
        The path stored for ``<stem>_cpr.jpg``, or ``None`` when that file
        name is not present in the mapping.
    """
    if mapping is None:
        mapping = val_image_mapping
    # Insert the suffix before the trailing extension only; str.replace would
    # also rewrite any other ".jpg" occurring inside the name.
    stem, ext = os.path.splitext(image_name)
    processed_name = f"{stem}_cpr{ext}" if ext == '.jpg' else image_name
    return mapping.get(processed_name)

# Build the raw file name of every validation row in one vectorised step
# (image_<imageid>_product_<productid>.jpg) ...
raw_image_names = (
    "image_" + df_val['imageid'].astype(str)
    + "_product_" + df_val['productid'].astype(str) + ".jpg"
)
# ... and resolve it to the processed image path (None when no file exists).
# assign() returns a new frame, so the output of the train/validation split is
# never mutated in place and re-running this cell is idempotent.
df_val = df_val.assign(image_path=raw_image_names.map(get_image_path))

# Keep only samples with available images
df_val_ready = df_val[df_val['image_path'].notna()].copy()

print(f"‚úÖ Final validation dataset: {len(df_val_ready)} samples ready for ensemble")

‚úÖ Final validation dataset: 3191 samples ready for ensemble


## 4. Load SVM Model

In [4]:
# Load the SVM classifier and its TF-IDF vectorizer from the paths declared in
# MODEL_PATHS.
# NOTE: joblib.load unpickles arbitrary objects - only load artefacts that come
# from a trusted source.
print("Loading SVM model...")
try:
    svm_model = joblib.load(MODEL_PATHS['svm'])
    tfidf_vectorizer = joblib.load(MODEL_PATHS['tfidf'])
    print("‚úÖ SVM and TF-IDF loaded successfully")

    # Smoke test on the first validation sample. `svm_pred` and `svm_prob` are
    # reused by the label-encoding consistency cell further down.
    test_text = df_val_ready.iloc[0]['text']
    test_tfidf = tfidf_vectorizer.transform([test_text])
    svm_pred = svm_model.predict(test_tfidf)[0]
    svm_prob = svm_model.predict_proba(test_tfidf)[0]

    print(f"SVM test prediction: {svm_pred}")
    print(f"SVM confidence: {svm_prob.max():.3f}")
    print(f"SVM output shape: {svm_prob.shape}")

except Exception as e:
    print(f"‚ùå Error loading SVM: {e}")
    # Every later cell needs these objects; stopping here is clearer than a
    # NameError several cells further down.
    raise

Loading SVM model...
‚úÖ SVM and TF-IDF loaded successfully
SVM test prediction: 1301
SVM confidence: 0.976
SVM output shape: (27,)


## 5. Load BERT Model

In [5]:
# Make sure SentencePiece is importable (presumably required by the
# CamembertTokenizer loaded in the next cell - confirm). The install only runs
# when the package is missing, so "Run All" does not hit the network each time.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("sentencepiece") is not None:
    print("‚úÖ SentencePiece already available, skipping installation")
else:
    print("Installing SentencePiece...")
    try:
        # Use the kernel's own interpreter so pip targets the right environment
        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
        print("‚úÖ SentencePiece installed successfully")
        print("Note: You may need to restart your kernel after installation")
    except Exception as e:
        print(f"‚ùå Installation error: {e}")

Installing SentencePiece...
‚úÖ SentencePiece installed successfully
Note: You may need to restart your kernel after installation



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
# Load the fine-tuned CamemBERT classifier and its tokenizer from the directory
# declared in MODEL_PATHS, switch to inference mode and move it to `device`.
print("Loading BERT model...")
try:
    bert_model = CamembertForSequenceClassification.from_pretrained(MODEL_PATHS['bert'])
    bert_tokenizer = CamembertTokenizer.from_pretrained(MODEL_PATHS['bert'])
    bert_model.eval()
    bert_model.to(device)
    print("‚úÖ BERT loaded successfully")

    # Smoke test on the first validation sample. `bert_pred` and `bert_prob`
    # are reused by the label-encoding consistency cell further down.
    test_text = df_val_ready.iloc[0]['text']
    text_inputs = bert_tokenizer(
        test_text, return_tensors='pt', padding='max_length',
        truncation=True, max_length=256
    )

    with torch.no_grad():
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        bert_outputs = bert_model(**text_inputs)
        # Softmax over the class dimension turns the logits into probabilities
        bert_prob = F.softmax(bert_outputs.logits, dim=1).cpu().numpy()[0]

    bert_pred = np.argmax(bert_prob)
    print(f"BERT test prediction: {bert_pred}")
    print(f"BERT confidence: {bert_prob.max():.3f}")
    print(f"BERT output shape: {bert_prob.shape}")

except Exception as e:
    print(f"‚ùå Error loading BERT: {e}")
    # Every later cell needs these objects; stopping here is clearer than a
    # NameError several cells further down.
    raise

Loading BERT model...
‚úÖ BERT loaded successfully
BERT test prediction: 10
BERT confidence: 0.991
BERT output shape: (27,)


## 6. Load VGG16 Model

In [8]:
# PIL is needed to read the image files from disk
from PIL import Image

# NOTE(review): `image_transform` and `vgg16_model` are used below but are not
# defined in any cell visible in this notebook (the execution counts jump from
# In [6] to In [8]). Presumably a removed cell loaded the model and built the
# preprocessing pipeline - confirm and restore it, otherwise this cell fails
# after "Restart Kernel -> Run All".

# Smoke test: run VGG16 on the image of the first validation sample.
# `vgg16_pred` and `vgg16_prob` are reused by the label-encoding check below.
print("Testing VGG16 with correct imports...")
try:
    test_image_path = df_val_ready.iloc[0]['image_path']
    test_image = Image.open(test_image_path).convert("RGB")
    # unsqueeze(0) adds a leading dimension before the tensor is sent to `device`
    image_tensor = image_transform(test_image).unsqueeze(0).to(device)
    
    with torch.no_grad():
        vgg16_logits = vgg16_model(image_tensor)
        # Softmax over dim=1 turns the logits into probabilities; [0] picks the single sample
        vgg16_prob = F.softmax(vgg16_logits, dim=1).cpu().numpy()[0]
    
    vgg16_pred = np.argmax(vgg16_prob)
    print(f"VGG16 test prediction: {vgg16_pred}")
    print(f"VGG16 confidence: {vgg16_prob.max():.3f}")
    print(f"VGG16 output shape: {vgg16_prob.shape}")
    print("‚úÖ VGG16 working correctly!")
    
except Exception as e:
    # The error is only printed; the cell itself does not fail
    print(f"‚ùå Error: {e}")

Testing VGG16 with correct imports...
VGG16 test prediction: 4
VGG16 confidence: 0.113
VGG16 output shape: (27,)
‚úÖ VGG16 working correctly!


## 7. Test ensemble + label encoding fix 

In [11]:
# Bring the SVM output into the encoded label space used by BERT and VGG16,
# then combine the three smoke-test predictions with a few weightings.
print("=== FIXING LABEL ENCODING CONSISTENCY ===")


def _mark(is_correct):
    """Return the check / cross marker matching a boolean outcome."""
    return "‚úÖ" if is_correct else "‚ùå"


# Ground truth of the sample used by the three smoke tests (row 0 of
# df_val_ready). Defined here so the cell does not depend on a missing cell.
true_encoded = df_val_ready.iloc[0]['prdtypecode_encoded']

# Weightings tried on this single sample.
# NOTE(review): the cell that originally defined `weight_configs` is not part
# of the notebook any more; these three reproduce the recorded confidences
# (0.658 / 0.792 / 0.794) - confirm they match the intended list.
weight_configs = [
    {'svm': 0.33, 'bert': 0.33, 'vgg16': 0.34},
    {'svm': 0.4, 'bert': 0.4, 'vgg16': 0.2},
    {'svm': 0.3, 'bert': 0.5, 'vgg16': 0.2},
]

# Convert SVM prediction from original to encoded
svm_pred_encoded = le.transform([svm_pred])[0]
print(f"SVM prediction conversion:")
print(f"  Original: {svm_pred}")
print(f"  Encoded: {svm_pred_encoded}")

# Now all predictions are in encoded format
print(f"\nAll predictions in encoded format:")
print(f"SVM:   {svm_pred_encoded}")
print(f"BERT:  {bert_pred}")
print(f"VGG16: {vgg16_pred}")
print(f"True:  {true_encoded}")

# Check which models got it right (in encoded format); the marker is derived
# from the comparison instead of being typed into the message.
svm_ok = svm_pred_encoded == true_encoded
bert_ok = bert_pred == true_encoded
vgg16_ok = vgg16_pred == true_encoded
print(f"\nCorrectness check:")
print(f"SVM correct:   {svm_ok} {_mark(svm_ok)}")
print(f"BERT correct:  {bert_ok} {_mark(bert_ok)}")
print(f"VGG16 correct: {vgg16_ok} {_mark(vgg16_ok)}")

# Re-test ensemble with corrected predictions
print(f"\n=== CORRECTED ENSEMBLE TEST ===")

# predict_proba columns follow svm_model.classes_ (original product codes).
# Place each column at the index the label encoder assigns to that code, so
# the vector lines up with the BERT and VGG16 outputs.
svm_prob_encoded = np.zeros(len(le.classes_))
svm_prob_encoded[le.transform(svm_model.classes_)] = svm_prob

print(f"SVM probability conversion successful")
print(f"SVM max prob before: {svm_prob.max():.3f}")
print(f"SVM max prob after: {svm_prob_encoded.max():.3f}")

# Now test ensemble with consistent encoding
for i, weights in enumerate(weight_configs):
    ensemble_prob = (weights['svm'] * svm_prob_encoded +
                     weights['bert'] * bert_prob +
                     weights['vgg16'] * vgg16_prob)

    ensemble_pred = np.argmax(ensemble_prob)
    ensemble_conf = ensemble_prob.max()

    correct = _mark(ensemble_pred == true_encoded)
    print(f"Config {i+1}: Class {ensemble_pred} (confidence: {ensemble_conf:.3f}) {correct}")

=== FIXING LABEL ENCODING CONSISTENCY ===
SVM prediction conversion:
  Original: 1301
  Encoded: 10

All predictions in encoded format:
SVM:   10
BERT:  10
VGG16: 4
True:  10

Correctness check:
SVM correct:   True ‚úÖ
BERT correct:  True ‚úÖ
VGG16 correct: False ‚ùå

=== CORRECTED ENSEMBLE TEST ===
SVM probability conversion successful
SVM max prob before: 0.976
SVM max prob after: 0.976
Config 1: Class 10 (confidence: 0.658) ‚úÖ
Config 2: Class 10 (confidence: 0.792) ‚úÖ
Config 3: Class 10 (confidence: 0.794) ‚úÖ


## 8. Full Ensemble Evaluation

### 8.1 Sample Evaluation to find best weights

In [None]:
# Weight search for the ensemble. Only a random sample of the validation set
# is scored in this cell, so the banner reports how many samples are available
# rather than claiming that all of them are evaluated.
print("=== FULL ENSEMBLE EVALUATION ===")
print(f"Validation samples available: {len(df_val_ready)}")

# Function to convert SVM probabilities to encoded format
def convert_svm_probabilities(svm_probs, label_encoder, model_classes=None):
    """Reorder SVM probability columns into the label encoder's index order.

    Args:
        svm_probs: 2-D array of shape ``(n_samples, n_model_classes)``, one
            column per class known to the classifier.
        label_encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        model_classes: Original class label of each column of ``svm_probs``
            (typically the classifier's ``classes_``). Defaults to
            ``label_encoder.classes_``, i.e. the columns are assumed to
            already follow the encoder's order.

    Returns:
        Array of shape ``(n_samples, len(label_encoder.classes_))`` in which
        column ``k`` holds the probability of the class encoded as ``k``.
        Classes the model does not know keep probability 0.

    Raises:
        ValueError: If ``svm_probs`` is not 2-D or its column count does not
            match ``model_classes``.
    """
    svm_probs = np.asarray(svm_probs)
    if svm_probs.ndim != 2:
        raise ValueError(
            f"svm_probs must be 2-D (n_samples, n_classes), got shape {svm_probs.shape}"
        )
    if model_classes is None:
        model_classes = label_encoder.classes_

    # One vectorised transform instead of one encoder call per class
    encoded_idx = np.asarray(label_encoder.transform(model_classes))
    if svm_probs.shape[1] != encoded_idx.shape[0]:
        raise ValueError(
            f"svm_probs has {svm_probs.shape[1]} columns but "
            f"{encoded_idx.shape[0]} class labels were given"
        )

    converted_probs = np.zeros(
        (svm_probs.shape[0], len(label_encoder.classes_)), dtype=svm_probs.dtype
    )
    converted_probs[:, encoded_idx] = svm_probs
    return converted_probs

# Ensemble prediction function
def _svm_probabilities(text):
    """Class probabilities of the TF-IDF + SVM model for one text."""
    text_tfidf = tfidf_vectorizer.transform([text])
    svm_probs_orig = svm_model.predict_proba(text_tfidf)
    # Reordered into encoded-label order by convert_svm_probabilities
    return convert_svm_probabilities(svm_probs_orig, le)[0]


def _bert_probabilities(text):
    """Softmax class probabilities of the BERT classifier for one text."""
    text_inputs = bert_tokenizer(text, return_tensors='pt', padding='max_length',
                                 truncation=True, max_length=256)
    with torch.no_grad():
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        bert_outputs = bert_model(**text_inputs)
        return F.softmax(bert_outputs.logits, dim=1).cpu().numpy()[0]


def _vgg16_probabilities(image_path):
    """Softmax class probabilities of the VGG16 classifier for one image file."""
    # The context manager closes the file handle as soon as the pixels are
    # converted; convert() returns an image that no longer needs the file.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    image_tensor = image_transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        vgg16_logits = vgg16_model(image_tensor)
        return F.softmax(vgg16_logits, dim=1).cpu().numpy()[0]


def predict_ensemble_sample(text, image_path, weights):
    """Predict the encoded class of one sample with the weighted ensemble.

    Args:
        text: Product text fed to the SVM and BERT models.
        image_path: Path of the product image fed to VGG16.
        weights: Dict with the keys ``'svm'``, ``'bert'`` and ``'vgg16'``
            giving the weight of each model's probability vector.

    Returns:
        Index of the highest weighted-sum probability (``np.argmax``).
    """
    svm_probs = _svm_probabilities(text)
    bert_probs = _bert_probabilities(text)
    vgg16_probs = _vgg16_probabilities(image_path)

    # Weighted sum of the three probability vectors
    ensemble_probs = (weights['svm'] * svm_probs +
                      weights['bert'] * bert_probs +
                      weights['vgg16'] * vgg16_probs)

    return np.argmax(ensemble_probs)

# Test different weight configurations
weight_configurations = [
    {'svm': 0.33, 'bert': 0.33, 'vgg16': 0.34},  # Equal weights
    {'svm': 0.4, 'bert': 0.4, 'vgg16': 0.2},     # Text-heavy
    {'svm': 0.3, 'bert': 0.5, 'vgg16': 0.2},     # BERT-heavy
    {'svm': 0.5, 'bert': 0.3, 'vgg16': 0.2},     # SVM-heavy
]

# Reference scores of the individual models. These are fixed numbers quoted in
# this notebook; they are not recomputed here.
INDIVIDUAL_BASELINES = {'SVM': 0.763, 'BERT': 0.863, 'VGG16': 0.518}

# Evaluate each configuration on a small random sample for a quick test.
# Capped at the number of available rows so sample() cannot fail on a small set.
sample_size = min(100, len(df_val_ready))  # raise or remove the cap for a full evaluation
test_samples = df_val_ready.sample(n=sample_size, random_state=42)

print(f"Testing on {len(test_samples)} samples...")

best_f1 = 0
best_config = None
results = {}

for i, weights in enumerate(weight_configurations):
    print(f"\n--- Configuration {i+1}: {weights} ---")

    predictions = []
    true_labels = []

    for idx, row in tqdm(test_samples.iterrows(), total=len(test_samples), desc=f"Config {i+1}"):
        try:
            pred = predict_ensemble_sample(row['text'], row['image_path'], weights)
            predictions.append(pred)
            true_labels.append(row['prdtypecode_encoded'])
        except Exception as e:
            # Best effort: report the failing sample and score the remaining ones
            print(f"Error on sample {idx}: {e}")
            continue

    if not predictions:
        # Nothing to score (every sample failed or the sample is empty)
        print(f"No successful predictions for configuration {i+1}, skipping")
        continue

    # Calculate F1 score
    f1_weighted = f1_score(true_labels, predictions, average='weighted')
    f1_macro = f1_score(true_labels, predictions, average='macro')

    config_name = f"svm_{weights['svm']}_bert_{weights['bert']}_vgg16_{weights['vgg16']}"
    results[config_name] = {
        'f1_weighted': f1_weighted,
        'f1_macro': f1_macro,
        'weights': weights,
        'samples_evaluated': len(predictions)
    }

    print(f"F1 Weighted: {f1_weighted:.4f}")
    print(f"F1 Macro: {f1_macro:.4f}")

    # Keep the configuration with the highest weighted F1
    if f1_weighted > best_f1:
        best_f1 = f1_weighted
        best_config = weights

best_individual = max(INDIVIDUAL_BASELINES.values())
baseline_summary = ", ".join(f"{name}={score}" for name, score in INDIVIDUAL_BASELINES.items())

print(f"\n=== BEST ENSEMBLE RESULTS ===")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Configuration: {best_config}")
print(f"Individual baselines: {baseline_summary}")
print(f"Ensemble improvement: {best_f1 - best_individual:+.4f} vs best individual")

=== FULL ENSEMBLE EVALUATION ===
Evaluating on 3191 validation samples...
Testing on 100 samples...

--- Configuration 1: {'svm': 0.33, 'bert': 0.33, 'vgg16': 0.34} ---


Config 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:22<00:00,  1.22it/s]


F1 Weighted: 0.8367
F1 Macro: 0.7687

--- Configuration 2: {'svm': 0.4, 'bert': 0.4, 'vgg16': 0.2} ---


Config 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:36<00:00,  1.03it/s]


F1 Weighted: 0.8533
F1 Macro: 0.7861

--- Configuration 3: {'svm': 0.3, 'bert': 0.5, 'vgg16': 0.2} ---


Config 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:19<00:00,  1.27it/s]


F1 Weighted: 0.8522
F1 Macro: 0.7858

--- Configuration 4: {'svm': 0.5, 'bert': 0.3, 'vgg16': 0.2} ---


Config 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:18<00:00,  1.27it/s]

F1 Weighted: 0.8329
F1 Macro: 0.8067

=== BEST ENSEMBLE RESULTS ===
Best F1 Score: 0.8533
Best Configuration: {'svm': 0.4, 'bert': 0.4, 'vgg16': 0.2}
Individual baselines: SVM=0.763, BERT=0.863, VGG16=0.518
Ensemble improvement: -0.0097 vs best individual





### 8.2 Best Weighted Ensemble Evaluation on the Full Validation Set

In [15]:
# Score the selected weighting on every validation sample that has an image
import time

print("=== FULL VALIDATION TEST - BEST CONFIG ONLY ===")

best_weights = dict(svm=0.4, bert=0.4, vgg16=0.2)
print(f"Testing best config {best_weights} on all {len(df_val_ready)} samples...")

predictions, true_labels = [], []

# Wall-clock timing starts right before the prediction loop
start_time = time.time()
for idx, row in tqdm(df_val_ready.iterrows(), total=len(df_val_ready), desc="Full Validation"):
    try:
        pred = predict_ensemble_sample(row['text'], row['image_path'], best_weights)
        predictions.append(pred)
        true_labels.append(row['prdtypecode_encoded'])
    except Exception as e:
        # A failing sample is reported and left out of the metrics
        print(f"Error on sample {idx}: {e}")

# Weighted and macro F1 over the samples that were predicted successfully
f1_weighted_full, f1_macro_full = (
    f1_score(true_labels, predictions, average=averaging)
    for averaging in ('weighted', 'macro')
)

runtime = time.time() - start_time
print(
    "\n=== FULL VALIDATION RESULTS ===",
    f"Samples processed: {len(predictions)}/{len(df_val_ready)}",
    f"Runtime: {runtime/60:.1f} minutes",
    f"F1 Weighted (full): {f1_weighted_full:.4f}",
    f"F1 Macro (full): {f1_macro_full:.4f}",
    sep="\n",
)

# Baselines measured on the same split are listed separately from the BERT
# score, which is flagged as coming from a different split.
print(
    "\n=== BASELINE COMPARISON ===",
    "Models on same validation split:",
    "  SVM:      0.763 ‚úÖ",
    "  VGG16:    0.518 ‚úÖ",
    f"  Ensemble: {f1_weighted_full:.4f} ‚úÖ",
    sep="\n",
)

print(
    "\nModels on unknown validation split:",
    "  BERT:     0.863 ‚ö†Ô∏è (different split, possible leakage)",
    sep="\n",
)

print("\nValid conclusions:")
if f1_weighted_full > max(0.763, 0.518):
    print("‚úÖ Ensemble shows multimodal improvement over individual clean models")

print("‚ùì BERT comparison inconclusive due to validation split uncertainty")

=== FULL VALIDATION TEST - BEST CONFIG ONLY ===
Testing best config {'svm': 0.4, 'bert': 0.4, 'vgg16': 0.2} on all 3191 samples...


Full Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3191/3191 [1:01:49<00:00,  1.16s/it]  


=== FULL VALIDATION RESULTS ===
Samples processed: 3191/3191
Runtime: 61.8 minutes
F1 Weighted (full): 0.8727
F1 Macro (full): 0.8408

=== BASELINE COMPARISON ===
Models on same validation split:
  SVM:      0.763 ‚úÖ
  VGG16:    0.518 ‚úÖ
  Ensemble: 0.8727 ‚úÖ

Models on unknown validation split:
  BERT:     0.863 ‚ö†Ô∏è (different split, possible leakage)

Valid conclusions:
‚úÖ Ensemble shows multimodal improvement over individual clean models
‚ùì BERT comparison inconclusive due to validation split uncertainty





## 9. Save Results & Analysis

In [16]:
# Save final ensemble results.
# The metrics are read from the variables produced by the full-validation cell
# (f1_weighted_full, f1_macro_full, predictions, runtime, best_weights), so the
# saved file always reflects the latest run instead of numbers typed in by hand.
ensemble_f1 = round(float(f1_weighted_full), 4)
ensemble_f1_macro = round(float(f1_macro_full), 4)

# Reference scores quoted in this notebook; they are not recomputed here
svm_baseline_f1 = 0.763
vgg16_baseline_f1 = 0.518
bert_baseline_f1 = 0.863
official_text_benchmark = 0.8113

final_ensemble_results = {
    'ensemble_performance': {
        'f1_weighted': ensemble_f1,
        'f1_macro': ensemble_f1_macro,
        'samples_evaluated': len(predictions),
        'runtime_minutes': round(runtime / 60, 1)
    },
    'best_configuration': {
        'svm_weight': best_weights['svm'],
        'bert_weight': best_weights['bert'],
        'vgg16_weight': best_weights['vgg16']
    },
    'baseline_comparison': {
        'svm_f1': svm_baseline_f1,
        'vgg16_f1': vgg16_baseline_f1,
        'bert_f1_uncertain': bert_baseline_f1,
        'ensemble_improvement_vs_svm': ensemble_f1 - svm_baseline_f1,
        'ensemble_improvement_vs_vgg16': ensemble_f1 - vgg16_baseline_f1
    },
    'benchmark_comparison': {
        'official_text_benchmark': official_text_benchmark,
        'ensemble_vs_benchmark': ensemble_f1 - official_text_benchmark,
        'percentage_improvement': ((ensemble_f1 - official_text_benchmark) / official_text_benchmark) * 100
    }
}

# Save to results folder
os.makedirs('../results', exist_ok=True)
with open('../results/ensemble_final_results.json', 'w', encoding='utf-8') as f:
    json.dump(final_ensemble_results, f, indent=2)

print("‚úÖ Final ensemble results saved to ../results/ensemble_final_results.json")
print(f"üéØ ENSEMBLE SUCCESS: {final_ensemble_results['ensemble_performance']['f1_weighted']:.4f} F1 weighted")

# Only claim a win when the ensemble is actually above the benchmark
benchmark_delta = final_ensemble_results['benchmark_comparison']['ensemble_vs_benchmark']
if benchmark_delta > 0:
    print(f"üèÜ BEATS CHALLENGE BENCHMARK by {benchmark_delta:.4f} points!")
else:
    print(f"Ensemble is {abs(benchmark_delta):.4f} points below the challenge benchmark")

‚úÖ Final ensemble results saved to ../results/ensemble_final_results.json
üéØ ENSEMBLE SUCCESS: 0.8727 F1 weighted
üèÜ BEATS CHALLENGE BENCHMARK by 0.0614 points!
