In [9]:
!pip install bert-score rouge-score tqdm nltk
# Download NLTK data for BLEU
nltk.download('punkt')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings

import pandas as pd
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
from sklearn.metrics import accuracy_score
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from tqdm import tqdm
import nltk
import gc
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

# Suppress specific logging
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.ERROR)
logging.getLogger("nltk").setLevel(logging.ERROR)
logging.getLogger("sklearn").setLevel(logging.ERROR)

# Define paths
DATASET_PATH = "/kaggle/input/dataset/Complete_vqa(in).csv"
IMAGE_BASE_PATH = "/kaggle/input/imagedataset/small"
OUTPUT_LOG_5 = "/kaggle/working/llava_baseline_5.csv"
OUTPUT_LOG_10 = "/kaggle/working/llava_baseline_10.csv"

# Load dataset
print("Loading VQA dataset...")
vqa_df = pd.read_csv(DATASET_PATH)

# Initialize metrics tracking
metrics_columns = ['image_path', 'question', 'answer', 'prediction', 'correct', 
                   'meteor', 'f1', 'accuracy', 'bertscore_precision', 
                   'bertscore_recall', 'bertscore_f1', 'partscore', 'rouge_1_f1']

# Function to load image
def load_image(image_path):
    full_path = os.path.join(IMAGE_BASE_PATH, image_path)
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Image not found: {full_path}")
    return Image.open(full_path).convert("RGB")

# Function to predict answer with memory management
def predict_answer(image, question):
    formatted_question = f"<image> Question: {question} Answer:"
    
    # Move inputs to CUDA explicitly
    inputs = processor(text=formatted_question, images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=5)
    
    predicted_answer = processor.decode(output[0], skip_special_tokens=True).strip().lower()
    
    # Clear CUDA cache explicitly
    del inputs, output
    torch.cuda.empty_cache()
    
    words = predicted_answer.split()
    if words:
        last_word = words[-1]
        last_word = last_word.rstrip('.')  # Remove trailing punctuation
        return last_word
    return ""

def run_evaluation(sample_percentage, output_file):
    # Sample the dataset
    sample_size = int(len(vqa_df) * (sample_percentage / 100))
    print(f"Running evaluation on {sample_percentage}% of dataset ({sample_size} samples)...")
    
    sampled_df = vqa_df.sample(n=sample_size, random_state=42)
    questions = sampled_df['question'].tolist()
    original_answers = sampled_df['answer'].tolist()
    ground_truth_answers = [str(ans).lower() for ans in original_answers]
    image_paths = sampled_df['image_path'].tolist()
    
    # Initialize results dataframe
    results_df = pd.DataFrame(columns=metrics_columns)
    
    # Process in batches to manage memory
    batch_size = 1  # Process one image at a time
    
    for idx in tqdm(range(len(image_paths)), desc=f"Processing {sample_percentage}%"):
        try:
            image_path = image_paths[idx]
            question = questions[idx]
            gt_answer = ground_truth_answers[idx]
            
            # Load image and predict
            image = load_image(image_path)
            pred_answer = predict_answer(image, question)
            
            # Calculate individual metrics
            correct = pred_answer.lower() == gt_answer.lower()
            
            # METEOR score (placeholder)
            meteor = 0.5 if correct else 0
            
            # F1 score (placeholder)
            f1 = 1 if correct else 0
            
            # Accuracy (binary)
            acc = 1 if correct else 0
            
            # BERTScore for this sample
            p_bert, r_bert, f1_bert = bert_score([pred_answer], [gt_answer], lang="en", verbose=False)
            
            # ROUGE score for this sample
            scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
            rouge = scorer.score(gt_answer, pred_answer)['rouge1'].fmeasure
            
            # BLEU score (partscore in your example)
            try:
                bleu = sentence_bleu([gt_answer.split()], pred_answer.split())
            except:
                bleu = 0
                
            # Store results
            row = {
                'image_path': image_path,
                'question': question,
                'answer': gt_answer,
                'prediction': pred_answer,
                'correct': str(correct),  # Convert to string 'True'/'False'
                'meteor': meteor,
                'f1': f1,
                'accuracy': acc,
                'bertscore_precision': p_bert.item(),
                'bertscore_recall': r_bert.item(),
                'bertscore_f1': f1_bert.item(),
                'partscore': bleu,
                'rouge_1_f1': rouge
            }
            
            # Append to results dataframe
            results_df = pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)
            
            # Force garbage collection
            if idx % 5 == 0:
                gc.collect()
                torch.cuda.empty_cache()
                
        except Exception as e:
            print(f"Error at sample {idx+1}: {e}")
            
            # Add empty row on error
            row = {col: "" for col in metrics_columns}
            row.update({
                'image_path': image_paths[idx],
                'question': questions[idx],
                'answer': ground_truth_answers[idx],
                'prediction': "",
                'correct': "False"
            })
            results_df = pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)
            
            # Try to recover memory on error
            gc.collect()
            torch.cuda.empty_cache()
    
    # Calculate overall metrics for reporting
    overall_accuracy = accuracy_score(
        results_df['correct'].map({'True': True, 'False': False}), 
        [True] * len(results_df)
    )
    overall_bertscore_f1 = results_df['bertscore_f1'].mean()
    overall_rouge = results_df['rouge_1_f1'].mean()
    overall_bleu = results_df['partscore'].mean()
    
    # Save results to CSV
    results_df.to_csv(output_file, index=False)
    print(f"Results for {sample_percentage}% saved to {output_file}")
    
    # Summary
    print(f"\nEvaluation Summary for {sample_percentage}%:")
    print(f"- Accuracy: {overall_accuracy:.4f}")
    print(f"- BERTScore F1: {overall_bertscore_f1:.4f}")
    print(f"- ROUGE-1 F1: {overall_rouge:.4f}")
    print(f"- BLEU: {overall_bleu:.4f}")

# Initialize LLaVA model and processor
print("Loading LLaVA model and processor...")
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
model.eval()

# Run evaluations
try:
    # Run for 5% first
    run_evaluation(5, OUTPUT_LOG_5)
    
    # Clear memory before next run
    gc.collect()
    torch.cuda.empty_cache()
    
    # Run for 10%
    run_evaluation(10, OUTPUT_LOG_10)
    
except Exception as e:
    print(f"Fatal error: {e}")
finally:
    # Clean up
    del model, processor
    gc.collect()
    torch.cuda.empty_cache()
    print("Evaluation completed.")

Loading VQA dataset...
Loading LLaVA model and processor...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running evaluation on 5% of dataset (2274 samples)...


Processing 5%: 100%|██████████| 2274/2274 [1:08:56<00:00,  1.82s/it]


Results for 5% saved to /kaggle/working/llava_baseline_5.csv

Evaluation Summary for 5%:
- Accuracy: 0.5123
- BERTScore F1: 0.9826
- ROUGE-1 F1: 0.5263
- BLEU: 0.0000
Running evaluation on 10% of dataset (4548 samples)...


Processing 10%: 100%|██████████| 4548/4548 [2:17:02<00:00,  1.81s/it]  


Results for 10% saved to /kaggle/working/llava_baseline_10.csv

Evaluation Summary for 10%:
- Accuracy: 0.5070
- BERTScore F1: 0.9822
- ROUGE-1 F1: 0.5225
- BLEU: 0.0000
Evaluation completed.
