In [2]:
import json
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Make the GPU at index 1 the current CUDA device for this cell, and keep a
# matching torch.device handle; main() passes `device` to SentenceTransformer.
torch.cuda.set_device(1)
device = torch.device("cuda:1")

def load_json_file(file_path):
    """Load a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        decoded, or parsed. A diagnostic line is printed in that case.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the locale's default
        # encoding, which differs between machines.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError:
        print(f"Corrupted JSON file: {file_path}")
        return None
    except Exception as e:
        # Best-effort loader: any other failure (missing file, permissions,
        # undecodable bytes) is reported and signalled with None.
        print(f"Error loading {file_path}: {str(e)}")
        return None

def calculate_semantic_similarity(text1, text2, model):
    """Calculate the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.
        model: Object whose ``encode`` method accepts a list of two texts and
            returns one embedding per text (main() passes a
            SentenceTransformer).

    Returns:
        The similarity as a native Python float, or ``None`` if encoding or
        the similarity computation raised. Diagnostics are printed on failure.
    """
    try:
        # Encode both texts in a single call
        embeddings = model.encode([text1, text2])
        # Calculate cosine similarity and convert to native Python float
        similarity = float(cosine_similarity([embeddings[0]], [embeddings[1]])[0][0])
        return similarity
    except Exception as e:
        print(f"Error calculating similarity: {str(e)}")
        # Convert to str before slicing: the inputs that trigger this branch
        # are often not strings (None, numbers), and slicing them directly
        # would raise inside the handler and escape this function.
        print(f"Text1: {str(text1)[:100]}...")
        print(f"Text2: {str(text2)[:100]}...")
        return None

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float``, NumPy
        booleans become ``bool`` and arrays become nested lists. Any other
        type is delegated to the base class, which raises ``TypeError``.
        """
        # The abstract bases np.integer / np.floating cover every width and
        # avoid the np.float_ alias, which no longer exists in NumPy 2.x.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def main():
    """Score generated responses against ground-truth answers with SBERT.

    For every ``*_variants_results.json`` file in the results directory, the
    matching ground-truth file is loaded, each entry of
    ``trace.responses[0]`` is compared with the ground-truth ``answer``, and
    the per-file average similarity is recorded. Per-file rows are written to
    ``semantic_similarity_results.csv`` and the detailed results plus overall
    statistics to ``semantic_similarity_detailed.json`` in the working
    directory.

    Returns:
        ``(df, overall_stats)`` where ``df`` is a DataFrame with one row per
        processed file and ``overall_stats`` is a dict of aggregate floats,
        or ``(None, None)`` when no file produced a result.
    """
    # Initialize SBERT model
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Define paths
    gt_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/paraphrase_error_iuxray_variant"
    results_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/paraphrase_error_posix_result_llava_1.6"

    # Store all similarities
    all_similarities = []

    results_suffix = '_results.json'

    # os.listdir returns entries in arbitrary order; sort them so the rows of
    # the CSV/JSON outputs come out in the same order on every run.
    result_files = sorted(
        f for f in os.listdir(results_dir) if f.endswith('_variants_results.json')
    )
    print(f"Found {len(result_files)} result files")

    # Process each file
    for result_file in tqdm(result_files, desc="Processing files"):
        try:
            # Derive the GT file name by swapping only the trailing suffix;
            # str.replace would also rewrite an occurrence in the middle of
            # the name.
            gt_file = result_file[:-len(results_suffix)] + '.json'

            # Construct full paths
            gt_path = os.path.join(gt_dir, gt_file)
            result_path = os.path.join(results_dir, result_file)

            # Check if both files exist
            if not (os.path.exists(gt_path) and os.path.exists(result_path)):
                print(f"Missing file pair for {result_file}")
                continue

            # Load both files
            gt_data = load_json_file(gt_path)
            result_data = load_json_file(result_path)

            # Skip if either file is corrupted
            if gt_data is None or result_data is None:
                print(f"Skipping {result_file} due to corrupted data")
                continue

            # Get ground truth answer
            if 'answer' not in gt_data:
                print(f"No 'answer' field in {gt_file}")
                continue

            gt_answer = gt_data['answer']

            # Get generated responses
            if 'trace' not in result_data or 'responses' not in result_data['trace']:
                print(f"Missing response data in {result_file}")
                continue

            responses = result_data['trace']['responses'][0]

            # Keep only the responses for which a similarity could be computed
            similarities = []
            for response in responses:
                similarity = calculate_semantic_similarity(gt_answer, response, model)
                if similarity is not None:
                    similarities.append(similarity)

            # Skip if no valid similarities were calculated
            if not similarities:
                print(f"No valid similarities calculated for {result_file}")
                continue

            # Calculate average similarity for this question
            avg_similarity = float(np.mean(similarities))

            # Store results
            file_results = {
                'file_name': gt_file,
                'ground_truth': gt_answer,
                'average_similarity': avg_similarity,
                'individual_similarities': [float(s) for s in similarities],
                'responses': responses
            }

            all_similarities.append(file_results)

        except Exception as e:
            # One bad file must not abort the whole run; report and move on.
            print(f"Error processing {result_file}: {str(e)}")

    # Check if we have any results
    if not all_similarities:
        print("No valid results were collected!")
        return None, None

    print(f"Successfully processed {len(all_similarities)} files")

    # Convert to DataFrame for analysis
    df = pd.DataFrame(all_similarities)

    # Calculate overall statistics over the per-file averages
    overall_stats = {
        'mean_similarity': float(df['average_similarity'].mean()),
        'std_similarity': float(df['average_similarity'].std()),
        'median_similarity': float(df['average_similarity'].median()),
        'min_similarity': float(df['average_similarity'].min()),
        'max_similarity': float(df['average_similarity'].max())
    }

    # Save results
    df.to_csv('semantic_similarity_results.csv', index=False)

    # Save detailed results as JSON
    with open('semantic_similarity_detailed.json', 'w') as f:
        json.dump({
            'file_results': all_similarities,
            'overall_statistics': overall_stats
        }, f, indent=4, cls=NumpyEncoder)

    return df, overall_stats

if __name__ == "__main__":
    # Run the evaluation, then either report the aggregate numbers or note
    # that nothing could be computed.
    df, stats = main()
    if stats is None:
        print("Script completed with errors - no statistics available")
    else:
        print("\nOverall Statistics:")
        for key, value in stats.items():
            print(f"{key}: {value:.4f}")

Found 0 result files


Processing files: 0it [00:00, ?it/s]

No valid results were collected!
Script completed with errors - no statistics available





In [3]:
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
# Make the GPU at index 1 the current CUDA device for this cell, and keep a
# matching torch.device handle; analyze_semantic_similarity() passes `device`
# to SentenceTransformer.
torch.cuda.set_device(1)
device = torch.device("cuda:1")
def load_and_match_files(ground_truth_dir, results_dir):
    """
    Load and match corresponding ground truth and results files.

    Every ``*_variants.json`` file in ``ground_truth_dir`` is paired with the
    file of the same stem plus ``_results_results.json`` in ``results_dir``;
    ground-truth files without such a counterpart are left out.

    Args:
        ground_truth_dir: Directory containing the ground-truth JSON files.
        results_dir: Directory containing the result JSON files.

    Returns:
        List of ``(ground_truth_path, results_path)`` tuples, ordered by
        ground-truth file name.
    """
    gt_suffix = '.json'
    pairs = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # processing order (and every output derived from it) reproducible.
    for gt_file in sorted(os.listdir(ground_truth_dir)):
        if not gt_file.endswith('_variants.json'):
            continue

        # Swap only the trailing extension; str.replace would also rewrite a
        # '.json' that appears in the middle of the file name.
        results_file = gt_file[:-len(gt_suffix)] + '_results_results.json'
        results_path = os.path.join(results_dir, results_file)
        if os.path.exists(results_path):
            pairs.append((os.path.join(ground_truth_dir, gt_file), results_path))

    return pairs

def calculate_semantic_similarity(model, text1, text2):
    """
    Calculate semantic similarity between two texts using sentence transformers.

    Args:
        model: Object whose ``encode(text, convert_to_tensor=True)`` returns a
            tensor embedding (the caller passes a SentenceTransformer).
        text1: First text.
        text2: Second text.

    Returns:
        Cosine similarity of the two embeddings as a Python float. When either
        embedding has zero norm the similarity is undefined and 0.0 is
        returned instead of NaN.
    """
    # Encode each text, then bring the tensor to the CPU so it can be turned
    # into a NumPy array.
    embedding1 = model.encode(text1, convert_to_tensor=True).cpu().numpy()
    embedding2 = model.encode(text2, convert_to_tensor=True).cpu().numpy()

    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if norm_product == 0:
        # Dividing by a zero norm would yield NaN, which then contaminates
        # the mean/min/max computed by the caller.
        return 0.0

    # Calculate cosine similarity
    return float(np.dot(embedding1, embedding2) / norm_product)

def analyze_semantic_similarity():
    """Compare model responses with ground-truth answers and summarize.

    For each matched (ground truth, results) file pair, every non-empty entry
    of ``trace.responses[0]`` is compared with the ground-truth ``answer``.
    Per-file statistics are written to ``semantic_similarity_results.csv``,
    failed files to ``error_log.json`` and aggregate statistics to
    ``semantic_similarity_stats.json`` in the working directory.

    Returns:
        ``(df, overall_stats, error_files)``: the per-file DataFrame, a dict
        of aggregate statistics (or an ``'error'`` entry when no file produced
        a result), and a list of ``(file_name, error_message)`` tuples.
    """
    # Paths
    ground_truth_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/paraphrase_error_iuxray_variant"
    results_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/paraphrase_error_posix_result_qwenvl"

    # Load model. The device is given to the constructor, so no additional
    # .to('cuda:1') move is needed afterwards.
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Get matched file pairs
    file_pairs = load_and_match_files(ground_truth_dir, results_dir)

    results = []
    error_files = []

    # Process each pair of files
    for gt_file, results_file in tqdm(file_pairs, desc="Processing files"):
        try:
            # Read as UTF-8 explicitly rather than with the locale default.
            with open(gt_file, 'r', encoding='utf-8') as f:
                gt_data = json.load(f)
            with open(results_file, 'r', encoding='utf-8') as f:
                results_data = json.load(f)

            # Get ground truth answer and clean it
            gt_answer = gt_data['answer'].strip()

            # Get model responses and clean them (empty entries are dropped)
            model_responses = [resp.strip() for resp in results_data['trace']['responses'][0] if resp]

            # Calculate similarities; a failing response is reported and
            # skipped without discarding the rest of the file.
            similarities = []
            for response in model_responses:
                try:
                    sim = calculate_semantic_similarity(model, gt_answer, response)
                    similarities.append(sim)
                except Exception as e:
                    print(f"Error calculating similarity for response in {gt_file}: {str(e)}")

            if similarities:  # Only add results if we got some valid similarities
                results.append({
                    'file_name': os.path.basename(gt_file),
                    'semantic_similarity': np.mean(similarities),
                    'min_similarity': np.min(similarities),
                    'max_similarity': np.max(similarities),
                    'std_similarity': np.std(similarities),
                    'num_responses': len(similarities)
                })

        except Exception as e:
            # File-level failures (unreadable JSON, missing keys, ...) are
            # collected for error_log.json instead of stopping the run.
            error_files.append((os.path.basename(gt_file), str(e)))

    # Create DataFrame
    df = pd.DataFrame(results)

    # Calculate overall statistics over the per-file mean similarities
    if not df.empty:
        overall_stats = {
            'mean_similarity': df['semantic_similarity'].mean(),
            'std_similarity': df['semantic_similarity'].std(),
            'min_similarity': df['semantic_similarity'].min(),
            'max_similarity': df['semantic_similarity'].max(),
            'total_files_processed': len(df),
            'total_files_failed': len(error_files)
        }
    else:
        overall_stats = {
            'error': 'No valid results generated',
            'total_files_failed': len(error_files)
        }

    # Save results
    df.to_csv('semantic_similarity_results.csv', index=False)

    # Save error log
    with open('error_log.json', 'w', encoding='utf-8') as f:
        json.dump({'failed_files': error_files}, f, indent=4)

    # Save overall stats
    with open('semantic_similarity_stats.json', 'w', encoding='utf-8') as f:
        json.dump(overall_stats, f, indent=4)

    return df, overall_stats, error_files

if __name__ == "__main__":
    # Run the analysis and print the aggregate numbers; floats are shown with
    # four decimals, everything else as-is.
    df, stats, errors = analyze_semantic_similarity()
    print("\nOverall Statistics:")
    for key, value in stats.items():
        if isinstance(value, float):
            print(f"{key}: {value:.4f}")
        else:
            print(f"{key}: {value}")

    print(f"\nNumber of failed files: {len(errors)}")
    print("See error_log.json for details of failed files")

Processing files: 100%|██████████| 400/400 [00:36<00:00, 10.94it/s]


Overall Statistics:
mean_similarity: 0.6348
std_similarity: 0.2136
min_similarity: -0.0340
max_similarity: 1.0000
total_files_processed: 399
total_files_failed: 1

Number of failed files: 1
See error_log.json for details of failed files



