In [1]:
#To do: use accelerator to distribute model in multiple GPUs

from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
import pandas as pd
import os
from datetime import datetime
import json
from accelerate import Accelerator
from accelerate.utils import set_seed

accelerator = Accelerator()
set_seed(42)
device = accelerator.device
print(f"Using device: {device}")

Using device: cuda


In [2]:
# model_path = "./pretrained_llms/Llama-3.3-70B-Instruct"
# model_name = "Llama-3.1-8B-Instruct"
model_name = "Llama-3.3-70B-Instruct"

model_path = os.path.join("./pretrained_llms", model_name)
data_path = "./data"
data_name = "mteb/tweet_sentiment_extraction"
cache_dir = "./cache"
output_dir="./results"

dataset = load_dataset(data_name, cache_dir=data_path, split='train'[:200])
# dataset = load_dataset(data_name, cache_dir=data_path, split='train')
# dataset = load_dataset(data_name, cache_dir=data_path, split='train', remove_columns=["id"])

In [3]:
def analyze_sentiment_zero_shot(dataset, model, tokenizer, device, verbose=False, num_samples=None, 
                              checkpoint_interval=100, model_name="model", data_name="data", 
                              cache_dir="./cache"):
    """
    Analyze sentiment using zero-shot learning with checkpoint saving and resumption capabilities.
    
    Args:
        dataset: Dataset containing texts to analyze
        model: The language model
        tokenizer: The tokenizer
        device: Device to run the model on
        verbose: If True, prints each text and its prediction
        num_samples: Optional number of samples to process (None for all)
        checkpoint_interval: Number of items to process before saving checkpoint
        model_name: Name of the model for checkpoint filename
        data_name: Name of the dataset for checkpoint filename
        cache_dir: Directory to save checkpoints
    
    Returns:
        list: List of dictionaries containing text and predictions
    """
    model.eval()
    # model = model.to(device)
    
    # Create cache directory if it doesn't exist
    os.makedirs(cache_dir, exist_ok=True)
    
    # Generate checkpoint filename
    dataset_prefix = data_name.split('/')[-1][:5]
    checkpoint_filename = f"checkpoint_{model_name}_{dataset_prefix}.json"
    checkpoint_path = os.path.join(cache_dir, checkpoint_filename)
    
    # Initialize or load checkpoint
    start_idx = 0
    results = []
    
    if os.path.exists(checkpoint_path):
        try:
            with open(checkpoint_path, 'r') as f:
                checkpoint_data = json.load(f)
                results = checkpoint_data['results']
                start_idx = checkpoint_data['next_idx']
                print(f"Resuming from checkpoint at index {start_idx}")
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
            print("Starting from beginning...")
    
    prompt_template = """[INST] Analyze the sentiment of the following text. Respond with exactly one word: either 'positive', 'negative', or 'neutral'.

Text: "{}"

Sentiment: [/INST]"""
    
    # Handle num_samples
    texts = dataset['text']
    if num_samples is not None:
        texts = texts[:num_samples]
    
    total = len(texts)
    
    def save_checkpoint(current_idx, current_results):
        checkpoint_data = {
            'next_idx': current_idx + 1,
            'results': current_results,
            'total': total,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        with open(checkpoint_path, 'w') as f:
            json.dump(checkpoint_data, f, indent=4)
        if verbose:
            print(f"\nCheckpoint saved at index {current_idx}")
    
    try:
        for i in range(start_idx, total):
            text = texts[i]
            prompt = prompt_template.format(text)
            # inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}            
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=20,
                    num_return_sequences=1,
                    temperature=0.1,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
                
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            sentiment = response[len(prompt):].strip().split()[0].lower()
            
            # Store result
            results.append({
                'text': text,
                'predicted_sentiment': sentiment
            })
            
            # Verbose output
            if verbose:
                print(f"\nText [{i+1}/{total}]: {text}")
                print(f"Predicted sentiment: {sentiment}")
                
            # Print progress every 10% if not verbose
            elif (i + 1) % max(1, total // 10) == 0:
                print(f"Progress: {(i + 1) / total:.1%}")
            
            # Save checkpoint at intervals
            if (i + 1) % checkpoint_interval == 0:
                save_checkpoint(i, results)
                
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving checkpoint...")
        save_checkpoint(i, results)
        raise
    except Exception as e:
        print(f"\nError occurred: {e}")
        print("Saving checkpoint...")
        save_checkpoint(i, results)
        raise
    
    # Process completed successfully, remove checkpoint file
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("Processing completed, checkpoint file removed")
    
    print("Analysis complete!")
    return results

In [None]:
def compare_with_baseline(dataset):
    # Load a pre-trained sentiment model
    baseline_classifier = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=device
    )
    
    texts = dataset['text']
    
    for text in texts:
        result = baseline_classifier(text)[0]
        print(f"\nText: {text}")
        print(f"Baseline model prediction: {result}")

In [4]:
def evaluate_predictions(dataset, results, verbose=False):
    
    correct = 0
    total = len(results)
    confusion_matrix = {
        'negative': {'negative': 0, 'neutral': 0, 'positive': 0},
        'neutral': {'negative': 0, 'neutral': 0, 'positive': 0},
        'positive': {'negative': 0, 'neutral': 0, 'positive': 0}
    }
    
    for i, result in enumerate(results):
        true_label = dataset[i]['label_text']
        # predicted = result['predicted_sentiment'].lower().strip()
        predicted = result['predicted_sentiment']        
        
        # Handle variations in predictions
        if 'positive' in predicted:
            predicted = 'positive'
        elif 'negative' in predicted:
            predicted = 'negative'
        elif 'neutral' in predicted:
            predicted = 'neutral'
        else:
            if verbose:
                print(f"\nText: {result['text']}")
            print(f"Warning: Unexpected prediction format in {i}th data: {predicted}")
            continue
            
        is_correct = true_label == predicted
        if is_correct:
            correct += 1
            
        # Update confusion matrix
        confusion_matrix[true_label][predicted] += 1
        
        if verbose:
            print(f"\nText: {result['text']}")
            print(f"True label: {true_label}")
            print(f"Predicted: {predicted}")
            print(f"Correct: {is_correct}")
    
    # Calculate metrics
    accuracy = correct / total if total > 0 else 0
    
    # Print detailed results
    print("\n=== Evaluation Results ===")
    print(f"Total samples: {total}")
    print(f"Correct predictions: {correct}")
    print(f"Accuracy: {accuracy:.2%}")
    
    # Print confusion matrix
    print("\n=== Confusion Matrix ===")
    print(f"{'True|Pred':<15}{'Negative':<10}{'Neutral':<10}{'Positive':<10}")
    for true_label in ['negative', 'neutral', 'positive']:
        row = confusion_matrix[true_label]
        print(f"{true_label.title():<15}{row['negative']:<10}{row['neutral']:<10}{row['positive']:<10}")        
        
    # Calculate per-class metrics
    print("\n=== Per-Class Metrics ===")
    for label in ['negative', 'neutral', 'positive']:
        true_pos = confusion_matrix[label][label]
        false_pos = sum(conf[label] for l, conf in confusion_matrix.items() if l != label)
        false_neg = sum(confusion_matrix[label].values()) - true_pos
        
        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        print(f"\n{label.title()} class:")
        print(f"Precision: {precision:.2%}")
        print(f"Recall: {recall:.2%}")
        print(f"F1-score: {f1:.2%}")
    
    return {
        'accuracy': accuracy,
        'confusion_matrix': confusion_matrix,
        'total_samples': total,
        'correct_predictions': correct
    }

In [None]:
def find_inconsistencies(dataset, results):
    inconsistent_data = []
    
    for i, result in enumerate(results):
        true_label = dataset[i]['label_text']
        predicted = result['predicted_sentiment'].lower().strip()
        
        # Handle variations in predictions
        if 'positive' in predicted:
            predicted = 'positive'
        elif 'negative' in predicted:
            predicted = 'negative'
        elif 'neutral' in predicted:
            predicted = 'neutral'
        else:
            print(f"Warning: Unexpected prediction format: {predicted}")
            continue
            
        # If prediction doesn't match ground truth, add to list
        if true_label != predicted:
            inconsistent_data.append({
                'Ground Truth': true_label,
                'Predicted': predicted,
                'Text': dataset[i]['text']
            })
    
    # Create DataFrame
    df_inconsistencies = pd.DataFrame(inconsistent_data)
    
    # If there are inconsistencies, display summary
    if len(df_inconsistencies) > 0:
        print(f"\nFound {len(df_inconsistencies)} inconsistencies out of {len(results)} samples")
        print(f"Inconsistency rate: {(len(df_inconsistencies)/len(results)):.2%}")
    else:
        print("\nNo inconsistencies found!")
    
    return df_inconsistencies

In [None]:
def analyze_inconsistencies(dataset, results):
    df = find_inconsistencies(dataset, results)
    
    # Add text length column
    df['Text Length'] = df['Text'].str.len()
    
    # Add mismatch type column
    df['Mismatch Type'] = df.apply(
        lambda x: 'Major' if (x['Ground Truth'] == 'positive' and x['Predicted'] == 'negative') or 
                            (x['Ground Truth'] == 'negative' and x['Predicted'] == 'positive') 
        else 'Minor', axis=1
    )
    
    # Print analysis
    print("\nMismatch Type Distribution:")
    print(df['Mismatch Type'].value_counts())
    
    print("\nAverage Text Length by Mismatch Type:")
    print(df.groupby('Mismatch Type')['Text Length'].mean())
    
    return df

In [None]:
def find_inconsistencies_with_styling(dataset, results):
    df = find_inconsistencies(dataset, results)
    
    # Add styling to make it more readable
    styled_df = df.style.set_properties(**{
        'background-color': 'lightgray',
        'border-color': 'black',
        'border-style': 'solid',
        'border-width': '1px',
        'text-align': 'left'
    })
    
    # Add color coding for different types of mismatches
    def highlight_mismatches(row):
        if row['Ground Truth'] == 'positive' and row['Predicted'] == 'negative':
            return ['background-color: #ffcdd2'] * 3  # Red for major mismatches
        elif row['Ground Truth'] == 'negative' and row['Predicted'] == 'positive':
            return ['background-color: #ffcdd2'] * 3
        else:
            return ['background-color: #fff9c4'] * 3  # Yellow for minor mismatches
    
    styled_df = styled_df.apply(highlight_mismatches, axis=1)
    
    return styled_df

In [5]:
def save_analysis_results(results, model_name, data_name, base_dir="./results"):
    """
    Save analysis results to CSV file with automatic filename generation and collision handling.
    
    Args:
        results: List of dictionaries containing analysis results
        model_name: Name of the model used
        data_name: Name of the dataset used
        base_dir: Directory to save results
    
    Returns:
        str: Path to the saved file
    """
    # Create results directory if it doesn't exist
    os.makedirs(base_dir, exist_ok=True)
    
    # Create base filename
    dataset_prefix = data_name.split('/')[-1][:5]  # Take first 5 letters
    base_filename = f"before_{model_name}_{dataset_prefix}"
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Generate filename with collision handling
    filename = f"{base_filename}.csv"
    filepath = os.path.join(base_dir, filename)
    
    if os.path.exists(filepath):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{base_filename}_{timestamp}.csv"
        filepath = os.path.join(base_dir, filename)
    
    # Save to CSV
    df.to_csv(filepath, index=False)
    print(f"Results saved to: {filepath}")
    
    return filepath

def load_analysis_results(filepath):
    """
    Load analysis results from CSV file.
    
    Args:
        filepath: Path to the CSV file
    
    Returns:
        list: List of dictionaries containing analysis results
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Results file not found: {filepath}")
    
    df = pd.read_csv(filepath)
    results = df.to_dict('records')
    print(f"Loaded {len(results)} results from: {filepath}")
    
    return results

# Helper function to clean up checkpoints
def cleanup_checkpoints(cache_dir="./cache"):
    """Remove all checkpoint files from the cache directory."""
    if os.path.exists(cache_dir):
        for file in os.listdir(cache_dir):
            if file.startswith("checkpoint_") and file.endswith(".json"):
                os.remove(os.path.join(cache_dir, file))
        print("Checkpoints cleaned up")


def save_metrics(metrics, model_name, data_name, base_dir="./results"):
    """
    Save metrics to JSON file with automatic filename generation and collision handling.
    
    Args:
        metrics: Dictionary containing evaluation metrics
        model_name: Name of the model used
        data_name: Name of the dataset used
        base_dir: Directory to save results
    
    Returns:
        str: Path to the saved file
    """
    # Create results directory if it doesn't exist
    os.makedirs(base_dir, exist_ok=True)
    
    # Create base filename
    dataset_prefix = data_name.split('/')[-1][:5]
    base_filename = f"before_{model_name}_{dataset_prefix}_metrics"
    
    # Generate filename with collision handling
    filename = f"{base_filename}.json"
    filepath = os.path.join(base_dir, filename)
    
    if os.path.exists(filepath):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{base_filename}_{timestamp}.json"
        filepath = os.path.join(base_dir, filename)
    
    # Save to JSON
    with open(filepath, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"Metrics saved to: {filepath}")
    
    return filepath

def load_metrics(filepath):
    """
    Load metrics from JSON file.
    
    Args:
        filepath: Path to the JSON file
    
    Returns:
        dict: Dictionary containing evaluation metrics
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Metrics file not found: {filepath}")
    
    with open(filepath, 'r') as f:
        metrics = json.load(f)
    print(f"Loaded metrics from: {filepath}")
    
    return metrics

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    torch_dtype="auto",
    cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_eos_token=True,    
    cache_dir=cache_dir
)
model = accelerator.prepare(model)

if tokenizer.pad_token_id is None:
    print("No pad token found, setting pad token to eos token")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "right"
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# print("=== Testing Raw Outputs ===")
# test_raw_outputs(dataset, model, tokenizer, device)

print("\n=== Testing with Instruction Prompts ===")
# results = analyze_sentiment_zero_shot(dataset, model, tokenizer, device)
# results = analyze_sentiment_zero_shot(dataset, model, tokenizer, device, verbose=False, num_samples=20)
try:
    results = analyze_sentiment_zero_shot(
        dataset=dataset,
        model=model,
        tokenizer=tokenizer,
        device=device,
        verbose=False,
        # num_samples=20,
        checkpoint_interval=100,
        model_name=model_name,
        data_name=data_name
    )
except KeyboardInterrupt:
    print("\nProcessing interrupted. You can resume later using the same function call.")
except Exception as e:
    print(f"\nAn error occurred: {e}")
    print("You can resume processing later using the same function call.")


# Save results
results_filepath = save_analysis_results(results, model_name, data_name)

# print("\n=== Comparing with Baseline Model ===")
# compare_with_baseline(dataset)

In [5]:
results = load_analysis_results("./results/before_test_Llama-3.1-8B-Instruct_tweet.csv")

Loaded 3534 results from: ./results/before_test_Llama-3.1-8B-Instruct_tweet.csv


In [8]:
metrics = evaluate_predictions(dataset, results)


=== Evaluation Results ===
Total samples: 3534
Correct predictions: 2241
Accuracy: 63.41%

=== Confusion Matrix ===
True|Pred      Negative  Neutral   Positive  
Negative       753       170       65        
Neutral        358       694       358       
Positive       63        234       794       

=== Per-Class Metrics ===

Negative class:
Precision: 64.14%
Recall: 76.21%
F1-score: 69.66%

Neutral class:
Precision: 63.21%
Recall: 49.22%
F1-score: 55.34%

Positive class:
Precision: 65.24%
Recall: 72.78%
F1-score: 68.80%


In [None]:
df = find_inconsistencies(dataset, results)
# Display full DataFrame (not truncated)
pd.set_option('display.max_colwidth', None)
print("\nInconsistent predictions:")
print(df)

In [None]:
# Usage with analysis:
df_analyzed = analyze_inconsistencies(dataset, results)

In [None]:
# Usage with styling:
styled_df = find_inconsistencies_with_styling(dataset, results)
display(styled_df)  # If in Jupyter notebook

In [None]:
# Save metrics
metrics_filepath = save_metrics(metrics, model_name, data_name)

# Later, load results and metrics
loaded_results = load_analysis_results(results_filepath)
loaded_metrics = load_metrics(metrics_filepath)

# Verify loaded data
print("\nVerifying loaded results...")
print(f"Number of original results: {len(results)}")
print(f"Number of loaded results: {len(loaded_results)}")

print("\nVerifying loaded metrics...")
print(f"Original accuracy: {metrics['accuracy']:.2%}")
print(f"Loaded accuracy: {loaded_metrics['accuracy']:.2%}")
