In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from tqdm.auto import tqdm
import warnings
import logging
from typing import Dict, List, Optional

# --- Notebook-wide noise control -------------------------------------------
# Every Python warning is suppressed and the listed third-party loggers are
# raised to ERROR so that cell output stays focused on the evaluation itself.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.WARNING)
for logger in ("transformers", "pytorch_lightning"):
    logging.getLogger(logger).setLevel(logging.ERROR)

# --- Hardware report --------------------------------------------------------
# Print whether CUDA is usable and, if so, the name and total memory (in GB,
# using 1e9 bytes per GB) of device 0.
cuda_ready = torch.cuda.is_available()
print(f"GPU Available: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = device_props.total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_memory:.2f} GB")

In [None]:
class Config:
    """Central configuration for the translation evaluation notebook.

    Attributes are set in ``__init__``:

    * ``MODEL_NAME`` / ``HF_TOKEN`` -- checkpoint identifier and the access
      token passed to ``from_pretrained``.
    * ``TRAIN_PATH`` / ``TEST_PATH`` -- TSV dataset locations.
    * ``MAX_NEW_TOKENS`` / ``TEMPERATURE`` / ``MAX_LENGTH`` -- generation and
      prompt-tokenization limits.
    * ``DIRS`` -- output directories, all created on instantiation.
    * ``DEVICE`` -- CUDA device when available, otherwise CPU.
    """

    def __init__(self):
        self.MODEL_NAME = "meta-llama/Llama-2-7b-hf"

        # SECURITY: access tokens must never be committed to a notebook.
        # The token is read from the environment instead; any token that was
        # previously hardcoded here should be treated as leaked and revoked.
        # When neither variable is set the value is None, which lets the
        # Hugging Face libraries fall back to a locally stored login token.
        self.HF_TOKEN = (
            os.environ.get("HF_TOKEN")
            or os.environ.get("HUGGING_FACE_HUB_TOKEN")
            or None
        )

        self.TRAIN_PATH = 'wmt_dataset_combined/train_filtered.tsv'
        self.TEST_PATH = 'wmt_dataset_combined/test_filtered.tsv'

        self.MAX_NEW_TOKENS = 256
        self.TEMPERATURE = 0.7
        self.MAX_LENGTH = 256

        self.DIRS = {
            'OUTPUT_DIR': 'translation_outputs',
            'EVAL_DIR': 'fixed_eval_sets',
            'RESULTS_DIR': 'results'
        }

        self._create_directories()

        self.DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _create_directories(self):
        """Create necessary directories if they don't exist."""
        for dir_path in self.DIRS.values():
            os.makedirs(dir_path, exist_ok=True)

config = Config()

In [None]:
def load_dataset(file_path: str, delimiter: str = '\t') -> pd.DataFrame:
    """Read a delimited parallel-corpus file and return its cleaned rows.

    Args:
        file_path: Location of the delimited text file.
        delimiter: Field separator handed to ``pd.read_csv``.

    Returns:
        The loaded frame with rows removed where ``source_de`` or
        ``target_en`` is missing, and with duplicate
        (``source_de``, ``target_en``) pairs collapsed to their first
        occurrence. The original row index is kept.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: ``source_de`` and/or ``target_en`` is not a column.
    """
    expected = ['source_de', 'target_en']

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found at: {file_path}")

    frame = pd.read_csv(file_path, delimiter=delimiter)

    absent = [name for name in expected if name not in frame.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    # Drop incomplete pairs first, then exact duplicate pairs.
    return frame.dropna(subset=expected).drop_duplicates(subset=expected)

def create_translation_prompt(text: str) -> str:
    """Build the German-to-English instruction prompt for one sentence.

    Args:
        text: German source text; surrounding whitespace is stripped.

    Returns:
        A three-line prompt ending in ``English:``.

    Raises:
        ValueError: ``text`` is not a string, or is empty / whitespace only.
    """
    if isinstance(text, str):
        cleaned = text.strip()
        if cleaned:
            return f"Translate the following German text to English:\nGerman: {cleaned}\nEnglish:"
    raise ValueError("Invalid input text")

In [None]:
# Load the tokenizer for the configured checkpoint.
# `trust_remote_code` is intentionally NOT enabled: it allows Python code
# shipped in the Hub repository to run locally, and the model load below does
# not request it either, so the two calls are now consistent.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    config.MODEL_NAME,
    token=config.HF_TOKEN
)
# Reuse EOS as the pad token so `tokenizer.pad_token_id` is defined when it is
# passed to `model.generate` during evaluation.
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded successfully")

# Load the causal LM in bfloat16 and let `device_map="auto"` decide where the
# weights are placed.
print("\nLoading model...")
model = AutoModelForCausalLM.from_pretrained(
    config.MODEL_NAME,
    token=config.HF_TOKEN,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print("Model loaded successfully")

In [None]:
# Rows requested for each fixed evaluation subset, and the seed shared by the
# split and both sampling steps so the subsets are reproducible.
FIXED_SET_SIZE = 200
SPLIT_SEED = 42

test_data = load_dataset(config.TEST_PATH)
print(f"Total samples in test dataset: {len(test_data)}")

# Split the held-out data 50/50 into validation and test halves.
full_val_df, full_test_df = train_test_split(test_data, test_size=0.5, random_state=SPLIT_SEED)
print("\nAfter splitting full dataset:")
print(f"Full validation set size: {len(full_val_df)}")
print(f"Full test set size: {len(full_test_df)}")

full_val_df.to_csv(f'{config.DIRS["EVAL_DIR"]}/full_validation.csv', index=False)
full_test_df.to_csv(f'{config.DIRS["EVAL_DIR"]}/full_test.csv', index=False)

# `DataFrame.sample(n=...)` raises when n exceeds the number of rows, so clamp
# the request to what each half actually contains. When both halves have at
# least FIXED_SET_SIZE rows this selects exactly the same rows as before.
val_sample_size = min(FIXED_SET_SIZE, len(full_val_df))
test_sample_size = min(FIXED_SET_SIZE, len(full_test_df))
if min(val_sample_size, test_sample_size) < FIXED_SET_SIZE:
    print(
        f"\nWarning: fewer than {FIXED_SET_SIZE} rows available; "
        f"using {val_sample_size} validation and {test_sample_size} test rows."
    )

fixed_val_df = full_val_df.sample(n=val_sample_size, random_state=SPLIT_SEED)
fixed_test_df = full_test_df.sample(n=test_sample_size, random_state=SPLIT_SEED)

# File names are kept unchanged because evaluate_model reads
# 'fixed_test_200.csv' from this directory.
fixed_val_df.to_csv(f'{config.DIRS["EVAL_DIR"]}/fixed_validation_200.csv', index=False)
fixed_test_df.to_csv(f'{config.DIRS["EVAL_DIR"]}/fixed_test_200.csv', index=False)

print("\nFixed set sizes:")
print(f"Fixed validation set size: {len(fixed_val_df)}")
print(f"Fixed test set size: {len(fixed_test_df)}")

# Token counts as returned by `tokenizer.encode` with its default settings.
de_tokens = [len(tokenizer.encode(text)) for text in fixed_test_df['source_de']]
en_tokens = [len(tokenizer.encode(text)) for text in fixed_test_df['target_en']]

print("\nToken length statistics (based on fixed test set):")
for heading, token_lengths in (("German sentences:", de_tokens), ("English sentences:", en_tokens)):
    print(f"\n{heading}")
    print(f"Average length: {np.mean(token_lengths):.1f}")
    print(f"Max length: {max(token_lengths)}")
    print(f"95th percentile: {np.percentile(token_lengths, 95):.1f}")

In [None]:
def evaluate_model(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    config: Config,
    model_name: str = "base",
    save_results: bool = True
) -> pd.DataFrame:
    """Translate the fixed test set with ``model`` and score it with BLEU and COMET.

    The function reads ``fixed_test_200.csv`` from ``config.DIRS["EVAL_DIR"]``,
    generates one sampled translation per row, and then computes a corpus-level
    sacreBLEU score plus a per-row COMET score.

    Note: generation uses ``do_sample=True`` with ``config.TEMPERATURE``, so
    scores vary between runs unless the caller seeds the RNG beforehand.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``; its ``pad_token_id`` is
            forwarded to ``generate``.
        config: Supplies ``DIRS``, ``MAX_LENGTH``, ``MAX_NEW_TOKENS`` and
            ``TEMPERATURE``.
        model_name: Label used in log messages and in the results file name.
        save_results: When true, write the results frame to
            ``<RESULTS_DIR>/<model_name>_model_results.csv``.

    Returns:
        A DataFrame with columns ``source_de``, ``target_en``,
        ``predicted_en``, ``comet_score`` and ``bleu_score`` (the corpus-level
        BLEU repeated on every row). Rows whose generation failed are omitted.

    Raises:
        RuntimeError: Any failure outside the per-row generation step,
            including the case where no row could be translated. The original
            exception is attached as ``__cause__``.
    """
    try:
        # Load fixed test set
        test_df = pd.read_csv(f'{config.DIRS["EVAL_DIR"]}/fixed_test_200.csv')
        results = []

        # Load metrics
        metrics = {
            'sacrebleu': evaluate.load("sacrebleu"),
            'comet': evaluate.load("comet")
        }

        print(f"\nGenerating translations for {model_name} model...")
        for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
            try:
                prompt = create_translation_prompt(row['source_de'])
                inputs = tokenizer(
                    prompt,
                    return_tensors="pt",
                    truncation=True,
                    max_length=config.MAX_LENGTH
                ).to(model.device)
                # Number of prompt tokens actually fed to the model (after
                # any truncation); used below to isolate the completion.
                prompt_token_count = inputs["input_ids"].shape[1]

                with torch.no_grad():
                    generated = model.generate(
                        **inputs,
                        max_new_tokens=config.MAX_NEW_TOKENS,
                        temperature=config.TEMPERATURE,
                        do_sample=True,
                        pad_token_id=tokenizer.pad_token_id,
                        num_return_sequences=1
                    )

                # Slice by token count rather than by len(prompt) characters:
                # the decoded text is not guaranteed to reproduce the prompt
                # string exactly (tokenization clean-up, truncation), so a
                # character offset can cut into or past the completion.
                new_token_ids = generated[0][prompt_token_count:]
                completion = tokenizer.decode(
                    new_token_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )
                # Keep only the first generated line; the model may continue
                # with further text after the translation.
                translation = completion.strip().split('\n')[0].strip()

                results.append({
                    'source_de': row['source_de'],
                    'target_en': row['target_en'],
                    'predicted_en': translation
                })

            except Exception as e:
                # Best effort: report the failing row and keep going.
                print(f"Error processing row: {str(e)}")
                continue

        if not results:
            raise ValueError("No successful translations generated")

        df = pd.DataFrame(results)
        sources = df['source_de'].tolist()
        references = df['target_en'].tolist()
        predictions = df['predicted_en'].tolist()

        # Score every row in a single COMET call instead of one call per row;
        # the returned 'scores' list is aligned with the inputs.
        comet_result = metrics['comet'].compute(
            predictions=predictions,
            sources=sources,
            references=references
        )
        df['comet_score'] = comet_result['scores']

        bleu_result = metrics['sacrebleu'].compute(
            predictions=predictions,
            references=[[reference] for reference in references]
        )
        df['bleu_score'] = bleu_result['score']
        avg_comet = df['comet_score'].mean()

        print(f"\nResults for {model_name} model:")
        print(f"BLEU Score: {bleu_result['score']:.2f}")
        print(f"Average COMET Score: {avg_comet:.2f}")

        if save_results:
            save_path = f'{config.DIRS["RESULTS_DIR"]}/{model_name}_model_results.csv'
            df.to_csv(save_path, index=False)
            print(f"\nResults saved to {save_path}")

        return df

    except Exception as e:
        raise RuntimeError(f"Error in evaluation: {str(e)}") from e

In [None]:
# Evaluate the base model with the default label ("base") and default saving
# behaviour, then show the first three source / reference / prediction rows.
print("\nStarting base model evaluation...")
base_results = evaluate_model(model, tokenizer, config)

preview_columns = ['source_de', 'target_en', 'predicted_en']
preview = base_results[preview_columns].head(3)
print("\nSample translations:")
print(preview)