In [None]:
# =======================
# NATURAL TEXT INPUT
# =======================

import os
import json
import csv
import logging
import re
from collections import defaultdict
from typing import Dict, Optional

# Report INFO-and-above messages, formatted as "<LEVEL>: <message>".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

class Config:
    """File locations used by the natural-text Caesar-cipher benchmark."""

    def __init__(self) -> None:
        # Inputs: the folder holding the JSON result files, and the
        # experiment log (the "configuration file") describing each run.
        self.data_directory: str = './data/encoded/caesar-cipher/natural/'
        self.config_file_path: str = './caesar-cipher-experiments-log-natural-text.txt'

        # Outputs: the aggregated per-file CSV, and the text file that
        # receives the detailed processing report.
        self.csv_file_result: str = 'caesar-cipher-natural-text-benchmark-result.csv'
        self.output_details_file: str = 'caesar-cipher-natural-text-benchmark-process.txt'

class CaesarCipherTest:
    def __init__(self, config: Config):
        self.config = config
        self.directory = self.config.data_directory
        self.config_file = self.config.config_file_path
        # Parse config file once for metadata
        self.config_data = self.parse_config_file()

        # Exponent for weighting sequences by their word length
        self.word_weight_exponent = 2

        # Accumulate final (file-level) results for the CSV
        self.file_level_results = []
        
        # Clear or create the output details file at the start
        with open(self.config.output_details_file, 'w', encoding='utf-8') as f:
            f.write("Comprehensive operation details for random text results:\n\n")

    def parse_config_file(self) -> Dict[str, Dict[str, str]]:
        config_data = {}
        try:
            with open(self.config_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        timestamp = self.extract_timestamp(line)
                        fields = self.parse_filename(line)
                        config_data[timestamp] = fields
        except FileNotFoundError:
            logging.error(f"Config file {self.config_file} not found.")
        return config_data

    @staticmethod
    def extract_timestamp(filename: str) -> str:
        """
        Example: 20230101_122345_some-other-stuff -> timestamp would be 20230101_122345
        """
        parts = filename.split('_')
        if len(parts) >= 2:
            return f"{parts[0]}_{parts[1]}"
        return parts[0]

    @staticmethod
    def parse_filename(filename: str) -> Dict[str, str]:
        """
        A robust approach that handles both Llama-like (__0.01_64) 
        and Mistral-like (_0.01_64) suffixes.
        """
        if filename.endswith('.json'):
            filename = filename[:-5]

        pattern = r'^(.*)_([\d\.]+)_(\d+)$'
        match = re.match(pattern, filename)
        if match:
            pre_model_info = match.group(1)
            temperature   = match.group(2)
            max_token     = match.group(3)
        else:
            pre_model_info = filename
            temperature    = ''
            max_token      = ''

        pre_fields = pre_model_info.split('_')

        date        = pre_fields[0] if len(pre_fields) > 0 else ''
        time        = pre_fields[1] if len(pre_fields) > 1 else ''
        samples     = pre_fields[2] if len(pre_fields) > 2 else ''
        shift       = pre_fields[3] if len(pre_fields) > 3 else ''
        prompt_type = '_'.join(pre_fields[4:6]) if len(pre_fields) > 5 else ''
        method      = pre_fields[6] if len(pre_fields) > 6 else ''

        shot_and_model_name_parts = pre_fields[7:]
        shot_and_model_name       = '_'.join(shot_and_model_name_parts)

        if '-' in shot_and_model_name:
            shot, model_name = shot_and_model_name.split('-', 1)
        else:
            shot = ''
            model_name = shot_and_model_name

        if model_name.startswith('models--'):
            model_name = model_name[len('models--'):]
            model_parts = model_name.split('--')
            model_name  = model_parts[-1]
        model_name = model_name.rstrip('_')

        return {
            'date': date,
            'time': time,
            'samples': samples,
            'shift': shift,
            'prompt_type': prompt_type,
            'method': method,
            'shot': shot,
            'model_name': model_name,
            'temperature': temperature,
            'max_token': max_token
        }

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """
        Compute the Levenshtein edit distance between two strings s1 and s2
        (minimum number of single-character edits).
        """
        if not s1:
            return len(s2)
        if not s2:
            return len(s1)

        len_s1 = len(s1)
        len_s2 = len(s2)

        dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

        for i in range(len_s1 + 1):
            dp[i][0] = i
        for j in range(len_s2 + 1):
            dp[0][j] = j

        for i in range(1, len_s1 + 1):
            for j in range(1, len_s2 + 1):
                cost = 0 if s1[i-1] == s2[j-1] else 1
                dp[i][j] = min(
                    dp[i-1][j] + 1,       # deletion
                    dp[i][j-1] + 1,       # insertion
                    dp[i-1][j-1] + cost   # substitution
                )

        return dp[len_s1][len_s2]

    def compute_per_record_metrics(self, cipher_text: str, gold_label: str) -> Dict[str, float]:
        """
        Compute per-record metrics:
          1) Levenshtein Error Rate: dist / len(gold_label)
          2) Character Error Rate:   1 - (# matched chars / len(gold_label))
          3) Word Accuracy (raw fraction for a single record)
          4) Sentence Accuracy: 1.0 if exact match else 0.0
        """
        gold_len = len(gold_label)
        
        # 1) Levenshtein Error Rate
        if gold_len > 0:
            dist = self.levenshtein_distance(cipher_text, gold_label)
            levenshtein_error_rate = dist / gold_len
            # Cap the Levenshtein Error Rate at 100 if it exceeds this threshold
            if levenshtein_error_rate > 1:
                levenshtein_error_rate = 1.0000
                dist = gold_len
        else:
            dist = 0
            levenshtein_error_rate = 0.0

        # 2) Character Error Rate
        matched_chars = 0
        for i in range(gold_len):
            if i < len(cipher_text) and cipher_text[i] == gold_label[i]:
                matched_chars += 1

        if gold_len > 0:
            character_error_rate = 1 - (matched_chars / gold_len)
        else:
            character_error_rate = 0.0

        # 3) Word Accuracy (unweighted, just per-record fraction)
        gold_words = gold_label.split()
        pred_words = cipher_text.split()
        matched_words = sum(gw == pw for gw, pw in zip(gold_words, pred_words))
        gold_words_len = len(gold_words)

        if gold_words_len > 0:
            word_accuracy = matched_words / gold_words_len
        else:
            word_accuracy = 0.0

        # 4) Sentence Accuracy
        sentence_accuracy = 1.0 if cipher_text == gold_label else 0.0

        return {
            'levenshtein_error_rate': levenshtein_error_rate,
            'character_error_rate': character_error_rate,
            'word_accuracy': word_accuracy,     # raw fraction for this record
            'sentence_accuracy': sentence_accuracy,
            # Additional for file-level weighting
            'levenshtein_distance': dist,
            'matched_chars': matched_chars,
            'gold_length': gold_len,
            'matched_words': matched_words,
            'gold_words_len': gold_words_len
        }

    def process_json_file(self, filepath: str, fields: Dict[str, str]) -> Optional[Dict[str, float]]:
        """
        Process a single JSON file:
         - For each record: compute per-record metrics and log them.
         - For file-level: 
             * Weighted word accuracy (using L^k).
             * Weighted Levenshtein and Character Error Rate.
             * Average Sentence Accuracy.
             * NEW: total matched words vs. total gold words => file-level average word accuracy.
        """
        filename = os.path.basename(filepath)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if not content:
                    logging.warning(f"Skipping empty file: {filename}")
                    return None
                data = json.loads(content)
        except (json.JSONDecodeError, FileNotFoundError):
            logging.error(f"Error reading or decoding JSON in file: {filename}")
            return None

        if not isinstance(data, list):
            logging.error(f"File {filename} does not contain a list of records.")
            return None

        # Accumulators for file-level (weighted or otherwise)
        total_levenshtein_distance = 0
        total_matched_chars = 0
        total_gold_chars = 0

        # Weighted Word Accuracy accumulators
        sum_weighted_word_contrib = 0.0
        sum_length_weights = 0.0

        # Sentence accuracy accumulators
        sum_sentence_accuracy = 0.0
        record_count = 0

        # accumulators for average word accuracy
        total_matched_words = 0
        total_gold_words = 0

        # Start logging to the details file
        with open(self.config.output_details_file, 'a', encoding='utf-8') as out_f:
            out_f.write(f"=== Processing file: {filename} ===\n")
            out_f.write(f"Model name: {fields.get('model_name', '')}, Shift: {fields.get('shift', '')}, "
                        f"Prompt Type: {fields.get('prompt_type', '')}, Temperature: {fields.get('temperature', '')}, "
                        f"Max Token: {fields.get('max_token', '')}\n\n")

            # k exponent for weighting word accuracy
            k = self.word_weight_exponent

            # Process each record
            for idx, record in enumerate(data, start=1):
                cipher_text = record.get("cipher_text", "").strip()
                gold_label = record.get("gold_label", "").strip()

                if not cipher_text and not gold_label:
                    continue

                metrics = self.compute_per_record_metrics(cipher_text, gold_label)

                # Basic record-level fraction
                record_word_acc = metrics['word_accuracy']
                L_i = metrics['gold_words_len']

                # Weight factor = (L_i^k)
                length_weight = (L_i ** k)
                # Weighted contribution for word accuracy
                weighted_contribution = length_weight * record_word_acc

                # Accumulate for file-level
                total_levenshtein_distance += metrics['levenshtein_distance']
                total_matched_chars += metrics['matched_chars']
                total_gold_chars += metrics['gold_length']

                # Accumulate for average word accuracy
                total_matched_words += metrics['matched_words']
                total_gold_words += metrics['gold_words_len']

                # Weighted Word Accuracy sums
                sum_weighted_word_contrib += weighted_contribution
                sum_length_weights += length_weight

                sum_sentence_accuracy += metrics['sentence_accuracy']
                record_count += 1

                # Log per-record details
                out_f.write(f"Record #{idx}\n")
                out_f.write(f"  cipher_text: {cipher_text}\n")
                out_f.write(f"  gold_label:  {gold_label}\n")
                out_f.write(f"  Levenshtein Error Rate: {metrics['levenshtein_error_rate']:.4f}\n")
                out_f.write(f"  Character Error Rate:   {metrics['character_error_rate']:.4f}\n")
                out_f.write(f"  Word Accuracy (record, raw): {record_word_acc:.4f}\n")
                out_f.write(f"  Sentence Accuracy: {metrics['sentence_accuracy']:.4f}\n")

                # print matched chars, matched words, gold chars, gold words
                out_f.write(f"  matched_characters: {metrics['matched_chars']}\n")
                out_f.write(f"  gold_characters:    {metrics['gold_length']}\n")
                out_f.write(f"  matched_words:      {metrics['matched_words']}\n")
                out_f.write(f"  gold_words_len:     {metrics['gold_words_len']}\n")

                # Show the weighting details
                out_f.write(f"  Word length (L_i): {L_i},  k={k},  Weight factor = L_i^k = {length_weight}\n")
                out_f.write(f"  Weighted Word Contribution = {length_weight:.2f} * {record_word_acc:.4f} = {weighted_contribution:.4f}\n\n")

            # Final file-level Levenshtein Error Rate
            if total_gold_chars > 0:
                file_levenshtein_error_rate = total_levenshtein_distance / total_gold_chars
            else:
                file_levenshtein_error_rate = 0.0

            # Final file-level Character Error Rate
            if total_gold_chars > 0:
                file_character_error_rate = 1 - (total_matched_chars / total_gold_chars)
            else:
                file_character_error_rate = 0.0

            # Final Weighted Word Accuracy
            if sum_length_weights > 0:
                file_weighted_word_accuracy = sum_weighted_word_contrib / sum_length_weights
            else:
                file_weighted_word_accuracy = 0.0

            # Sentence Accuracy (average)
            if record_count > 0:
                file_sentence_accuracy = sum_sentence_accuracy / record_count
            else:
                file_sentence_accuracy = 0.0

            # NEW: file-level average (unweighted) word accuracy
            if total_gold_words > 0:
                file_average_word_accuracy = total_matched_words / total_gold_words
            else:
                file_average_word_accuracy = 0.0

            # Write file summary
            out_f.write(f"--- Summary for file: {filename} ---\n")
            out_f.write(f"Total records processed: {record_count}\n\n")
            out_f.write(f"Weighted Levenshtein Error Rate (File-level):  {file_levenshtein_error_rate:.4f}\n")
            out_f.write(f"Weighted Character Error Rate (File-level):    {file_character_error_rate:.4f}\n")
            out_f.write(f"Weighted Word Accuracy (File-level, k={k}):     {file_weighted_word_accuracy:.4f}\n")
            out_f.write(f"Average Sentence Accuracy (File-level):        {file_sentence_accuracy:.4f}\n")
            out_f.write(f"Average Word Accuracy (File-level, unweighted): {file_average_word_accuracy:.4f}\n\n")

            # NEW: file-level totals
            out_f.write(f"File-level totals:\n")
            out_f.write(f"  total_characters_in_gold_label: {total_gold_chars}\n")
            out_f.write(f"  total_matched_characters:       {total_matched_chars}\n")
            out_f.write(f"  total_words_in_gold_label:      {total_gold_words}\n")
            out_f.write(f"  total_matched_words:            {total_matched_words}\n")
            out_f.write("==========================================\n\n")

        # Return a dictionary of file-level results (used for CSV)
        return {
            'filename': filename,
            'model_name': fields.get('model_name', ''),
            'shift': fields.get('shift', ''),
            'prompt_type': fields.get('prompt_type', ''),
            'temperature': fields.get('temperature', ''),
            'max_token': fields.get('max_token', ''),
            'levenshtein_error_rate': file_levenshtein_error_rate,
            'character_error_rate': file_character_error_rate,
            'weighted_word_accuracy': file_weighted_word_accuracy,
            'sentence_accuracy': file_sentence_accuracy,
            # Include the new average word accuracy in the CSV
            'avg_word_accuracy': file_average_word_accuracy
        }

    def process_directory(self):
        """
        Iterates over all .json files in the directory, processes each, 
        and accumulates final metrics for CSV output.
        """
        for filename in os.listdir(self.directory):
            if filename.endswith('.json'):
                filepath = os.path.join(self.directory, filename)
                filename_without_ext = filename[:-5]
                
                # Extract timestamp from filename to match with config
                timestamp = self.extract_timestamp(filename_without_ext)
                fields = self.config_data.get(timestamp)
                
                if not fields:
                    logging.warning(
                        f"Timestamp '{timestamp}' from filename '{filename_without_ext}' "
                        f"not found in config file {self.config_file}"
                    )
                    continue

                file_results = self.process_json_file(filepath, fields)
                if file_results:
                    self.file_level_results.append(file_results)

    def write_csv_results(self):
        """
        Writes the accumulated file-level metrics to a CSV file, including
        Weighted Word Accuracy (k=2), Average Word Accuracy, etc.
        """
        csv_file = self.config.csv_file_result

        def shift_as_int(val):
            try:
                return int(val['shift'])
            except ValueError:
                return 999999  # fallback if not numeric

        self.file_level_results.sort(key=lambda x: (x['model_name'], shift_as_int(x), x['prompt_type']))

        with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                'Filename', 
                'Model', 
                'Shift', 
                'Prompt Type', 
                'Temperature', 
                'Max Token',
                'Levenshtein Error Rate (%)', 
                'Character Error Rate (%)',
                'Weighted Word Accuracy (%)',
                'Sentence Accuracy (%)',
                'Average Word Accuracy (%)'
            ])
            for res in self.file_level_results:
                writer.writerow([
                    res['filename'],
                    res['model_name'],
                    res['shift'],
                    res['prompt_type'],
                    res['temperature'],
                    res['max_token'],
                    f"{res['levenshtein_error_rate'] * 100:.2f}",
                    f"{res['character_error_rate'] * 100:.2f}",
                    f"{res['weighted_word_accuracy'] * 100:.2f}",
                    f"{res['sentence_accuracy'] * 100:.2f}",
                    f"{res['avg_word_accuracy'] * 100:.2f}"
                ])
        logging.info(f"CSV results written to '{csv_file}'")

    def run(self):
        """
        Main entry point: processes the JSON files, then writes final CSV results.
        """
        self.process_directory()
        self.write_csv_results()
        logging.info(f"Processing finished.\n"
                     f"  - Detailed record-level results: '{self.config.output_details_file}'\n"
                     f"  - File-level summary CSV: '{self.config.csv_file_result}'")

if __name__ == '__main__':
    # Script entry point: build the default natural-text configuration, then
    # process the data directory and write the details file and summary CSV.
    config = Config()
    caesar_test = CaesarCipherTest(config)
    caesar_test.run()


In [None]:
# =======================
# RANDOM TEXT INPUT
# =======================

import os
import json
import csv
import logging
import re
from collections import defaultdict
from typing import Dict, Optional

# Report INFO-and-above messages, formatted as "<LEVEL>: <message>".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

class Config:
    """File locations used by the random-text Caesar-cipher benchmark."""

    def __init__(self) -> None:
        # Inputs: the folder holding the JSON result files, and the
        # experiment log (the "configuration file") describing each run.
        self.data_directory: str = './data/encoded/caesar-cipher/random/'
        self.config_file_path: str = './caesar-cipher-experiments-log-random-text.txt'

        # Outputs: the aggregated per-file CSV, and the text file that
        # receives the detailed processing report.
        self.csv_file_result: str = 'caesar-cipher-random-text-benchmark-result.csv'
        self.output_details_file: str = 'caesar-cipher-random-text-benchmark-process.txt'

class CaesarCipherTest:
    def __init__(self, config: Config):
        self.config = config
        self.directory = self.config.data_directory
        self.config_file = self.config.config_file_path
        # Parse config file once for metadata
        self.config_data = self.parse_config_file()

        # Exponent for weighting sequences by their word length
        # (k=2 means longer sequences are squared in their influence)
        self.word_weight_exponent = 2

        # Accumulate final (file-level) results for the CSV
        self.file_level_results = []
        
        # Clear or create the output details file at the start
        with open(self.config.output_details_file, 'w', encoding='utf-8') as f:
            f.write("Comprehensive operation details for random text results:\n\n")

    def parse_config_file(self) -> Dict[str, Dict[str, str]]:
        config_data = {}
        try:
            with open(self.config_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        timestamp = self.extract_timestamp(line)
                        fields = self.parse_filename(line)
                        config_data[timestamp] = fields
        except FileNotFoundError:
            logging.error(f"Config file {self.config_file} not found.")
        return config_data

    @staticmethod
    def extract_timestamp(filename: str) -> str:
        """
        Example: 20230101_122345_some-other-stuff -> timestamp would be 20230101_122345
        """
        parts = filename.split('_')
        if len(parts) >= 2:
            return f"{parts[0]}_{parts[1]}"
        return parts[0]

    @staticmethod
    def parse_filename(filename: str) -> Dict[str, str]:
        """
        A robust approach that handles both Llama-like (__0.01_64) 
        and Mistral-like (_0.01_64) suffixes.
        """
        if filename.endswith('.json'):
            filename = filename[:-5]

        pattern = r'^(.*)_([\d\.]+)_(\d+)$'
        match = re.match(pattern, filename)
        if match:
            pre_model_info = match.group(1)
            temperature   = match.group(2)
            max_token     = match.group(3)
        else:
            pre_model_info = filename
            temperature    = ''
            max_token      = ''

        pre_fields = pre_model_info.split('_')

        date        = pre_fields[0] if len(pre_fields) > 0 else ''
        time        = pre_fields[1] if len(pre_fields) > 1 else ''
        samples     = pre_fields[2] if len(pre_fields) > 2 else ''
        shift       = pre_fields[3] if len(pre_fields) > 3 else ''
        prompt_type = '_'.join(pre_fields[4:6]) if len(pre_fields) > 5 else ''
        method      = pre_fields[6] if len(pre_fields) > 6 else ''

        shot_and_model_name_parts = pre_fields[7:]
        shot_and_model_name       = '_'.join(shot_and_model_name_parts)

        if '-' in shot_and_model_name:
            shot, model_name = shot_and_model_name.split('-', 1)
        else:
            shot = ''
            model_name = shot_and_model_name

        if model_name.startswith('models--'):
            model_name = model_name[len('models--'):]
            model_parts = model_name.split('--')
            model_name  = model_parts[-1]
        model_name = model_name.rstrip('_')

        return {
            'date': date,
            'time': time,
            'samples': samples,
            'shift': shift,
            'prompt_type': prompt_type,
            'method': method,
            'shot': shot,
            'model_name': model_name,
            'temperature': temperature,
            'max_token': max_token
        }

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """
        Compute the Levenshtein edit distance between two strings s1 and s2
        (minimum number of single-character edits).
        """
        if not s1:
            return len(s2)
        if not s2:
            return len(s1)

        len_s1 = len(s1)
        len_s2 = len(s2)

        dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

        for i in range(len_s1 + 1):
            dp[i][0] = i
        for j in range(len_s2 + 1):
            dp[0][j] = j

        for i in range(1, len_s1 + 1):
            for j in range(1, len_s2 + 1):
                cost = 0 if s1[i-1] == s2[j-1] else 1
                dp[i][j] = min(
                    dp[i-1][j] + 1,       # deletion
                    dp[i][j-1] + 1,       # insertion
                    dp[i-1][j-1] + cost   # substitution
                )

        return dp[len_s1][len_s2]

    def compute_per_record_metrics(self, cipher_text: str, gold_label: str) -> Dict[str, float]:
        """
        Compute per-record metrics:
          1) Levenshtein Error Rate: dist / len(gold_label)
          2) Character Error Rate:   1 - (# matched chars / len(gold_label))
          3) Word Accuracy (raw fraction for a single record)
          4) Sentence Accuracy: 1.0 if exact match else 0.0
        """
        gold_len = len(gold_label)
        
        # 1) Levenshtein Error Rate
        if gold_len > 0:
            dist = self.levenshtein_distance(cipher_text, gold_label)
            levenshtein_error_rate = dist / gold_len
            # Cap the Levenshtein Error Rate at 100 if it exceeds this threshold
            if levenshtein_error_rate > 1:
                levenshtein_error_rate = 1.0000
                dist = gold_len
        else:
            dist = 0
            levenshtein_error_rate = 0.0

        # 2) Character Error Rate
        matched_chars = 0
        for i in range(gold_len):
            if i < len(cipher_text) and cipher_text[i] == gold_label[i]:
                matched_chars += 1

        if gold_len > 0:
            character_error_rate = 1 - (matched_chars / gold_len)
        else:
            character_error_rate = 0.0

        # 3) Word Accuracy (unweighted, just per-record fraction)
        gold_words = gold_label.split()
        pred_words = cipher_text.split()
        matched_words = sum(gw == pw for gw, pw in zip(gold_words, pred_words))
        gold_words_len = len(gold_words)

        if gold_words_len > 0:
            word_accuracy = matched_words / gold_words_len
        else:
            word_accuracy = 0.0

        # 4) Sentence Accuracy
        sentence_accuracy = 1.0 if cipher_text == gold_label else 0.0

        return {
            'levenshtein_error_rate': levenshtein_error_rate,
            'character_error_rate': character_error_rate,
            'word_accuracy': word_accuracy,     # raw fraction for this record
            'sentence_accuracy': sentence_accuracy,
            # Additional for file-level weighting
            'levenshtein_distance': dist,
            'matched_chars': matched_chars,
            'gold_length': gold_len,
            'matched_words': matched_words,
            'gold_words_len': gold_words_len
        }

    def process_json_file(self, filepath: str, fields: Dict[str, str]) -> Optional[Dict[str, float]]:
        """
        Process a single JSON file:
         - For each record: compute per-record metrics and log them.
         - For file-level: 
             * Weighted word accuracy (using L^k).
             * Weighted Levenshtein and Character Error Rate.
             * Average Sentence Accuracy.
             * NEW: total matched words vs. total gold words => file-level average word accuracy.
        """
        filename = os.path.basename(filepath)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if not content:
                    logging.warning(f"Skipping empty file: {filename}")
                    return None
                data = json.loads(content)
        except (json.JSONDecodeError, FileNotFoundError):
            logging.error(f"Error reading or decoding JSON in file: {filename}")
            return None

        if not isinstance(data, list):
            logging.error(f"File {filename} does not contain a list of records.")
            return None

        # Accumulators for file-level (weighted or otherwise)
        total_levenshtein_distance = 0
        total_matched_chars = 0
        total_gold_chars = 0

        # Weighted Word Accuracy accumulators
        sum_weighted_word_contrib = 0.0
        sum_length_weights = 0.0

        # Sentence accuracy accumulators
        sum_sentence_accuracy = 0.0
        record_count = 0

        # accumulators for average word accuracy
        total_matched_words = 0
        total_gold_words = 0

        # Start logging to the details file
        with open(self.config.output_details_file, 'a', encoding='utf-8') as out_f:
            out_f.write(f"=== Processing file: {filename} ===\n")
            out_f.write(f"Model name: {fields.get('model_name', '')}, Shift: {fields.get('shift', '')}, "
                        f"Prompt Type: {fields.get('prompt_type', '')}, Temperature: {fields.get('temperature', '')}, "
                        f"Max Token: {fields.get('max_token', '')}\n\n")

            # k exponent for weighting word accuracy
            k = self.word_weight_exponent

            # Process each record
            for idx, record in enumerate(data, start=1):
                cipher_text = record.get("cipher_text", "").strip()
                gold_label = record.get("gold_label", "").strip()

                if not cipher_text and not gold_label:
                    continue

                metrics = self.compute_per_record_metrics(cipher_text, gold_label)

                # Basic record-level fraction
                record_word_acc = metrics['word_accuracy']
                L_i = metrics['gold_words_len']

                # Weight factor = (L_i^k)
                length_weight = (L_i ** k)
                # Weighted contribution for word accuracy
                weighted_contribution = length_weight * record_word_acc

                # Accumulate for file-level
                total_levenshtein_distance += metrics['levenshtein_distance']
                total_matched_chars += metrics['matched_chars']
                total_gold_chars += metrics['gold_length']

                # Accumulate for average word accuracy
                total_matched_words += metrics['matched_words']
                total_gold_words += metrics['gold_words_len']

                # Weighted Word Accuracy sums
                sum_weighted_word_contrib += weighted_contribution
                sum_length_weights += length_weight

                sum_sentence_accuracy += metrics['sentence_accuracy']
                record_count += 1

                # Log per-record details
                out_f.write(f"Record #{idx}\n")
                out_f.write(f"  cipher_text: {cipher_text}\n")
                out_f.write(f"  gold_label:  {gold_label}\n")
                out_f.write(f"  Levenshtein Error Rate: {metrics['levenshtein_error_rate']:.4f}\n")
                out_f.write(f"  Character Error Rate:   {metrics['character_error_rate']:.4f}\n")
                out_f.write(f"  Word Accuracy (record, raw): {record_word_acc:.4f}\n")
                out_f.write(f"  Sentence Accuracy: {metrics['sentence_accuracy']:.4f}\n")

                # print matched chars, matched words, gold chars, gold words
                out_f.write(f"  matched_characters: {metrics['matched_chars']}\n")
                out_f.write(f"  gold_characters:    {metrics['gold_length']}\n")
                out_f.write(f"  matched_words:      {metrics['matched_words']}\n")
                out_f.write(f"  gold_words_len:     {metrics['gold_words_len']}\n")

                # Show the weighting details
                out_f.write(f"  Word length (L_i): {L_i},  k={k},  Weight factor = L_i^k = {length_weight}\n")
                out_f.write(f"  Weighted Word Contribution = {length_weight:.2f} * {record_word_acc:.4f} = {weighted_contribution:.4f}\n\n")

            # Final file-level Levenshtein Error Rate
            if total_gold_chars > 0:
                file_levenshtein_error_rate = total_levenshtein_distance / total_gold_chars
            else:
                file_levenshtein_error_rate = 0.0

            # Final file-level Character Error Rate
            if total_gold_chars > 0:
                file_character_error_rate = 1 - (total_matched_chars / total_gold_chars)
            else:
                file_character_error_rate = 0.0

            # Final Weighted Word Accuracy
            if sum_length_weights > 0:
                file_weighted_word_accuracy = sum_weighted_word_contrib / sum_length_weights
            else:
                file_weighted_word_accuracy = 0.0

            # Sentence Accuracy (average)
            if record_count > 0:
                file_sentence_accuracy = sum_sentence_accuracy / record_count
            else:
                file_sentence_accuracy = 0.0

            # NEW: file-level average (unweighted) word accuracy
            if total_gold_words > 0:
                file_average_word_accuracy = total_matched_words / total_gold_words
            else:
                file_average_word_accuracy = 0.0

            # Write file summary
            out_f.write(f"--- Summary for file: {filename} ---\n")
            out_f.write(f"Total records processed: {record_count}\n\n")
            out_f.write(f"Weighted Levenshtein Error Rate (File-level):  {file_levenshtein_error_rate:.4f}\n")
            out_f.write(f"Weighted Character Error Rate (File-level):    {file_character_error_rate:.4f}\n")
            out_f.write(f"Weighted Word Accuracy (File-level, k={k}):     {file_weighted_word_accuracy:.4f}\n")
            out_f.write(f"Average Sentence Accuracy (File-level):        {file_sentence_accuracy:.4f}\n")
            out_f.write(f"Average Word Accuracy (File-level, unweighted): {file_average_word_accuracy:.4f}\n\n")

            # NEW: file-level totals
            out_f.write(f"File-level totals:\n")
            out_f.write(f"  total_characters_in_gold_label: {total_gold_chars}\n")
            out_f.write(f"  total_matched_characters:       {total_matched_chars}\n")
            out_f.write(f"  total_words_in_gold_label:      {total_gold_words}\n")
            out_f.write(f"  total_matched_words:            {total_matched_words}\n")
            out_f.write("==========================================\n\n")

        # Return a dictionary of file-level results (used for CSV)
        return {
            'filename': filename,
            'model_name': fields.get('model_name', ''),
            'shift': fields.get('shift', ''),
            'prompt_type': fields.get('prompt_type', ''),
            'temperature': fields.get('temperature', ''),
            'max_token': fields.get('max_token', ''),
            'levenshtein_error_rate': file_levenshtein_error_rate,
            'character_error_rate': file_character_error_rate,
            'weighted_word_accuracy': file_weighted_word_accuracy,
            'sentence_accuracy': file_sentence_accuracy,
            # Include the new average word accuracy in the CSV
            'avg_word_accuracy': file_average_word_accuracy
        }

    def process_directory(self):
        """
        Iterates over all .json files in the directory, processes each, 
        and accumulates final metrics for CSV output.
        """
        for filename in os.listdir(self.directory):
            if filename.endswith('.json'):
                filepath = os.path.join(self.directory, filename)
                filename_without_ext = filename[:-5]
                
                # Extract timestamp from filename to match with config
                timestamp = self.extract_timestamp(filename_without_ext)
                fields = self.config_data.get(timestamp)
                
                if not fields:
                    logging.warning(
                        f"Timestamp '{timestamp}' from filename '{filename_without_ext}' "
                        f"not found in config file {self.config_file}"
                    )
                    continue

                file_results = self.process_json_file(filepath, fields)
                if file_results:
                    self.file_level_results.append(file_results)

    def write_csv_results(self):
        """
        Writes the accumulated file-level metrics to a CSV file, including
        Weighted Word Accuracy (k=2), Average Word Accuracy, etc.
        """
        csv_file = self.config.csv_file_result

        def shift_as_int(val):
            try:
                return int(val['shift'])
            except ValueError:
                return 999999  # fallback if not numeric

        self.file_level_results.sort(key=lambda x: (x['model_name'], shift_as_int(x), x['prompt_type']))

        with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                'Filename', 
                'Model', 
                'Shift', 
                'Prompt Type', 
                'Temperature', 
                'Max Token',
                'Levenshtein Error Rate (%)', 
                'Character Error Rate (%)',
                'Weighted Word Accuracy (%)',
                'Sentence Accuracy (%)',
                'Average Word Accuracy (%)'
            ])
            for res in self.file_level_results:
                writer.writerow([
                    res['filename'],
                    res['model_name'],
                    res['shift'],
                    res['prompt_type'],
                    res['temperature'],
                    res['max_token'],
                    f"{res['levenshtein_error_rate'] * 100:.2f}",
                    f"{res['character_error_rate'] * 100:.2f}",
                    f"{res['weighted_word_accuracy'] * 100:.2f}",
                    f"{res['sentence_accuracy'] * 100:.2f}",
                    f"{res['avg_word_accuracy'] * 100:.2f}"
                ])
        logging.info(f"CSV results written to '{csv_file}'")

    def run(self):
        """
        Main entry point: processes the JSON files, then writes final CSV results.
        """
        self.process_directory()
        self.write_csv_results()
        logging.info(f"Processing finished.\n"
                     f"  - Detailed record-level results: '{self.config.output_details_file}'\n"
                     f"  - File-level summary CSV: '{self.config.csv_file_result}'")

if __name__ == '__main__':
    # Entry point for this cell: build the default Config, construct the
    # benchmark (which parses the config file and resets the details file),
    # then process the data directory and write the CSV.
    config = Config()
    caesar_test = CaesarCipherTest(config)
    caesar_test.run()


In [None]:
# =======================
# GREEK TEXT INPUT
# =======================

import os
import json
import csv
import logging
import re
import unicodedata
from typing import Dict, Optional

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

def normalize_greek_text(text: str) -> str:
    """
    Return *text* converted to Unicode NFC form.

    NFC composes a base letter and its combining marks into a single code
    point where one exists, so accented Greek letters compare equal whether
    they arrived composed or decomposed.

    Diacritics are kept. If the Caesar logic should ignore them entirely,
    this is the place to strip them instead.
    """
    composed = unicodedata.normalize("NFC", text)
    return composed

class Config:
    """File locations used by the Greek-text Caesar-cipher benchmark."""

    def __init__(self):
        # --- Inputs ---
        # Folder holding the JSON result files for the Greek Caesar-cipher runs.
        self.data_directory = "./data/encoded/caesar-cipher/greek/"
        # Experiment log that is parsed line by line for run metadata.
        self.config_file_path = "./caesar-cipher-experiments-log-greek-text.txt"

        # --- Outputs ---
        # Aggregated, file-level metrics.
        self.csv_file_result = "caesar-cipher-greek-text-benchmark-result.csv"
        # Record-by-record details plus per-file summaries.
        self.output_details_file = "caesar-cipher-greek-text-benchmark-process.txt"

class CaesarCipherTest:
    def __init__(self, config: Config):
        self.config = config
        self.directory = self.config.data_directory
        self.config_file = self.config.config_file_path

        # Parse config file once for metadata
        self.config_data = self.parse_config_file()

        # Exponent for weighting sequences by their word length
        # (k=2 means longer sequences have higher influence)
        self.word_weight_exponent = 2

        # Accumulate final (file-level) results for the CSV
        self.file_level_results = []
        
        # Clear or create the output details file at the start
        with open(self.config.output_details_file, 'w', encoding='utf-8') as f:
            f.write("Comprehensive operation details for Greek text results:\n\n")

    def parse_config_file(self) -> Dict[str, Dict[str, str]]:
        config_data = {}
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        timestamp = self.extract_timestamp(line)
                        fields = self.parse_filename(line)
                        config_data[timestamp] = fields
        except FileNotFoundError:
            logging.error(f"Config file {self.config_file} not found.")
        return config_data

    @staticmethod
    def extract_timestamp(filename: str) -> str:
        """
        Example: 20230101_122345_some-other-stuff -> timestamp would be 20230101_122345
        """
        parts = filename.split('_')
        if len(parts) >= 2:
            return f"{parts[0]}_{parts[1]}"
        return parts[0]

    @staticmethod
    def parse_filename(filename: str) -> Dict[str, str]:
        """
        A robust approach that handles both Llama-like (__0.01_64) 
        and Mistral-like (_0.01_64) suffixes.
        """
        if filename.endswith('.json'):
            filename = filename[:-5]

        pattern = r'^(.*)_([\d\.]+)_(\d+)$'
        match = re.match(pattern, filename)
        if match:
            pre_model_info = match.group(1)
            temperature   = match.group(2)
            max_token     = match.group(3)
        else:
            pre_model_info = filename
            temperature    = ''
            max_token      = ''

        pre_fields = pre_model_info.split('_')

        date        = pre_fields[0] if len(pre_fields) > 0 else ''
        time        = pre_fields[1] if len(pre_fields) > 1 else ''
        samples     = pre_fields[2] if len(pre_fields) > 2 else ''
        shift       = pre_fields[3] if len(pre_fields) > 3 else ''
        prompt_type = '_'.join(pre_fields[4:6]) if len(pre_fields) > 5 else ''
        method      = pre_fields[6] if len(pre_fields) > 6 else ''

        shot_and_model_name_parts = pre_fields[7:]
        shot_and_model_name       = '_'.join(shot_and_model_name_parts)

        if '-' in shot_and_model_name:
            shot, model_name = shot_and_model_name.split('-', 1)
        else:
            shot = ''
            model_name = shot_and_model_name

        if model_name.startswith('models--'):
            model_name = model_name[len('models--'):]
            model_parts = model_name.split('--')
            model_name  = model_parts[-1]
        model_name = model_name.rstrip('_')

        return {
            'date': date,
            'time': time,
            'samples': samples,
            'shift': shift,
            'prompt_type': prompt_type,
            'method': method,
            'shot': shot,
            'model_name': model_name,
            'temperature': temperature,
            'max_token': max_token
        }

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """
        Compute the Levenshtein edit distance between two strings s1 and s2
        (minimum number of single-character edits).
        """
        if not s1:
            return len(s2)
        if not s2:
            return len(s1)

        len_s1 = len(s1)
        len_s2 = len(s2)

        dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

        for i in range(len_s1 + 1):
            dp[i][0] = i
        for j in range(len_s2 + 1):
            dp[0][j] = j

        for i in range(1, len_s1 + 1):
            for j in range(1, len_s2 + 1):
                cost = 0 if s1[i-1] == s2[j-1] else 1
                dp[i][j] = min(
                    dp[i-1][j] + 1,       # deletion
                    dp[i][j-1] + 1,       # insertion
                    dp[i-1][j-1] + cost   # substitution
                )

        return dp[len_s1][len_s2]

    def compute_per_record_metrics(self, cipher_text: str, gold_label: str) -> Dict[str, float]:
        """
        Score one prediction (*cipher_text*) against its *gold_label*.

        Both strings are NFC-normalized first. The returned dict holds:
          - levenshtein_error_rate: edit distance / gold length, capped at 1.0
          - character_error_rate:   1 - (same-position matches / gold length)
          - word_accuracy:          same-position word matches / gold word count
          - sentence_accuracy:      1.0 on an exact match, else 0.0
        plus the raw counts used for file-level aggregation
        (levenshtein_distance, matched_chars, gold_length, matched_words,
        gold_words_len). All rates are 0.0 when the gold label is empty.
        """
        # Normalize so composed and decomposed accents compare equal.
        predicted = normalize_greek_text(cipher_text)
        reference = normalize_greek_text(gold_label)
        reference_len = len(reference)

        # 1) Levenshtein Error Rate. When the rate exceeds 1 it is capped, and
        #    the distance is capped at the gold length along with it.
        if reference_len > 0:
            edit_distance = self.levenshtein_distance(predicted, reference)
            lev_rate = edit_distance / reference_len
            if lev_rate > 1:
                lev_rate, edit_distance = 1.0, reference_len
        else:
            edit_distance, lev_rate = 0, 0.0

        # 2) Character Error Rate: count positions where both strings agree.
        #    zip stops at the shorter string, so missing characters never match.
        char_hits = sum(1 for p, r in zip(predicted, reference) if p == r)
        char_error = 1 - (char_hits / reference_len) if reference_len > 0 else 0.0

        # 3) Word Accuracy: whitespace-split words compared position by position.
        reference_words = reference.split()
        predicted_words = predicted.split()
        word_hits = sum(1 for r, p in zip(reference_words, predicted_words) if r == p)
        reference_word_count = len(reference_words)
        word_acc = word_hits / reference_word_count if reference_word_count > 0 else 0.0

        # 4) Sentence Accuracy: all-or-nothing exact match.
        exact_match = 1.0 if predicted == reference else 0.0

        return {
            'levenshtein_error_rate': lev_rate,
            'character_error_rate': char_error,
            'word_accuracy': word_acc,
            'sentence_accuracy': exact_match,
            # Raw counts for file-level weighting
            'levenshtein_distance': edit_distance,
            'matched_chars': char_hits,
            'gold_length': reference_len,
            'matched_words': word_hits,
            'gold_words_len': reference_word_count
        }

    def process_json_file(self, filepath: str, fields: Dict[str, str]) -> Optional[Dict[str, float]]:
        """
        Process a single JSON file containing Greek Caesar-cipher data:
         - For each record: compute per-record metrics and log them.
         - For file-level: 
             * Weighted word accuracy (using L^k).
             * Weighted Levenshtein and Character Error Rate.
             * Average Sentence Accuracy.
             * total matched words vs. total gold words => file-level average word accuracy.
        """
        filename = os.path.basename(filepath)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if not content:
                    logging.warning(f"Skipping empty file: {filename}")
                    return None
                data = json.loads(content)
        except (json.JSONDecodeError, FileNotFoundError):
            logging.error(f"Error reading or decoding JSON in file: {filename}")
            return None

        if not isinstance(data, list):
            logging.error(f"File {filename} does not contain a list of records.")
            return None

        # Accumulators for file-level (weighted or otherwise)
        total_levenshtein_distance = 0
        total_matched_chars = 0
        total_gold_chars = 0

        # Weighted Word Accuracy accumulators
        sum_weighted_word_contrib = 0.0
        sum_length_weights = 0.0

        # Sentence accuracy accumulators
        sum_sentence_accuracy = 0.0
        record_count = 0

        # Accumulators for average word accuracy
        total_matched_words = 0
        total_gold_words = 0

        # Start logging to the details file
        with open(self.config.output_details_file, 'a', encoding='utf-8') as out_f:
            out_f.write(f"=== Processing file: {filename} ===\n")
            out_f.write(f"Model name: {fields.get('model_name', '')}, Shift: {fields.get('shift', '')}, "
                        f"Prompt Type: {fields.get('prompt_type', '')}, Temperature: {fields.get('temperature', '')}, "
                        f"Max Token: {fields.get('max_token', '')}\n\n")

            # k exponent for weighting word accuracy
            k = self.word_weight_exponent

            # Process each record
            for idx, record in enumerate(data, start=1):
                # Normalize inputs for Greek
                cipher_text = normalize_greek_text(record.get("cipher_text", "").strip())
                gold_label  = normalize_greek_text(record.get("gold_label", "").strip())

                if not cipher_text and not gold_label:
                    continue

                metrics = self.compute_per_record_metrics(cipher_text, gold_label)

                # Basic record-level fraction
                record_word_acc = metrics['word_accuracy']
                L_i = metrics['gold_words_len']

                # Weight factor = (L_i ** k)
                length_weight = (L_i ** k)
                # Weighted contribution for word accuracy
                weighted_contribution = length_weight * record_word_acc

                # Accumulate for file-level
                total_levenshtein_distance += metrics['levenshtein_distance']
                total_matched_chars += metrics['matched_chars']
                total_gold_chars += metrics['gold_length']

                # Accumulate for average word accuracy
                total_matched_words += metrics['matched_words']
                total_gold_words += metrics['gold_words_len']

                # Weighted Word Accuracy sums
                sum_weighted_word_contrib += weighted_contribution
                sum_length_weights += length_weight

                sum_sentence_accuracy += metrics['sentence_accuracy']
                record_count += 1

                # Log per-record details
                out_f.write(f"Record #{idx}\n")
                out_f.write(f"  cipher_text: {cipher_text}\n")
                out_f.write(f"  gold_label:  {gold_label}\n")
                out_f.write(f"  Levenshtein Error Rate: {metrics['levenshtein_error_rate']:.4f}\n")
                out_f.write(f"  Character Error Rate:   {metrics['character_error_rate']:.4f}\n")
                out_f.write(f"  Word Accuracy (record, raw): {record_word_acc:.4f}\n")
                out_f.write(f"  Sentence Accuracy: {metrics['sentence_accuracy']:.4f}\n")

                out_f.write(f"  matched_characters: {metrics['matched_chars']}\n")
                out_f.write(f"  gold_characters:    {metrics['gold_length']}\n")
                out_f.write(f"  matched_words:      {metrics['matched_words']}\n")
                out_f.write(f"  gold_words_len:     {metrics['gold_words_len']}\n")

                # Show the weighting details
                out_f.write(f"  Word length (L_i): {L_i},  k={k},  Weight factor = L_i^k = {length_weight}\n")
                out_f.write(f"  Weighted Word Contribution = {length_weight:.2f} * {record_word_acc:.4f} = {weighted_contribution:.4f}\n\n")

            # Final file-level Levenshtein Error Rate
            if total_gold_chars > 0:
                file_levenshtein_error_rate = total_levenshtein_distance / total_gold_chars
            else:
                file_levenshtein_error_rate = 0.0

            # Final file-level Character Error Rate
            if total_gold_chars > 0:
                file_character_error_rate = 1 - (total_matched_chars / total_gold_chars)
            else:
                file_character_error_rate = 0.0

            # Final Weighted Word Accuracy
            if sum_length_weights > 0:
                file_weighted_word_accuracy = sum_weighted_word_contrib / sum_length_weights
            else:
                file_weighted_word_accuracy = 0.0

            # Sentence Accuracy (average)
            if record_count > 0:
                file_sentence_accuracy = sum_sentence_accuracy / record_count
            else:
                file_sentence_accuracy = 0.0

            # File-level average (unweighted) word accuracy
            if total_gold_words > 0:
                file_average_word_accuracy = total_matched_words / total_gold_words
            else:
                file_average_word_accuracy = 0.0

            # Write file summary
            out_f.write(f"--- Summary for file: {filename} ---\n")
            out_f.write(f"Total records processed: {record_count}\n\n")
            out_f.write(f"Weighted Levenshtein Error Rate (File-level):  {file_levenshtein_error_rate:.4f}\n")
            out_f.write(f"Weighted Character Error Rate (File-level):    {file_character_error_rate:.4f}\n")
            out_f.write(f"Weighted Word Accuracy (File-level, k={k}):     {file_weighted_word_accuracy:.4f}\n")
            out_f.write(f"Average Sentence Accuracy (File-level):        {file_sentence_accuracy:.4f}\n")
            out_f.write(f"Average Word Accuracy (File-level, unweighted): {file_average_word_accuracy:.4f}\n\n")

            out_f.write(f"File-level totals:\n")
            out_f.write(f"  total_characters_in_gold_label: {total_gold_chars}\n")
            out_f.write(f"  total_matched_characters:       {total_matched_chars}\n")
            out_f.write(f"  total_words_in_gold_label:      {total_gold_words}\n")
            out_f.write(f"  total_matched_words:            {total_matched_words}\n")
            out_f.write("==========================================\n\n")

        # Return a dictionary of file-level results (used for CSV)
        return {
            'filename': filename,
            'model_name': fields.get('model_name', ''),
            'shift': fields.get('shift', ''),
            'prompt_type': fields.get('prompt_type', ''),
            'temperature': fields.get('temperature', ''),
            'max_token': fields.get('max_token', ''),
            'levenshtein_error_rate': file_levenshtein_error_rate,
            'character_error_rate': file_character_error_rate,
            'weighted_word_accuracy': file_weighted_word_accuracy,
            'sentence_accuracy': file_sentence_accuracy,
            'avg_word_accuracy': file_average_word_accuracy
        }

    def process_directory(self):
        """
        Iterates over all .json files in the directory, processes each, 
        and accumulates final metrics for CSV output.
        """
        for filename in os.listdir(self.directory):
            if filename.endswith('.json'):
                filepath = os.path.join(self.directory, filename)
                filename_without_ext = filename[:-5]
                
                # Extract timestamp from filename to match with config
                timestamp = self.extract_timestamp(filename_without_ext)
                fields = self.config_data.get(timestamp)
                
                if not fields:
                    logging.warning(
                        f"Timestamp '{timestamp}' from filename '{filename_without_ext}' "
                        f"not found in config file {self.config_file}"
                    )
                    continue

                file_results = self.process_json_file(filepath, fields)
                if file_results:
                    self.file_level_results.append(file_results)

    def write_csv_results(self):
        """
        Writes the accumulated file-level metrics to a CSV file, including
        Weighted Word Accuracy (k=2), Average Word Accuracy, etc.
        """
        csv_file = self.config.csv_file_result

        def shift_as_int(val):
            try:
                return int(val['shift'])
            except ValueError:
                return 999999  # fallback if not numeric

        self.file_level_results.sort(key=lambda x: (x['model_name'], shift_as_int(x), x['prompt_type']))

        with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                'Filename', 
                'Model', 
                'Shift', 
                'Prompt Type', 
                'Temperature', 
                'Max Token',
                'Levenshtein Error Rate (%)', 
                'Character Error Rate (%)',
                'Weighted Word Accuracy (%)',
                'Sentence Accuracy (%)',
                'Average Word Accuracy (%)'
            ])
            for res in self.file_level_results:
                writer.writerow([
                    res['filename'],
                    res['model_name'],
                    res['shift'],
                    res['prompt_type'],
                    res['temperature'],
                    res['max_token'],
                    f"{res['levenshtein_error_rate'] * 100:.2f}",
                    f"{res['character_error_rate'] * 100:.2f}",
                    f"{res['weighted_word_accuracy'] * 100:.2f}",
                    f"{res['sentence_accuracy'] * 100:.2f}",
                    f"{res['avg_word_accuracy'] * 100:.2f}"
                ])
        logging.info(f"CSV results written to '{csv_file}'")

    def run(self):
        """
        Main entry point: processes the JSON files, then writes final CSV results.
        """
        self.process_directory()
        self.write_csv_results()
        logging.info(f"Processing finished.\n"
                     f"  - Detailed record-level results: '{self.config.output_details_file}'\n"
                     f"  - File-level summary CSV: '{self.config.csv_file_result}'")

if __name__ == '__main__':
    # Entry point for the Greek-text cell: build the default Config,
    # construct the benchmark (which parses the config file and resets the
    # details file), then process the data directory and write the CSV.
    config = Config()
    caesar_test = CaesarCipherTest(config)
    caesar_test.run()


In [None]:
# =======================
# VIGENERE - NATURAL TEXT INPUT
# =======================


import os
import json
import csv
import logging
import re
from collections import defaultdict
from typing import Dict, Optional

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

class Config:
    """File locations used by the Vigenere-cipher natural-text benchmark."""

    def __init__(self):
        # --- Inputs ---
        # Folder holding the JSON result files.
        self.data_directory = "./data/encoded/vigenere-cipher/"
        # Experiment log that is parsed line by line for run metadata.
        self.config_file_path = "./vigenere-cipher-experiments-log.txt"

        # --- Outputs ---
        # Aggregated, file-level metrics.
        self.csv_file_result = "vigenere-cipher-natural-text-benchmark-result.csv"
        # Record-by-record details plus per-file summaries.
        self.output_details_file = "vigenere-cipher-natural-text-benchmark-process.txt"

class CaesarCipherTest:
    def __init__(self, config: Config):
        self.config = config
        self.directory = self.config.data_directory
        self.config_file = self.config.config_file_path
        # Parse config file once for metadata
        self.config_data = self.parse_config_file()

        # Exponent for weighting sequences by their word length
        # (k=2 means longer sequences are squared in their influence)
        self.word_weight_exponent = 2

        # Accumulate final (file-level) results for the CSV
        self.file_level_results = []
        
        # Clear or create the output details file at the start
        with open(self.config.output_details_file, 'w', encoding='utf-8') as f:
            f.write("Comprehensive operation details for random text results:\n\n")

    def parse_config_file(self) -> Dict[str, Dict[str, str]]:
        config_data = {}
        try:
            with open(self.config_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        timestamp = self.extract_timestamp(line)
                        fields = self.parse_filename(line)
                        config_data[timestamp] = fields
        except FileNotFoundError:
            logging.error(f"Config file {self.config_file} not found.")
        return config_data

    @staticmethod
    def extract_timestamp(filename: str) -> str:
        """
        Example: 20230101_122345_some-other-stuff -> timestamp would be 20230101_122345
        """
        parts = filename.split('_')
        if len(parts) >= 2:
            return f"{parts[0]}_{parts[1]}"
        return parts[0]

    @staticmethod
    def parse_filename(filename: str) -> Dict[str, str]:
        """
        A robust approach that handles both Llama-like (__0.01_64) 
        and Mistral-like (_0.01_64) suffixes.
        """
        if filename.endswith('.json'):
            filename = filename[:-5]

        pattern = r'^(.*)_([\d\.]+)_(\d+)$'
        match = re.match(pattern, filename)
        if match:
            pre_model_info = match.group(1)
            temperature   = match.group(2)
            max_token     = match.group(3)
        else:
            pre_model_info = filename
            temperature    = ''
            max_token      = ''

        pre_fields = pre_model_info.split('_')

        date        = pre_fields[0] if len(pre_fields) > 0 else ''
        time        = pre_fields[1] if len(pre_fields) > 1 else ''
        samples     = pre_fields[2] if len(pre_fields) > 2 else ''
        shift       = pre_fields[3] if len(pre_fields) > 3 else ''
        prompt_type = '_'.join(pre_fields[4:6]) if len(pre_fields) > 5 else ''
        method      = pre_fields[6] if len(pre_fields) > 6 else ''

        shot_and_model_name_parts = pre_fields[7:]
        shot_and_model_name       = '_'.join(shot_and_model_name_parts)

        if '-' in shot_and_model_name:
            shot, model_name = shot_and_model_name.split('-', 1)
        else:
            shot = ''
            model_name = shot_and_model_name

        if model_name.startswith('models--'):
            model_name = model_name[len('models--'):]
            model_parts = model_name.split('--')
            model_name  = model_parts[-1]
        model_name = model_name.rstrip('_')

        return {
            'date': date,
            'time': time,
            'samples': samples,
            'shift': shift,
            'prompt_type': prompt_type,
            'method': method,
            'shot': shot,
            'model_name': model_name,
            'temperature': temperature,
            'max_token': max_token
        }

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """
        Compute the Levenshtein edit distance between two strings s1 and s2
        (minimum number of single-character edits).
        """
        if not s1:
            return len(s2)
        if not s2:
            return len(s1)

        len_s1 = len(s1)
        len_s2 = len(s2)

        dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

        for i in range(len_s1 + 1):
            dp[i][0] = i
        for j in range(len_s2 + 1):
            dp[0][j] = j

        for i in range(1, len_s1 + 1):
            for j in range(1, len_s2 + 1):
                cost = 0 if s1[i-1] == s2[j-1] else 1
                dp[i][j] = min(
                    dp[i-1][j] + 1,       # deletion
                    dp[i][j-1] + 1,       # insertion
                    dp[i-1][j-1] + cost   # substitution
                )

        return dp[len_s1][len_s2]

    def compute_per_record_metrics(self, cipher_text: str, gold_label: str) -> Dict[str, float]:
        """
        Compute per-record metrics:
          1) Levenshtein Error Rate: dist / len(gold_label)
          2) Character Error Rate:   1 - (# matched chars / len(gold_label))
          3) Word Accuracy (raw fraction for a single record)
          4) Sentence Accuracy: 1.0 if exact match else 0.0
        """
        gold_len = len(gold_label)
        
        # 1) Levenshtein Error Rate
        if gold_len > 0:
            dist = self.levenshtein_distance(cipher_text, gold_label)
            levenshtein_error_rate = dist / gold_len
            # Cap the Levenshtein Error Rate at 100 if it exceeds this threshold
            if levenshtein_error_rate > 1:
                levenshtein_error_rate = 1.0000
                dist = gold_len
        else:
            dist = 0
            levenshtein_error_rate = 0.0

        # 2) Character Error Rate
        matched_chars = 0
        for i in range(gold_len):
            if i < len(cipher_text) and cipher_text[i] == gold_label[i]:
                matched_chars += 1

        if gold_len > 0:
            character_error_rate = 1 - (matched_chars / gold_len)
        else:
            character_error_rate = 0.0

        # 3) Word Accuracy (unweighted, just per-record fraction)
        gold_words = gold_label.split()
        pred_words = cipher_text.split()
        matched_words = sum(gw == pw for gw, pw in zip(gold_words, pred_words))
        gold_words_len = len(gold_words)

        if gold_words_len > 0:
            word_accuracy = matched_words / gold_words_len
        else:
            word_accuracy = 0.0

        # 4) Sentence Accuracy
        sentence_accuracy = 1.0 if cipher_text == gold_label else 0.0

        return {
            'levenshtein_error_rate': levenshtein_error_rate,
            'character_error_rate': character_error_rate,
            'word_accuracy': word_accuracy,     # raw fraction for this record
            'sentence_accuracy': sentence_accuracy,
            # Additional for file-level weighting
            'levenshtein_distance': dist,
            'matched_chars': matched_chars,
            'gold_length': gold_len,
            'matched_words': matched_words,
            'gold_words_len': gold_words_len
        }

    def process_json_file(self, filepath: str, fields: Dict[str, str]) -> Optional[Dict[str, float]]:
        """
        Process a single JSON file:
         - For each record: compute per-record metrics and log them.
         - For file-level: 
             * Weighted word accuracy (using L^k).
             * Weighted Levenshtein and Character Error Rate.
             * Average Sentence Accuracy.
             * NEW: total matched words vs. total gold words => file-level average word accuracy.
        """
        filename = os.path.basename(filepath)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if not content:
                    logging.warning(f"Skipping empty file: {filename}")
                    return None
                data = json.loads(content)
        except (json.JSONDecodeError, FileNotFoundError):
            logging.error(f"Error reading or decoding JSON in file: {filename}")
            return None

        if not isinstance(data, list):
            logging.error(f"File {filename} does not contain a list of records.")
            return None

        # Accumulators for file-level (weighted or otherwise)
        total_levenshtein_distance = 0
        total_matched_chars = 0
        total_gold_chars = 0

        # Weighted Word Accuracy accumulators
        sum_weighted_word_contrib = 0.0
        sum_length_weights = 0.0

        # Sentence accuracy accumulators
        sum_sentence_accuracy = 0.0
        record_count = 0

        # Accumulators for average word accuracy
        total_matched_words = 0
        total_gold_words = 0

        # Start logging to the details file
        with open(self.config.output_details_file, 'a', encoding='utf-8') as out_f:
            out_f.write(f"=== Processing file: {filename} ===\n")
            out_f.write(f"Model name: {fields.get('model_name', '')}, Shift: {fields.get('shift', '')}, "
                        f"Prompt Type: {fields.get('prompt_type', '')}, Temperature: {fields.get('temperature', '')}, "
                        f"Max Token: {fields.get('max_token', '')}\n\n")

            # k exponent for weighting word accuracy
            k = self.word_weight_exponent

            # Process each record
            for idx, record in enumerate(data, start=1):
                cipher_text = record.get("cipher_text", "").strip()
                gold_label = record.get("gold_label", "").strip()

                if not cipher_text and not gold_label:
                    continue

                metrics = self.compute_per_record_metrics(cipher_text, gold_label)

                # Basic record-level fraction
                record_word_acc = metrics['word_accuracy']
                L_i = metrics['gold_words_len']

                # Weight factor = (L_i^k)
                length_weight = (L_i ** k)
                # Weighted contribution for word accuracy
                weighted_contribution = length_weight * record_word_acc

                # Accumulate for file-level
                total_levenshtein_distance += metrics['levenshtein_distance']
                total_matched_chars += metrics['matched_chars']
                total_gold_chars += metrics['gold_length']

                # Accumulate for average word accuracy
                total_matched_words += metrics['matched_words']
                total_gold_words += metrics['gold_words_len']

                # Weighted Word Accuracy sums
                sum_weighted_word_contrib += weighted_contribution
                sum_length_weights += length_weight

                sum_sentence_accuracy += metrics['sentence_accuracy']
                record_count += 1

                # Log per-record details
                out_f.write(f"Record #{idx}\n")
                out_f.write(f"  cipher_text: {cipher_text}\n")
                out_f.write(f"  gold_label:  {gold_label}\n")
                out_f.write(f"  Levenshtein Error Rate: {metrics['levenshtein_error_rate']:.4f}\n")
                out_f.write(f"  Character Error Rate:   {metrics['character_error_rate']:.4f}\n")
                out_f.write(f"  Word Accuracy (record, raw): {record_word_acc:.4f}\n")
                out_f.write(f"  Sentence Accuracy: {metrics['sentence_accuracy']:.4f}\n")

                # Print matched chars, matched words, gold chars, gold words
                out_f.write(f"  matched_characters: {metrics['matched_chars']}\n")
                out_f.write(f"  gold_characters:    {metrics['gold_length']}\n")
                out_f.write(f"  matched_words:      {metrics['matched_words']}\n")
                out_f.write(f"  gold_words_len:     {metrics['gold_words_len']}\n")

                # Show the weighting details
                out_f.write(f"  Word length (L_i): {L_i},  k={k},  Weight factor = L_i^k = {length_weight}\n")
                out_f.write(f"  Weighted Word Contribution = {length_weight:.2f} * {record_word_acc:.4f} = {weighted_contribution:.4f}\n\n")

            # Final file-level Levenshtein Error Rate
            if total_gold_chars > 0:
                file_levenshtein_error_rate = total_levenshtein_distance / total_gold_chars
            else:
                file_levenshtein_error_rate = 0.0

            # Final file-level Character Error Rate
            if total_gold_chars > 0:
                file_character_error_rate = 1 - (total_matched_chars / total_gold_chars)
            else:
                file_character_error_rate = 0.0

            # Final Weighted Word Accuracy
            if sum_length_weights > 0:
                file_weighted_word_accuracy = sum_weighted_word_contrib / sum_length_weights
            else:
                file_weighted_word_accuracy = 0.0

            # Sentence Accuracy (average)
            if record_count > 0:
                file_sentence_accuracy = sum_sentence_accuracy / record_count
            else:
                file_sentence_accuracy = 0.0

            # NEW: file-level average (unweighted) word accuracy
            if total_gold_words > 0:
                file_average_word_accuracy = total_matched_words / total_gold_words
            else:
                file_average_word_accuracy = 0.0

            # Write file summary
            out_f.write(f"--- Summary for file: {filename} ---\n")
            out_f.write(f"Total records processed: {record_count}\n\n")
            out_f.write(f"Weighted Levenshtein Error Rate (File-level):  {file_levenshtein_error_rate:.4f}\n")
            out_f.write(f"Weighted Character Error Rate (File-level):    {file_character_error_rate:.4f}\n")
            out_f.write(f"Weighted Word Accuracy (File-level, k={k}):     {file_weighted_word_accuracy:.4f}\n")
            out_f.write(f"Average Sentence Accuracy (File-level):        {file_sentence_accuracy:.4f}\n")
            out_f.write(f"Average Word Accuracy (File-level, unweighted): {file_average_word_accuracy:.4f}\n\n")

            # NEW: file-level totals
            out_f.write(f"File-level totals:\n")
            out_f.write(f"  total_characters_in_gold_label: {total_gold_chars}\n")
            out_f.write(f"  total_matched_characters:       {total_matched_chars}\n")
            out_f.write(f"  total_words_in_gold_label:      {total_gold_words}\n")
            out_f.write(f"  total_matched_words:            {total_matched_words}\n")
            out_f.write("==========================================\n\n")

        # Return a dictionary of file-level results (used for CSV)
        return {
            'filename': filename,
            'model_name': fields.get('model_name', ''),
            'shift': fields.get('shift', ''),
            'prompt_type': fields.get('prompt_type', ''),
            'temperature': fields.get('temperature', ''),
            'max_token': fields.get('max_token', ''),
            'levenshtein_error_rate': file_levenshtein_error_rate,
            'character_error_rate': file_character_error_rate,
            'weighted_word_accuracy': file_weighted_word_accuracy,
            'sentence_accuracy': file_sentence_accuracy,
            # Include the new average word accuracy in the CSV
            'avg_word_accuracy': file_average_word_accuracy
        }

    def process_directory(self):
        """
        Iterates over all .json files in the directory, processes each, 
        and accumulates final metrics for CSV output.
        """
        for filename in os.listdir(self.directory):
            if filename.endswith('.json'):
                filepath = os.path.join(self.directory, filename)
                filename_without_ext = filename[:-5]
                
                # Extract timestamp from filename to match with config
                timestamp = self.extract_timestamp(filename_without_ext)
                fields = self.config_data.get(timestamp)
                
                if not fields:
                    logging.warning(
                        f"Timestamp '{timestamp}' from filename '{filename_without_ext}' "
                        f"not found in config file {self.config_file}"
                    )
                    continue

                file_results = self.process_json_file(filepath, fields)
                if file_results:
                    self.file_level_results.append(file_results)

    def write_csv_results(self):
        """
        Writes the accumulated file-level metrics to a CSV file, including
        Weighted Word Accuracy (k=2), Average Word Accuracy, etc.
        """
        csv_file = self.config.csv_file_result

        def shift_as_int(val):
            try:
                return int(val['shift'])
            except ValueError:
                return 999999  # fallback if not numeric

        self.file_level_results.sort(key=lambda x: (x['model_name'], shift_as_int(x), x['prompt_type']))

        with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                'Filename', 
                'Model', 
                'Shift', 
                'Prompt Type', 
                'Temperature', 
                'Max Token',
                'Levenshtein Error Rate (%)', 
                'Character Error Rate (%)',
                'Weighted Word Accuracy (%)',
                'Sentence Accuracy (%)',
                'Average Word Accuracy (%)'
            ])
            for res in self.file_level_results:
                writer.writerow([
                    res['filename'],
                    res['model_name'],
                    res['shift'],
                    res['prompt_type'],
                    res['temperature'],
                    res['max_token'],
                    f"{res['levenshtein_error_rate'] * 100:.2f}",
                    f"{res['character_error_rate'] * 100:.2f}",
                    f"{res['weighted_word_accuracy'] * 100:.2f}",
                    f"{res['sentence_accuracy'] * 100:.2f}",
                    f"{res['avg_word_accuracy'] * 100:.2f}"
                ])
        logging.info(f"CSV results written to '{csv_file}'")

    def run(self):
        """
        Main entry point: processes the JSON files, then writes final CSV results.
        """
        self.process_directory()
        self.write_csv_results()
        logging.info(f"Processing finished.\n"
                     f"  - Detailed record-level results: '{self.config.output_details_file}'\n"
                     f"  - File-level summary CSV: '{self.config.csv_file_result}'")

# Entry point: build the default Config, construct the benchmark object with
# it, and run the whole pipeline (directory processing, then CSV output).
if __name__ == '__main__':
    config = Config()
    caesar_test = CaesarCipherTest(config)
    caesar_test.run()
