In [24]:
import json
import pandas as pd
from sacrebleu.metrics import BLEU
import glob
import os

In [28]:
jsonl_files = glob.glob('*.jsonl')
print(f"Found {len(jsonl_files)} JSONL files")

if len(jsonl_files) == 0:
    print("No JSONL files found. Make sure you extracted the ZIP file to this folder.")

Found 37 JSONL files


In [21]:
def get_translation_from_sample(sample):
    if 'teacher-CoT-translation' in sample:
        return sample['teacher-CoT-translation'], 'Teacher-CoT'
    elif 'self-CoT-translation' in sample:
        return sample['self-CoT-translation'], 'Self-CoT'
    elif 'direct-translation' in sample:
        return sample['direct-translation'], 'Direct'
    return None, None

def is_empty_translation(translation):
    if translation is None:
        return True
    if isinstance(translation, str) and translation.strip() == '':
        return True
    return False

def compute_bleu(hypothesis, references):
    bleu = BLEU()
    score = bleu.sentence_score(hypothesis, references)
    return score.score


In [22]:
def analyze_file(filepath):
    print(f"\nAnalyzing: {filepath}")
    
    data = load_jsonl(filepath)
    
    empty_count = 0
    bleu_scores = []
    method = None
    
    for sample in data:
        translation, current_method = get_translation_from_sample(sample)
        
        if method is None:
            method = current_method
        
        if is_empty_translation(translation):
            empty_count += 1
            continue
        
        references = []
        if 'reference' in sample and sample['reference']:
            references.append(sample['reference'])
        if 'reference2' in sample and sample['reference2']:
            references.append(sample['reference2'])

        if not references:
            continue
        
        try:
            bleu_score = compute_bleu(translation, references)
            bleu_scores.append(bleu_score)
        except Exception as e:
            continue

    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    
    result = {
        'filename': filepath,
        'method': method,
        'empty_count': empty_count,
        'avg_bleu': round(avg_bleu, 2),
        'num_valid_samples': len(bleu_scores),
        'total_samples': len(data)
    }
    
    print(f"  Method: {result['method']}")
    print(f"  Empty translations: {result['empty_count']}/{result['total_samples']}")
    print(f"  Average BLEU: {result['avg_bleu']}")
    print(f"  Valid samples: {result['num_valid_samples']}")
    
    return result


In [23]:
all_results = []

for filepath in jsonl_files:
    result = analyze_file(filepath)
    all_results.append(result)

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 


Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Self-CoT__NA_es-en.jsonl
  Method: Self-CoT
  Empty translations: 26/50
  Average BLEU: 9.47
  Valid samples: 24

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Self-CoT__NA_en-es.jsonl
  Method: Self-CoT
  Empty translations: 14/50
  Average BLEU: 2.32
  Valid samples: 36

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Teacher-CoT__Qwen_Qwen3-32B_fr-en.jsonl
  Method: Teacher-CoT
  Empty translations: 26/50
  Average BLEU: 30.23
  Valid samples: 24

Analyzing: student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3-32B_en-es.jsonl
  Method: Teacher-CoT
  Empty translations: 9/50
  Average BLEU: 3.26
  Valid samples: 41

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Direct__NA_es-en.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Teacher-CoT__Qwen_Qwen3-32B_es-en.jsonl
  Method: Teacher-CoT


It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 

  Method: Teacher-CoT
  Empty translations: 25/50
  Average BLEU: 3.68
  Valid samples: 25

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B__Direct__NA_es-en.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Direct__NA_fr-en.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Direct__NA_en-es.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B__Teacher-CoT__Qwen_Qwen3-32B_fr-en.jsonl
  Method: Teacher-CoT
  Empty translations: 1/50
  Average BLEU: 35.26
  Valid samples: 49

Analyzing: student_Qwen_Qwen3-8B__Self-CoT__NA_fr-en.jsonl
  Method: Self-CoT
  Empty translations: 10/50
  Average BLEU: 31.97
  Valid samples: 40

Analyzing: student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3-32B_es-en.json

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 

  Method: Self-CoT
  Empty translations: 12/50
  Average BLEU: 31.29
  Valid samples: 38

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Self-CoT__NA_en-es.jsonl
  Method: Self-CoT
  Empty translations: 33/50
  Average BLEU: 4.23
  Valid samples: 17

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B__Direct__NA_fr-en.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Teacher-CoT__Qwen_Qwen3-32B_es-en.jsonl
  Method: Teacher-CoT
  Empty translations: 18/50
  Average BLEU: 40.21
  Valid samples: 32

Analyzing: student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Self-CoT__NA_fr-en.jsonl
  Method: Self-CoT
  Empty translations: 35/50
  Average BLEU: 7.6
  Valid samples: 15

Analyzing: student_Qwen_Qwen3-8B__Teacher-Synthesized-CoT__Qwen_Qwen3-32B_fr-en.jsonl
  Method: None
  Empty translations: 50/50
  Average BLEU: 0
  Valid samples: 0

Analyzing: student_deepseek-ai_Dee

In [19]:
df_results = pd.DataFrame(all_results)

df_results = df_results.sort_values('filename')

print("Results Summary:")
print(df_results[['filename', 'method', 'empty_count', 'avg_bleu', 'num_valid_samples']])


Results Summary:
                                             filename       method  \
36      student_Qwen_Qwen3-8B__Direct__NA_en-es.jsonl         None   
11      student_Qwen_Qwen3-8B__Direct__NA_es-en.jsonl         None   
22      student_Qwen_Qwen3-8B__Direct__NA_fr-en.jsonl         None   
33    student_Qwen_Qwen3-8B__Self-CoT__NA_en-es.jsonl     Self-CoT   
9     student_Qwen_Qwen3-8B__Self-CoT__NA_es-en.jsonl     Self-CoT   
18    student_Qwen_Qwen3-8B__Self-CoT__NA_fr-en.jsonl     Self-CoT   
3   student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3...  Teacher-CoT   
19  student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3...  Teacher-CoT   
7   student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3...  Teacher-CoT   
28  student_Qwen_Qwen3-8B__Teacher-Synthesized-CoT...         None   
16  student_deepseek-ai_DeepSeek-R1-Distill-Llama-...         None   
4   student_deepseek-ai_DeepSeek-R1-Distill-Llama-...         None   
31  student_deepseek-ai_DeepSeek-R1-Distill-Llama-...         None   
1  

In [None]:
def extract_lang_pair(filename):
    parts = filename.replace('.jsonl', '').split('_')
    for part in parts:
        if '-' in part and len(part) <= 6:
            return part
    return 'unknown'

df_results['lang_pair'] = df_results['filename'].apply(extract_lang_pair)

print("\nComparison by Language Pair:")
print("="*50)

for lang_pair in sorted(df_results['lang_pair'].unique()):
    subset = df_results[df_results['lang_pair'] == lang_pair]
    
    if len(subset) > 0:
        print(f"\n{lang_pair.upper()}:")
        for _, row in subset.iterrows():
            method_name = row['method'] if row['method'] else 'Direct'
            print(f"  {method_name}: BLEU = {row['avg_bleu']}, Empty = {row['empty_count']}")
        
        valid_subset = subset[subset['num_valid_samples'] > 0]
        if len(valid_subset) > 0:
            best = valid_subset.loc[valid_subset['avg_bleu'].idxmax()]
            best_method = best['method'] if best['method'] else 'Direct'
            print(f"  Best method: {best_method} (BLEU: {best['avg_bleu']})")


Comparison by Language Pair:

EN-ES:
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 0.0, Empty = 50
  Teacher-CoT: BLEU = 3.26, Empty = 9
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 2.32, Empty = 14
  Teacher-CoT: BLEU = 3.68, Empty = 25
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 4.23, Empty = 33
  Teacher-CoT: BLEU = 5.16, Empty = 25
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 5.42, Empty = 29
  Teacher-CoT: BLEU = 3.11, Empty = 1
  Best method: Self-CoT (BLEU: 5.42)

ES-EN:
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 0.0, Empty = 50
  Teacher-CoT: BLEU = 36.28, Empty = 5
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 31.29, Empty = 12
  Teacher-CoT: BLEU = 40.21, Empty = 18
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 9.47, Empty = 26
  Teacher-CoT: BLEU = 30.53, Empty = 27
  Direct: BLEU = 0.0, Empty = 50
  Self-CoT: BLEU = 22.75, Empty = 29
  Teacher-CoT: BLEU = 36.28, Empty = 1
  Best method: Teacher-CoT (BLEU: 40.21)

FR-EN:
  Di

In [15]:
print("\nOVERALL STATISTICS")
print("="*50)
print(f"Total files analyzed: {len(df_results)}")
print(f"Total empty translations: {df_results['empty_count'].sum()}")
print(f"Average BLEU across all files: {df_results['avg_bleu'].mean():.2f}")

print("\nBy Method:")
method_stats = df_results.groupby('method').agg({
    'avg_bleu': 'mean',
    'empty_count': 'sum',
    'num_valid_samples': 'sum'
}).round(2)
print(method_stats)

print("\nFiles with valid translations (excluding all-empty files):")
valid_files = df_results[df_results['num_valid_samples'] > 0]
print(f"Count: {len(valid_files)}")
print(f"Average BLEU for valid files: {valid_files['avg_bleu'].mean():.2f}")


OVERALL STATISTICS
Total files analyzed: 37
Total empty translations: 1143
Average BLEU across all files: 12.34

By Method:
             avg_bleu  empty_count  num_valid_samples
method                                               
Self-CoT        14.00          323                277
Teacher-CoT     24.03          170                430

Files with valid translations (excluding all-empty files):
Count: 22
Average BLEU for valid files: 20.75


In [None]:
df_results.to_csv('results_summary.csv', index=False)
print("Results saved to results_summary.csv")

with open('findings.txt', 'w') as f:
    f.write("TRANSLATION QUALITY ANALYSIS\n")
    f.write("="*50 + "\n\n")
    
    f.write(f"Total files analyzed: {len(df_results)}\n")
    f.write(f"Total empty translations: {df_results['empty_count'].sum()}\n")
    f.write(f"Average BLEU across all files: {df_results['avg_bleu'].mean():.2f}\n\n")
    
    f.write("By Method:\n")
    f.write(str(method_stats))
    f.write("\n\n")
    
    for lang_pair in sorted(df_results['lang_pair'].unique()):
        subset = df_results[df_results['lang_pair'] == lang_pair]
        f.write(f"\n{lang_pair.upper()}:\n")
        f.write("-"*30 + "\n")
        
        for _, row in subset.iterrows():
            method_name = row['method'] if row['method'] else 'Direct'
            f.write(f"{method_name}: BLEU={row['avg_bleu']}, Empty={row['empty_count']}\n")
        
        valid_subset = subset[subset['num_valid_samples'] > 0]
        if len(valid_subset) > 0:
            best = valid_subset.loc[valid_subset['avg_bleu'].idxmax()]
            best_method = best['method'] if best['method'] else 'Direct'
            f.write(f"Best: {best_method} (BLEU: {best['avg_bleu']})\n")

print("Findings saved to findings.txt")

Results saved to results_summary.csv
Findings saved to findings.txt

All done! Check findings.txt for your summary.
