In [1]:
import os
import pandas as pd
import numpy as np
import glob
from tqdm.notebook import tqdm
import re

In [2]:
# Function to find all TSV files recursively
def find_tsv_files(root_dir):
    tsv_files = []
    
    lang_pair_pattern = r'^[a-z]{2,4}\-[a-z]{2,4}$'
        
    try:
        subdirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
        lang_pair_dirs = [d for d in subdirs if re.match(lang_pair_pattern, d)]
        
        if lang_pair_dirs:
            # If language pair folders exist, search within them
            print(f"Found {len(lang_pair_dirs)} language pair folders: {', '.join(lang_pair_dirs)}")
            for lang_dir in lang_pair_dirs:
                lang_dir_path = os.path.join(root_dir, lang_dir)
                tsv_in_lang_dir = glob.glob(os.path.join(lang_dir_path, "*.tsv"))
                print(f"  Found {len(tsv_in_lang_dir)} TSV files in {lang_dir}")
                tsv_files.extend(tsv_in_lang_dir)
        else:
            # If no language pair folders, search in the current folder
            print(f"No language pair folders found. Searching in the root directory.")
            tsv_files = glob.glob(os.path.join(root_dir, "*.tsv"))
    except Exception as e:
        print(f"Error scanning directory: {str(e)}")
        tsv_files = glob.glob(os.path.join(root_dir, "*.tsv"))
    
    return tsv_files


In [3]:
def process_abnormal_lines(tsv_files):
    '''Processes the abnormal lines in tsv files: 
    lines that have more than default columns, lines that have \n and \t in any of the columns.
    '''
    # Process all files
    results = []
    
    for file_path in tqdm(tsv_files, desc="Processing files"):
        file_name = os.path.basename(file_path)
        parent_dir = os.path.basename(os.path.dirname(file_path))
        print(f"\nProcessing file: {file_name} (in {parent_dir})")
        
        # Read and analyze the first line to determine standard column count
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            standard_columns = len(first_line.split('\t'))
        
        print(f"Standard column count: {standard_columns}")
        
        # Read the entire file
        try:
            # Try standard reading
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
            print(f"File read successfully, columns: {len(df.columns)}")
        except Exception as e:
            print(f"Standard reading failed ({str(e)}), trying custom reading method...")
            # If standard reading fails, possibly due to inconsistent columns, try custom reading
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            
            # Manually parse rows
            rows = []
            for line in lines:
                fields = line.strip().split('\t')
                # Ensure each row has the same number of columns (pad with None if needed)
                padded_fields = fields + [None] * (standard_columns - len(fields))
                rows.append(padded_fields[:standard_columns])
            
            # Use the first row as column names
            columns = rows[0]
            data = rows[1:]
            df = pd.DataFrame(data, columns=columns)
            print(f"Custom reading successful, columns: {len(df.columns)}")
        
        # Detect abnormal rows
        abnormal_rows = []
        normal_rows = []
        
        # Check if rows have the standard number of columns
        for i, row in df.iterrows():
            is_abnormal = False
            
            # Check if any column contains \n or \t
            for col in row:
                if isinstance(col, str) and ('\n' in col or '\t' in col):
                    is_abnormal = True
                    break
            
            # Check if the number of non-null columns matches the standard
            non_null_columns = row.count()
            if non_null_columns != standard_columns:
                is_abnormal = True
            
            # Categorize based on check results
            if is_abnormal:
                abnormal_rows.append(row)
            else:
                normal_rows.append(row)
        
        # Convert to DataFrames
        abnormal_df = pd.DataFrame(abnormal_rows, columns=df.columns)
        normal_df = pd.DataFrame(normal_rows, columns=df.columns)
        
        # Get language pair name
        lang_pair = file_name.split('.')[1]  #Use the part after "train"
        
        # Create output directory (same as input directory)
        output_dir = os.path.dirname(file_path)
        
        # Save normal and abnormal rows
        normal_output = os.path.join(output_dir, f"{lang_pair}.tsv")
        abnormal_output = os.path.join(output_dir, f"abnormal_{lang_pair}.tsv")
        
        normal_df.to_csv(normal_output, sep='\t', index=False)
        abnormal_df.to_csv(abnormal_output, sep='\t', index=False)
        
        # Record results
        result = {
            "Filename": file_name,
            "Language Pair": lang_pair,
            "Original Rows": len(df),
            "Normal Rows": len(normal_df),
            "Abnormal Rows": len(abnormal_df),
            "Abnormal Percentage": f"{len(abnormal_df)/len(df)*100:.2f}%",
            "Normal Data File": normal_output,
            "Abnormal Data File": abnormal_output
        }
        results.append(result)
        
        print(f"Processing completed:")
        print(f"  Original rows: {len(df)}")
        print(f"  Normal rows: {len(normal_df)}")
        print(f"  Abnormal rows: {len(abnormal_df)}")
        print(f"  Abnormal percentage: {len(abnormal_df)/len(df)*100:.2f}%")
        print(f"  Normal data saved to: {normal_output}")
        print(f"  Abnormal data saved to: {abnormal_output}")
    
    # Display overall results
    results_df = pd.DataFrame(results)
    display(results_df)
    
    # Calculate total statistics
    total_original = results_df["Original Rows"].sum()
    total_normal = results_df["Normal Rows"].sum()
    total_abnormal = results_df["Abnormal Rows"].sum()
    
    print(f"\nOverall Statistics:")
    print(f"  Total original rows: {total_original}")
    print(f"  Total normal rows: {total_normal}")
    print(f"  Total abnormal rows: {total_abnormal}")
    print(f"  Overall abnormal percentage: {total_abnormal/total_original*100:.2f}%")
    
    # Generate a summary of processed data by language pair
    lang_pair_summary = {}
    for _, row in results_df.iterrows():
        lang_pair = row["Language Pair"]
        if lang_pair not in lang_pair_summary:
            lang_pair_summary[lang_pair] = {
                "Files": 0,
                "Original Rows": 0,
                "Normal Rows": 0,
                "Abnormal Rows": 0
            }
        
        lang_pair_summary[lang_pair]["Files"] += 1
        lang_pair_summary[lang_pair]["Original Rows"] += row["Original Rows"]
        lang_pair_summary[lang_pair]["Normal Rows"] += row["Normal Rows"]
        lang_pair_summary[lang_pair]["Abnormal Rows"] += row["Abnormal Rows"]
    
    # Convert to DataFrame and display
    summary_df = pd.DataFrame.from_dict(lang_pair_summary, orient='index')
    summary_df["Abnormal Percentage"] = (summary_df["Abnormal Rows"] / summary_df["Original Rows"] * 100).round(2).astype(str) + '%'
    print("\nSummary by Language Pair:")
    display(summary_df)
    return None

In [11]:
folder_path = "../data/wmt_qe_22_train" 

tsv_files = find_tsv_files(folder_path)
print(f"Total TSV files found: {len(tsv_files)}")

process_abnormal_lines(tsv_files)

No language pair folders found. Searching in the root directory.
Total TSV files found: 8


Processing files:   0%|          | 0/8 [00:00<?, ?it/s]


Processing file: train.enzh.df.short.tsv (in wmt_qe_22_train)
Standard column count: 8
Standard reading failed (Error tokenizing data. C error: Expected 8 fields in line 6172, saw 9
), trying custom reading method...
Custom reading successful, columns: 8
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt_qe_22_train/enzh.tsv
  Abnormal data saved to: ../data/wmt_qe_22_train/abnormal_enzh.tsv

Processing file: train.enmr.df.short.updated.tsv (in wmt_qe_22_train)
Standard column count: 7
File read successfully, columns: 7
Processing completed:
  Original rows: 26000
  Normal rows: 26000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt_qe_22_train/enmr.tsv
  Abnormal data saved to: ../data/wmt_qe_22_train/abnormal_enmr.tsv

Processing file: train.ruen.df.short.tsv (in wmt_qe_22_train)
Standard column count: 8
File read successfully, columns: 8
Processing com

Unnamed: 0,Filename,Language Pair,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage,Normal Data File,Abnormal Data File
0,train.enzh.df.short.tsv,enzh,7000,7000,0,0.00%,../data/wmt_qe_22_train/enzh.tsv,../data/wmt_qe_22_train/abnormal_enzh.tsv
1,train.enmr.df.short.updated.tsv,enmr,26000,26000,0,0.00%,../data/wmt_qe_22_train/enmr.tsv,../data/wmt_qe_22_train/abnormal_enmr.tsv
2,train.ruen.df.short.tsv,ruen,7000,7000,0,0.00%,../data/wmt_qe_22_train/ruen.tsv,../data/wmt_qe_22_train/abnormal_ruen.tsv
3,train.roen.df.short.tsv,roen,6776,6774,2,0.03%,../data/wmt_qe_22_train/roen.tsv,../data/wmt_qe_22_train/abnormal_roen.tsv
4,train.eten.df.short.tsv,eten,7000,6996,4,0.06%,../data/wmt_qe_22_train/eten.tsv,../data/wmt_qe_22_train/abnormal_eten.tsv
5,train.neen.df.short.tsv,neen,7000,7000,0,0.00%,../data/wmt_qe_22_train/neen.tsv,../data/wmt_qe_22_train/abnormal_neen.tsv
6,train.sien.df.short.tsv,sien,7000,7000,0,0.00%,../data/wmt_qe_22_train/sien.tsv,../data/wmt_qe_22_train/abnormal_sien.tsv
7,train.ende.df.short.tsv,ende,7000,6992,8,0.11%,../data/wmt_qe_22_train/ende.tsv,../data/wmt_qe_22_train/abnormal_ende.tsv



Overall Statistics:
  Total original rows: 74776
  Total normal rows: 74762
  Total abnormal rows: 14
  Overall abnormal percentage: 0.02%

Summary by Language Pair:


Unnamed: 0,Files,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage
enzh,1,7000,7000,0,0.0%
enmr,1,26000,26000,0,0.0%
ruen,1,7000,7000,0,0.0%
roen,1,6776,6774,2,0.03%
eten,1,7000,6996,4,0.06%
neen,1,7000,7000,0,0.0%
sien,1,7000,7000,0,0.0%
ende,1,7000,6992,8,0.11%


In [20]:
folder_path2 = "../data/wmt_qe_2023_data_task1_da" 
tsv_files_23 = find_tsv_files(folder_path2)


Found 4 language pair folders: en-ta, en-hi, en-te, en-gu
  Found 2 TSV files in en-ta
  Found 2 TSV files in en-hi
  Found 2 TSV files in en-te
  Found 2 TSV files in en-gu


In [22]:
process_abnormal_lines(tsv_files_23)

Processing files:   0%|          | 0/8 [00:00<?, ?it/s]


Processing file: dev.enta.df.short.tsv (in en-ta)
Standard column count: 7
File read successfully, columns: 7
Processing completed:
  Original rows: 1000
  Normal rows: 1000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt_qe_2023_data_task1_da/en-ta/enta.tsv
  Abnormal data saved to: ../data/wmt_qe_2023_data_task1_da/en-ta/abnormal_enta.tsv

Processing file: train.enta.df.short.tsv (in en-ta)
Standard column count: 7
File read successfully, columns: 7
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt_qe_2023_data_task1_da/en-ta/enta.tsv
  Abnormal data saved to: ../data/wmt_qe_2023_data_task1_da/en-ta/abnormal_enta.tsv

Processing file: train.enhi.df.short.tsv (in en-hi)
Standard column count: 7
File read successfully, columns: 7
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data

Unnamed: 0,Filename,Language Pair,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage,Normal Data File,Abnormal Data File
0,dev.enta.df.short.tsv,enta,1000,1000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-ta/enta.tsv,../data/wmt_qe_2023_data_task1_da/en-ta/abnorm...
1,train.enta.df.short.tsv,enta,7000,7000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-ta/enta.tsv,../data/wmt_qe_2023_data_task1_da/en-ta/abnorm...
2,train.enhi.df.short.tsv,enhi,7000,7000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-hi/enhi.tsv,../data/wmt_qe_2023_data_task1_da/en-hi/abnorm...
3,dev.enhi.df.short.tsv,enhi,1000,1000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-hi/enhi.tsv,../data/wmt_qe_2023_data_task1_da/en-hi/abnorm...
4,train.ente.df.short.tsv,ente,7000,7000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-te/ente.tsv,../data/wmt_qe_2023_data_task1_da/en-te/abnorm...
5,dev.ente.df.short.tsv,ente,1000,1000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-te/ente.tsv,../data/wmt_qe_2023_data_task1_da/en-te/abnorm...
6,train.engu.df.short.tsv,engu,7000,7000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-gu/engu.tsv,../data/wmt_qe_2023_data_task1_da/en-gu/abnorm...
7,dev.engu.df.short.tsv,engu,1000,1000,0,0.00%,../data/wmt_qe_2023_data_task1_da/en-gu/engu.tsv,../data/wmt_qe_2023_data_task1_da/en-gu/abnorm...



Overall Statistics:
  Total original rows: 32000
  Total normal rows: 32000
  Total abnormal rows: 0
  Overall abnormal percentage: 0.00%

Summary by Language Pair:


Unnamed: 0,Files,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage
enta,2,8000,8000,0,0.0%
enhi,2,8000,8000,0,0.0%
ente,2,8000,8000,0,0.0%
engu,2,8000,8000,0,0.0%


In [27]:
#PROBABLY NOT NEEDED

def combine_fixed_tsv(fixed_file, original_file):
    """
    Combine manually fixed TSV data with original normal data.
    
    Args:
        fixed_file (str): Path to the manually fixed TSV file
        original_file (str): Path to the original normal TSV file
        
    Returns:
        str: Path to the combined output file
    """
    import pandas as pd
    import os
    import chardet
    
    print(f"Combining fixed file '{fixed_file}' with original file '{original_file}'")
    
    try:
        # Detect file encodings
        def detect_encoding(file_path):
            with open(file_path, 'rb') as f:
                raw_data = f.read(10000)  # Read first 10000 bytes to detect encoding
                result = chardet.detect(raw_data)
                encoding = result['encoding']
                confidence = result['confidence']
                print(f"Detected encoding for {os.path.basename(file_path)}: {encoding} (confidence: {confidence:.2f})")
                return encoding
        
        fixed_encoding = detect_encoding(fixed_file)
        original_encoding = detect_encoding(original_file)
        
        # Read files with detected encodings
        print(f"Reading fixed file with {fixed_encoding} encoding")
        fixed_df = pd.read_csv(fixed_file, sep='\t', encoding=fixed_encoding, engine='python')
        
        print(f"Reading original file with {original_encoding} encoding")
        original_df = pd.read_csv(original_file, sep='\t', encoding=original_encoding, engine='python')
        
        # Get column count information
        original_cols = original_df.columns.tolist()
        fixed_cols = fixed_df.columns.tolist()
        
        print(f"Original file: {len(original_df)} rows, {len(original_cols)} columns")
        print(f"Fixed file: {len(fixed_df)} rows, {len(fixed_cols)} columns")
        
        # Check if column names match
        cols_match = set(original_cols) == set(fixed_cols)
        if not cols_match:
            print("WARNING: Column names don't match exactly. Proceeding with column alignment.")
            print(f"Original columns: {original_cols}")
            print(f"Fixed columns: {fixed_cols}")
            
            # If column names don't match, try to align columns 
            # Use original column names as the standard
            common_cols = [col for col in original_cols if col in fixed_cols]
            missing_cols = [col for col in original_cols if col not in fixed_cols]
            
            if missing_cols:
                print(f"Columns missing in fixed file: {missing_cols}")
                # Add missing columns to fixed_df
                for col in missing_cols:
                    fixed_df[col] = None
            
            # Ensure fixed_df has same column order as original_df
            try:
                fixed_df = fixed_df[original_cols]
            except KeyError as e:
                print(f"Error aligning columns: {e}")
                print("Attempting to copy columns by position instead of name...")
                
                # Try a different approach - copy data by position
                if len(fixed_df.columns) == len(original_df.columns):
                    fixed_df.columns = original_df.columns
                else:
                    # Create a new DataFrame with the right columns
                    new_fixed_df = pd.DataFrame(columns=original_cols)
                    for i, col in enumerate(fixed_df.columns):
                        if i < len(original_cols):
                            new_fixed_df[original_cols[i]] = fixed_df[col]
                    fixed_df = new_fixed_df
            
        # Combine the dataframes
        combined_df = pd.concat([original_df, fixed_df], ignore_index=True)
        
        # Generate output file path with the naming convention: combined_<original file name>
        base_dir = os.path.dirname(original_file)
        original_file_name = os.path.basename(original_file)
        output_file = os.path.join(base_dir, f"combined_{original_file_name}")
        
        # Save combined dataframe with UTF-8 encoding
        combined_df.to_csv(output_file, sep='\t', index=False, encoding='utf-8')
        
        print(f"Combined file saved to: {output_file}")
        print(f"Total rows in combined file: {len(combined_df)}")
        
        return output_file
    
    except Exception as e:
        print(f"Error combining files: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [6]:
folder_path = "../data/wmt21_train" 

In [7]:
tsv_files = find_tsv_files(folder_path)
print(f"Total TSV files found: {len(tsv_files)}")

process_abnormal_lines(tsv_files)

No language pair folders found. Searching in the root directory.
Total TSV files found: 7


Processing files:   0%|          | 0/7 [00:00<?, ?it/s]


Processing file: train.enzh.df.short.tsv (in wmt21_train)
Standard column count: 8
Standard reading failed (Error tokenizing data. C error: Expected 8 fields in line 6172, saw 9
), trying custom reading method...
Custom reading successful, columns: 8
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt21_train/enzh.tsv
  Abnormal data saved to: ../data/wmt21_train/abnormal_enzh.tsv

Processing file: train.ruen.df.short.tsv (in wmt21_train)
Standard column count: 8
File read successfully, columns: 8
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt21_train/ruen.tsv
  Abnormal data saved to: ../data/wmt21_train/abnormal_ruen.tsv

Processing file: train.roen.df.short.tsv (in wmt21_train)
Standard column count: 8
File read successfully, columns: 8
Processing completed:
  Original rows: 6776
  Normal

Unnamed: 0,Filename,Language Pair,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage,Normal Data File,Abnormal Data File
0,train.enzh.df.short.tsv,enzh,7000,7000,0,0.00%,../data/wmt21_train/enzh.tsv,../data/wmt21_train/abnormal_enzh.tsv
1,train.ruen.df.short.tsv,ruen,7000,7000,0,0.00%,../data/wmt21_train/ruen.tsv,../data/wmt21_train/abnormal_ruen.tsv
2,train.roen.df.short.tsv,roen,6776,6774,2,0.03%,../data/wmt21_train/roen.tsv,../data/wmt21_train/abnormal_roen.tsv
3,train.eten.df.short.tsv,eten,7000,6996,4,0.06%,../data/wmt21_train/eten.tsv,../data/wmt21_train/abnormal_eten.tsv
4,train.neen.df.short.tsv,neen,7000,7000,0,0.00%,../data/wmt21_train/neen.tsv,../data/wmt21_train/abnormal_neen.tsv
5,train.sien.df.short.tsv,sien,7000,7000,0,0.00%,../data/wmt21_train/sien.tsv,../data/wmt21_train/abnormal_sien.tsv
6,train.ende.df.short.tsv,ende,7000,6992,8,0.11%,../data/wmt21_train/ende.tsv,../data/wmt21_train/abnormal_ende.tsv



Overall Statistics:
  Total original rows: 48776
  Total normal rows: 48762
  Total abnormal rows: 14
  Overall abnormal percentage: 0.03%

Summary by Language Pair:


Unnamed: 0,Files,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage
enzh,1,7000,7000,0,0.0%
ruen,1,7000,7000,0,0.0%
roen,1,6776,6774,2,0.03%
eten,1,7000,6996,4,0.06%
neen,1,7000,7000,0,0.0%
sien,1,7000,7000,0,0.0%
ende,1,7000,6992,8,0.11%


In [8]:
folder_path2 = "../data/wmt20_train" 
tsv_files = find_tsv_files(folder_path2)
print(f"Total TSV files found: {len(tsv_files)}")

process_abnormal_lines(tsv_files)

No language pair folders found. Searching in the root directory.
Total TSV files found: 6


Processing files:   0%|          | 0/6 [00:00<?, ?it/s]


Processing file: train.enzh.df.short.tsv (in wmt20_train)
Standard column count: 8
Standard reading failed (Error tokenizing data. C error: Expected 8 fields in line 6172, saw 9
), trying custom reading method...
Custom reading successful, columns: 8
Processing completed:
  Original rows: 7000
  Normal rows: 7000
  Abnormal rows: 0
  Abnormal percentage: 0.00%
  Normal data saved to: ../data/wmt20_train/enzh.tsv
  Abnormal data saved to: ../data/wmt20_train/abnormal_enzh.tsv

Processing file: train.roen.df.short.tsv (in wmt20_train)
Standard column count: 8
File read successfully, columns: 8
Processing completed:
  Original rows: 6776
  Normal rows: 6774
  Abnormal rows: 2
  Abnormal percentage: 0.03%
  Normal data saved to: ../data/wmt20_train/roen.tsv
  Abnormal data saved to: ../data/wmt20_train/abnormal_roen.tsv

Processing file: train.eten.df.short.tsv (in wmt20_train)
Standard column count: 8
File read successfully, columns: 8
Processing completed:
  Original rows: 7000
  Normal

Unnamed: 0,Filename,Language Pair,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage,Normal Data File,Abnormal Data File
0,train.enzh.df.short.tsv,enzh,7000,7000,0,0.00%,../data/wmt20_train/enzh.tsv,../data/wmt20_train/abnormal_enzh.tsv
1,train.roen.df.short.tsv,roen,6776,6774,2,0.03%,../data/wmt20_train/roen.tsv,../data/wmt20_train/abnormal_roen.tsv
2,train.eten.df.short.tsv,eten,7000,6996,4,0.06%,../data/wmt20_train/eten.tsv,../data/wmt20_train/abnormal_eten.tsv
3,train.neen.df.short.tsv,neen,7000,7000,0,0.00%,../data/wmt20_train/neen.tsv,../data/wmt20_train/abnormal_neen.tsv
4,train.sien.df.short.tsv,sien,7000,7000,0,0.00%,../data/wmt20_train/sien.tsv,../data/wmt20_train/abnormal_sien.tsv
5,train.ende.df.short.tsv,ende,7000,6992,8,0.11%,../data/wmt20_train/ende.tsv,../data/wmt20_train/abnormal_ende.tsv



Overall Statistics:
  Total original rows: 41776
  Total normal rows: 41762
  Total abnormal rows: 14
  Overall abnormal percentage: 0.03%

Summary by Language Pair:


Unnamed: 0,Files,Original Rows,Normal Rows,Abnormal Rows,Abnormal Percentage
enzh,1,7000,7000,0,0.0%
roen,1,6776,6774,2,0.03%
eten,1,7000,6996,4,0.06%
neen,1,7000,7000,0,0.0%
sien,1,7000,7000,0,0.0%
ende,1,7000,6992,8,0.11%
