In [1]:
import pandas as pd
import os
import openai
from tqdm.notebook import tqdm  # Use the notebook-friendly version of tqdm
import concurrent.futures

In [2]:
from dotenv import load_dotenv
# Read environment variables from the local 'key.env' file; the API key used by the
# OpenAI client created in a later cell is expected to be defined there.
# The call is the last expression of the cell, so its return value is what the
# notebook displays underneath.
load_dotenv(dotenv_path='key.env')     # Get API key from the .env file

True

In [3]:
# --- API Initialization ---
# Model name passed to every chat-completion request issued by this notebook.
GPT_MODEL = "o4-mini"
try:
    # No arguments are passed: the client is expected to pick up its credentials
    # from the environment (see the message printed on failure below).
    client = openai.OpenAI()
    print("OpenAI client initialized successfully.")
except openai.OpenAIError:
    print("Error: OpenAI API key not found. Make sure it's set as an environment variable.")
    # Stop here instead of continuing with `client` undefined: otherwise every later
    # request would fail with a NameError that the per-row error handling hides.
    raise


OpenAI client initialized successfully.


In [7]:
# --- Files to process: one {'name': ..., 'path': ...} dictionary per dataset ---
# Entries that are currently switched off are kept as comments so they can be
# re-enabled by moving them into the list:
#   dict(name='Training Set',    path='zmean_train_80_with_gpt_reasonings.tsv')
#   dict(name='Development Set', path='zmean_dev_10_with_gpt_reasonings.tsv')  # assumed dev-set path
files_to_process = [
    dict(name='Test Set', path='new_portion_processed/zmean_test_10.tsv'),
    # Append further entries here as new files need processing.
]

print(f"Configured to process {len(files_to_process)} files.")

Configured to process 1 files.


In [8]:
# Helper definitions: system prompt, API worker, and file analysis/repair functions

# --- The System Prompt ---
# Sent as the "system" message of every request made by get_explanation_for_row.
# It describes the annotation task, the meaning of the [-2, 2] score range, and the
# required output shape (one dense paragraph, no headers/bullets/summary).
# NOTE: the text is part of the request payload, including the leading and trailing
# newline of the triple-quoted literal; editing it changes what the model receives.
SYSTEM_PROMPT = """
You are an annotator for the quality of machine translation. You have a pair of sentences and a score indicating the translation quality. Your task is to identify errors, assess the quality of the translation, and explain why the translation gets this score.
The score is in range [-2, 2], -2 signifies that the translation is at the worst quality, inhibiting comprehension of the text; 2 signifies that the translation is at the best quality, fluent and accurate, used proper terms. The scores in between indicates major or minor errors: Major errors disrupt the flow, but what the text is trying to say is still understandable. Minor errors are technically errors, but they do not disrupt the flow or hinder comprehension. The score's position within the -2 to 2 range indicates subtle, yet significant, differences in translation quality.
Your response must be direct and concise. Do not include any headers (like 'Assessment' or 'Reasoning'), bullet points, or concluding summary paragraphs (like 'Overall, ...'). Output only the core reasoning as a single, dense paragraph.
"""

# --- The Worker Function for multithreading ---
def get_explanation_for_row(row_data):
    """Request an explanation of the quality score for one translation pair.

    Intended as a thread-pool worker: failures of the API call are reported and
    turned into a ``None`` result so one bad request does not abort the batch.

    Args:
        row_data: An ``(index, row)`` pair (as yielded by ``DataFrame.iterrows()``);
            ``row`` must provide the keys ``'src'``, ``'mt'`` and ``'zmean'``.

    Returns:
        ``(index, explanation)``. ``explanation`` is the reply text with surrounding
        whitespace removed, or ``None`` when the request raised or the reply
        contained no text.
    """
    index, row = row_data
    src, mt, score = row['src'], row['mt'], row['zmean']
    user_prompt = f"Source: {src}\nTranslation: {mt}\nScore:{score}"
    try:
        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}],
            max_completion_tokens=3500
        )
        choice = response.choices[0]
        content = choice.message.content
    except Exception as e:
        # Deliberately broad: any failure of a single request is logged and skipped.
        print(f"Error on index {index}: {e}")
        return index, None

    # The reply text may be absent (None) or blank, e.g. when the completion-token
    # budget is used up before any visible text is produced. Report that explicitly
    # instead of failing on `.strip()` or silently handing back an empty string.
    explanation = (content or "").strip()
    if not explanation:
        finish_reason = getattr(choice, "finish_reason", None)
        print(f"Error on index {index}: empty response (finish_reason={finish_reason})")
        return index, None
    return index, explanation

# --- Main logic wrapped in functions ---
def analyze_and_report(file_path):
    """Load a TSV file and report how many rows lack a GPT explanation.

    Args:
        file_path: Path of the tab-separated file to inspect.

    Returns:
        ``(df, missing_df)`` where ``df`` is the full table and ``missing_df`` is a
        copy of the rows whose ``'gpt_explanation'`` value is null.
        ``(None, None)`` when the file does not exist or has no
        ``'gpt_explanation'`` column; a message is printed in both cases.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}. Skipping.")
        return None, None

    df = pd.read_csv(file_path, sep='\t')

    # A file without the column would raise KeyError below and abort the processing
    # of every remaining file; report it and skip, like a missing file.
    if 'gpt_explanation' not in df.columns:
        print(f"Column 'gpt_explanation' not found in: {file_path}. Skipping.")
        return None, None

    # .copy() so the subset is independent of `df`.
    missing_df = df[df['gpt_explanation'].isna()].copy()

    print(f"  Total rows: {len(df)}")
    print(f"  Missing explanations: {len(missing_df)}")

    return df, missing_df

def fix_missing_entries(df, missing_df, file_path):
    """Fetch explanations for the rows that lack one and save the table.

    Each row of ``missing_df`` is sent to ``get_explanation_for_row`` on a thread
    pool; every non-empty result is written into ``df['gpt_explanation']`` at the
    row's index label. ``df`` is modified in place and then written back to
    ``file_path`` as a tab-separated file without the index.

    Args:
        df: The full table; must contain a ``'gpt_explanation'`` column.
        missing_df: The rows of ``df`` to process, sharing ``df``'s index labels.
        file_path: Destination file, overwritten with the updated table.

    Returns:
        None.
    """
    rows_to_process = list(missing_df.iterrows())
    MAX_WORKERS = 50

    # A column that was read back entirely empty has a float dtype; writing strings
    # into it relies on implicit upcasting, which newer pandas versions deprecate.
    # Use an object column so string assignment is always valid.
    df['gpt_explanation'] = df['gpt_explanation'].astype(object)

    print(f"  Starting to fetch {len(rows_to_process)} missing explanations with {MAX_WORKERS} workers...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # executor.map yields results in submission order, so the progress bar
        # advances as the earliest outstanding request completes.
        results_iterator = executor.map(get_explanation_for_row, rows_to_process)
        for index, explanation in tqdm(results_iterator, total=len(rows_to_process), desc="  Fixing entries"):
            # Failed or empty results stay missing and are counted below.
            if explanation:
                df.loc[index, 'gpt_explanation'] = explanation

    # --- Final verification and save ---
    final_missing_count = df['gpt_explanation'].isna().sum()
    print(f"\n  Fixing process complete. Remaining missing rows: {final_missing_count}")
    print(f"  Saving updated data back to: {file_path}")
    # Write to a sibling temporary file first and swap it in afterwards, so an
    # interrupted write cannot leave the existing results file truncated.
    tmp_path = f"{file_path}.tmp"
    df.to_csv(tmp_path, sep='\t', index=False)
    os.replace(tmp_path, file_path)
    print("  File saved successfully.")

# Confirmation message: shows in the cell output that the definitions above were executed.
print("Helper functions are defined and ready.")

Helper functions are defined and ready.


In [9]:
# Main execution loop: analyze each configured file and fill in missing explanations

for file_info in files_to_process:
    # Section header for this file.
    print("\n" + "=" * 60)
    print(f"Processing: {file_info['name']} ({file_info['path']})")
    print("=" * 60)

    # Load the file and print how many explanations are missing.
    df, missing_df = analyze_and_report(file_info['path'])

    # Nothing was loaded: analyze_and_report has already printed the reason.
    if df is None:
        continue

    # Loaded, and every row already has an explanation.
    if missing_df.empty:
        print("  🎉 No missing entries to fix for this file.")
        continue

    # Loaded with gaps: fetch the missing explanations and save the file.
    fix_missing_entries(df, missing_df, file_info['path'])

print("\n" + "=" * 60)
print("All configured files have been processed.")
print("=" * 60)


Processing: Test Set (new_portion_processed/zmean_test_10.tsv)
  Total rows: 14477
  Missing explanations: 2
  Starting to fetch 2 missing explanations with 50 workers...


  Fixing entries:   0%|          | 0/2 [00:00<?, ?it/s]


  Fixing process complete. Remaining missing rows: 0
  Saving updated data back to: new_portion_processed/zmean_test_10.tsv
  File saved successfully.

All configured files have been processed.


In [10]:
print("--- Starting Final Validation Check ---")

# Becomes False as soon as any file is absent, unreadable, or still has gaps.
all_files_ok = True

# Re-read every file listed in 'files_to_process' and verify it independently.
for file_info in files_to_process:
    file_path, file_name = file_info['path'], file_info['name']

    print(f"\nVerifying file: '{file_name}'...")

    # An absent file counts as a failure; go on to the next entry.
    if not os.path.exists(file_path):
        print(f"   ERROR: File not found at {file_path}")
        all_files_ok = False
        continue

    try:
        # Load the table and count null values in the explanation column.
        df = pd.read_csv(file_path, sep='\t')
        missing_count = df['gpt_explanation'].isna().sum()
    except Exception as e:
        print(f"   ERROR: Could not read or process the file. Error: {e}")
        all_files_ok = False
    else:
        # Report the outcome for this file.
        if missing_count != 0:
            print(f"   WARNING: Found {missing_count} empty 'gpt_explanation' entries in this file.")
            all_files_ok = False  # at least one file has issues
        else:
            print(f"   SUCCESS: No empty 'gpt_explanation' entries found.")

# --- Final summary ---
print("\n" + "=" * 50)
print(
    " Validation complete. All specified files look good!"
    if all_files_ok
    else "Validation complete. One or more files have issues or were not found."
)
print("=" * 50)


--- Starting Final Validation Check ---

Verifying file: 'Test Set'...
   SUCCESS: No empty 'gpt_explanation' entries found.

 Validation complete. All specified files look good!


In [13]:
# Cut out the first TOY_SET_SIZE rows of the training set to make a toy set with reasoning
# --- Configuration ---
# 1. Path to the full training set file with reasonings
#    (relative to the notebook's working directory).
input_file_path = 'zmean_train_80_with_gpt_reasonings.tsv'

# 2. Path of the toy dataset file to create.
output_file_path = 'toy_train_with_reasoning.tsv'

# 3. Number of leading rows to copy into the toy dataset. Defined once so the
#    size check, the slice and the messages below always agree.
TOY_SET_SIZE = 500

# --- Main Logic ---
print(f"Reading full dataset from: {input_file_path}")

# There is no separate existence check: a missing or unreadable input file makes
# read_csv raise, which is reported by the except branch at the end.
try:
    # Load the entire dataset from the TSV file
    full_df = pd.read_csv(input_file_path, sep='\t')
    print(f" Full dataset loaded successfully. Total rows: {len(full_df)}")

    # Only create the toy set when the source has enough rows to fill it
    if len(full_df) >= TOY_SET_SIZE:
        # Slice the first TOY_SET_SIZE rows to create the toy dataset
        toy_df = full_df.head(TOY_SET_SIZE)
        print(f"Sliced the first {TOY_SET_SIZE} rows to create the toy dataset.")

        # Save the new toy DataFrame to a new TSV file
        # index=False prevents pandas from writing row numbers into the file
        toy_df.to_csv(output_file_path, sep='\t', index=False)

        print("-" * 50)
        print(f" Success! New toy dataset saved to: {output_file_path}")
        print(f"   It contains {len(toy_df)} rows.")
        print("-" * 50)
    else:
        print(f" Warning: The source file has only {len(full_df)} rows, which is less than {TOY_SET_SIZE}. A full toy set could not be created.")

except Exception as e:
    # Any failure (missing file, parse error, write error) is reported, not raised.
    print(f" An error occurred while processing the file: {e}")

Reading full dataset from: zmean_train_80_with_gpt_reasonings.tsv
 Full dataset loaded successfully. Total rows: 115809
Sliced the first 500 rows to create the toy dataset.
--------------------------------------------------
 Success! New toy dataset saved to: toy_train_with_reasoning.tsv
   It contains 500 rows.
--------------------------------------------------
