# In [None]:
import pandas as pd
import re
import openpyxl
import unicodedata
from tabulate import tabulate

def read_alignment_file(file_path):
    """Load word alignments from a text file, grouped by audio ID.

    Each usable line has the form ``<malayalam> ||| <english> (<audio_id>)``.
    Lines that do not contain exactly one ``|||`` separator are reported and
    skipped.

    The audio ID is read from the *last* ``(`` on the line, so an English
    side that itself contains parentheses keeps its text intact. A closing
    ``)`` is removed only when it is actually present.

    Args:
        file_path: Path of the UTF-8 encoded alignment file.

    Returns:
        A dict mapping each audio ID (``None`` for lines without a ``(``) to
        a list of ``(malayalam, english)`` tuples in file order. If the file
        cannot be read, the error is printed and whatever was loaded so far
        (possibly an empty dict) is returned.
    """
    alignments = {}
    print(f"Opening alignment file: {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                stripped = line.strip()
                print(f"Processing line {line_num}: {stripped}")
                parts = stripped.split('|||')
                if len(parts) != 2:
                    print(f"Skipping line {line_num}: Invalid format")
                    continue

                mal, eng_with_id = (part.strip() for part in parts)

                # Split on the last "(" so parentheses inside the English
                # text are not mistaken for the audio ID.
                eng, paren, id_text = eng_with_id.rpartition('(')
                if paren:
                    eng = eng.strip()
                    id_text = id_text.strip()
                    # Drop the closing parenthesis only if it is really there.
                    audio_id = id_text[:-1].strip() if id_text.endswith(')') else id_text
                else:
                    # No "(" at all: the whole right-hand side is the word.
                    eng, audio_id = eng_with_id, None

                alignments.setdefault(audio_id, []).append((mal, eng))
                print(f"Added alignment for {audio_id}: '{mal}' <-> '{eng}'")
        print(f"Loaded alignments for {len(alignments)} audio IDs")
    except (OSError, UnicodeError) as e:
        # Best effort: report the problem and return what was parsed so far.
        print(f"Error reading alignment file: {e}")
    return alignments

def read_excel_file(file_path, sheet_name):
    """Read one worksheet of an Excel workbook into a pandas DataFrame.

    The first row of the sheet supplies the column names; every following
    row becomes one record keyed by those names.

    Args:
        file_path: Path of the workbook to open with openpyxl.
        sheet_name: Name of the worksheet to load.

    Returns:
        The DataFrame built from the sheet, or ``None`` if anything goes
        wrong while loading (the error is printed).
    """
    print(f"Opening Excel file: {file_path}, Sheet: {sheet_name}")
    try:
        workbook = openpyxl.load_workbook(file_path)
        worksheet = workbook[sheet_name]
        print(f"All sheet names: {workbook.sheetnames}")

        # Header row: the cell values are used as dictionary keys below.
        columns = [header_cell.value for header_cell in next(worksheet.iter_rows())]
        print(f"Columns found: {columns}")

        # One dict per data row, starting from the second sheet row.
        records = [
            {columns[position]: cell.value for position, cell in enumerate(cells)}
            for cells in worksheet.iter_rows(min_row=2)
        ]

        df = pd.DataFrame(records)
        print(f"Loaded Excel file. Shape: {df.shape}")
        print("First few rows of the DataFrame:")
        print(df.head())
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return None
    return df

import unicodedata
import re

def custom_normalize(word):
    """Return *word* with every ര + ് + ZWJ sequence replaced by ർ.

    The matched sequence is U+0D30, U+0D4D, U+200D; the replacement is the
    single code point U+0D7C.
    """
    zwj_sequence = re.compile(r'ര്\u200d')
    return zwj_sequence.sub('ർ', word)

def split_words(word):
    """Normalize a cell value and split it into individual words.

    The text is first passed through ``custom_normalize``, then NFKC
    normalized, and finally split on ``/`` and space characters.

    Empty fragments are dropped: input such as ``"a / b"`` or text with
    leading/trailing spaces would otherwise yield ``''`` entries, which the
    caller would count as extra unmatched words.

    Args:
        word: The text to split; any falsy value yields an empty list.

    Returns:
        A list of non-empty normalized words, in their original order.
    """
    if not word:
        return []
    normalized_word = unicodedata.normalize('NFKC', custom_normalize(word))
    # Adjacent separators produce empty strings from re.split; filter them.
    return [token for token in re.split(r'[/ ]', normalized_word) if token]

def get_alignment_status(mal_word, eng_word, audio_id, alignments):
    """Classify a row's emphasized words against the alignments of its audio.

    The lookup key is the second ``_``-separated field of ``audio_id``. When
    the ID is missing, has no ``_``, or has no entry in ``alignments``, the
    row is reported as a single ``'non_one_to_one'``. A missing key is never
    looked up as ``None``, so rows without a usable ID are not compared
    against alignment lines that were stored without an ID.

    Args:
        mal_word: Emphasized Malayalam text (may hold several words, or be
            empty/None).
        eng_word: Emphasized English text (same conventions).
        audio_id: Audio identifier of the row, e.g. ``"<prefix>_<id>"``.
        alignments: Mapping of audio ID part to a list of
            ``(malayalam, english)`` alignment pairs.

    Returns:
        A list of status strings. First, one entry per alignment pair:
        ``'<m> <e>'`` where ``m``/``e`` is ``1`` if the Malayalam/English
        side of the pair is among the emphasized words (case-insensitive)
        and ``0`` otherwise. Then one ``'non_one_to_one'`` for every
        emphasized Malayalam word, followed by every emphasized English
        word, that appears in no alignment pair for this audio.
    """
    print(f"\nChecking alignment for: Malayalam '{mal_word}', English '{eng_word}', Audio ID '{audio_id}'")

    audio_id_part = audio_id.split('_')[1] if audio_id and '_' in audio_id else None
    print(f"Extracted Audio ID part: {audio_id_part}")

    # Guard on None explicitly: the alignment loader may store ID-less lines
    # under the key None, and those must not match rows lacking a usable ID.
    if audio_id_part is None or audio_id_part not in alignments:
        print(f"No alignments found for Audio ID part {audio_id_part}. Treating as 'non_one_to_one'.")
        return ['non_one_to_one']

    relevant_alignments = alignments[audio_id_part]
    print(f"Relevant alignments for {audio_id_part}: {relevant_alignments}")

    mal_words = split_words(mal_word) if mal_word else []
    eng_words = split_words(eng_word) if eng_word else []

    print(f"Split Malayalam words: {mal_words}")
    print(f"Split English words: {eng_words}")

    # Lower-case everything once instead of on every comparison.
    mal_lower = [word.lower() for word in mal_words]
    eng_lower = [word.lower() for word in eng_words]
    emphasized_mal = set(mal_lower)
    emphasized_eng = set(eng_lower)

    # One "<mal> <eng>" flag pair per alignment: 1 = that side is emphasized.
    statuses = [
        f"{int(mal.lower() in emphasized_mal)} {int(eng.lower() in emphasized_eng)}"
        for mal, eng in relevant_alignments
    ]

    # Emphasized words that are absent from every alignment of this audio.
    aligned_mal = {mal.lower() for mal, _ in relevant_alignments}
    aligned_eng = {eng.lower() for _, eng in relevant_alignments}
    unaligned_count = (
        sum(word not in aligned_mal for word in mal_lower)
        + sum(word not in aligned_eng for word in eng_lower)
    )
    statuses.extend(['non_one_to_one'] * unaligned_count)

    print(f"Statuses: {statuses}")
    return statuses

def main():
    """Tally alignment statuses for every spreadsheet row and print a summary.

    Loads the worksheet and the alignment file, classifies each row with
    ``get_alignment_status``, accumulates the per-status counts, and prints
    them as a grid table plus the count of non one-to-one words. Returns
    early if the worksheet could not be loaded; a failure on a single row is
    printed and the loop continues with the next row.
    """
    workbook_path = 'Real M real.xlsx'
    worksheet_name = 'ENGLISH-MALAYALAM-CONSISTENCY'
    frame = read_excel_file(workbook_path, worksheet_name)
    if frame is None:
        return

    alignments = read_alignment_file('one-one.txt')

    # Key order is kept because the dict is printed after every row.
    counters = dict.fromkeys(('1 1', '0 0', '1 0', '0 1', 'non_one_to_one'), 0)

    for idx, row in frame.iterrows():
        try:
            print(f"\nProcessing row {idx}:")
            audio_id, mal_word, eng_word = (
                row[column]
                for column in ('Audio_ID', 'Malayalam_Emphasized_Word', 'English_Emphasized_Word')
            )
            print(f"Audio ID: '{audio_id}', Malayalam word: '{mal_word}', English word: '{eng_word}'")
            statuses = get_alignment_status(mal_word, eng_word, audio_id, alignments)
            for status in statuses:
                counters[status] = counters[status] + 1
            print(f"Statuses: {statuses}, Updated counters: {counters}")
        except Exception as e:
            # Keep going: one bad row must not abort the whole tally.
            print(f"Error processing row {idx}: {e}")

    print("\nFinal Results:")

    # (Malayalam label, English label, status key) for each table row.
    layout = (
        ("Emphasized", "Emphasized", '1 1'),
        ("Not Emphasized", "Emphasized", '0 1'),
        ("Emphasized", "Not Emphasized", '1 0'),
        ("Not Emphasized", "Not Emphasized", '0 0'),
    )
    table_data = [[mal_label, eng_label, counters[key]] for mal_label, eng_label, key in layout]

    print(tabulate(table_data, ["Malayalam", "English", "Count"], tablefmt="grid"))

    # Words that had no one-to-one alignment are reported separately.
    print(f"\nNon one-to-one alignment words: {counters['non_one_to_one']}")

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()