In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file below the Kaggle input directory, one per line.
input_walk = os.walk('/kaggle/input')
for path in (os.path.join(root, name) for root, _, names in input_walk for name in names):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/my-input/2024-07-21.154041.shivani.A3.final
/kaggle/input/my-input/named_entities.txt


In [3]:
import re

def filter_single_word_entities(input_filename):
    """Return the single-word entities listed in a named-entity file.

    Only lines that start with ``Entity: <text>, Type: <label>`` are
    considered; every other line is ignored.

    Args:
        input_filename: Path of the text file to read (decoded as UTF-8).

    Returns:
        list[str]: The ``<text>`` part of every matching line whose text is
        exactly one whitespace-separated token, in file order. Duplicates
        are kept.
    """
    entity_line = re.compile(r"Entity: (.*?), Type: (.*)")
    single_word_entities = []
    with open(input_filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip the line terminator before matching so that a final line
            # without a trailing newline is recognised like any other line.
            match = entity_line.match(line.rstrip('\n'))
            if match:
                entity = match.group(1)
                if len(entity.split()) == 1:  # Single-word entities only
                    single_word_entities.append(entity)
    return single_word_entities

def parse_alignment_file(filename):
    """Parse an alignment file into (Santali sentence, English alignment line) pairs.

    The file is scanned line by line:

    * a line that does not start with ``NULL`` or ``({`` is remembered as the
      pending Santali sentence, replacing any previously remembered line
      (so only the line directly preceding an alignment line is kept);
    * a line that starts with ``NULL`` or ``({`` and contains ``NULL`` or a
      ``({ ... })`` group is treated as the English alignment line and is
      paired with the pending Santali sentence, if there is a non-empty one.

    Args:
        filename: Path of the alignment file (decoded as UTF-8).

    Returns:
        list[tuple[str, str]]: ``(santali, english)`` pairs in file order,
        both stripped of surrounding whitespace.
    """
    sentences = []
    current_santali = None

    # The Santali side is non-ASCII text, so decode explicitly as UTF-8 rather
    # than relying on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped.startswith('NULL') and not stripped.startswith('({'):
                current_santali = stripped
            elif 'NULL' in stripped or re.search(r'\(\{.*?\}\)', stripped):
                if current_santali:
                    sentences.append((current_santali, stripped))
                    # Each Santali sentence is paired with one alignment line only.
                    current_santali = None
    return sentences

def get_santali_translation(entity, sentences):
    """Look up the Santali words aligned to an English entity.

    Each English alignment line is searched for ``<entity> ({ i j ... })``,
    where the numbers are 1-based positions in the whitespace-split Santali
    sentence of the same pair.

    Args:
        entity: English token to look for. It must appear as a whole
            whitespace-delimited token, so ``India`` does not match inside
            ``SouthIndia``.
        sentences: Iterable of ``(santali, english)`` string pairs, e.g. the
            result of ``parse_alignment_file``.

    Returns:
        str | None: The aligned Santali words joined by single spaces, taken
        from the first occurrence that has at least one usable index.
        ``None`` if the entity is never found with a usable alignment.
    """
    # (?<!\S) anchors the entity at a token start; without it the pattern
    # would also match the tail of a longer word and return its alignment.
    pattern = re.compile(r'(?<!\S)' + re.escape(entity) + r' \(\{(.*?)\}\)')

    for santali, english in sentences:
        words = santali.split()
        for match in pattern.finditer(english):
            indices = match.group(1).split()
            try:
                positions = [int(i) - 1 for i in indices]
            except ValueError as e:
                # Malformed index list: report it and keep searching.
                print(f"Error processing entity '{entity}': {e}")
                print(f"Entity: {entity}")
                print(f"English: {english}")
                print(f"Indices: {indices}")
                print(f"Santali Words: {words}")
                continue
            # Reject out-of-range positions on both sides; a bare upper-bound
            # check would let index 0 wrap around to the last word.
            santali_words = [words[p] for p in positions if 0 <= p < len(words)]
            if santali_words:
                return ' '.join(santali_words)
            # Empty alignment ("({ })"): try the next occurrence instead of
            # giving up with an empty translation.
    return None

def main(named_entities_file, alignment_file, output_file):
    """Translate single-word named entities and write the results to a file.

    Reads the entities with ``filter_single_word_entities``, the sentence
    pairs with ``parse_alignment_file``, and looks each entity up with
    ``get_santali_translation``. Entities without a (non-empty) translation
    are skipped.

    Args:
        named_entities_file: Path of the named-entity list.
        alignment_file: Path of the alignment file.
        output_file: Path of the file to create or overwrite. One line
            ``Entity: <entity>, Santali: <translation>`` is written per
            translated entity.
    """
    entities = filter_single_word_entities(named_entities_file)
    sentences = parse_alignment_file(alignment_file)

    # Write UTF-8 explicitly: the translations are non-ASCII and the file is
    # read back with encoding='utf-8' by the later cells.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for entity in entities:
            translation = get_santali_translation(entity, sentences)
            if translation:
                out_file.write(f"Entity: {entity}, Santali: {translation}\n")

# Run the pipeline on the Kaggle input files; the output path is relative to the current directory.
main(
    named_entities_file='/kaggle/input/my-input/named_entities.txt',
    alignment_file='/kaggle/input/my-input/2024-07-21.154041.shivani.A3.final',
    output_file='my_translations.txt',
)


In [5]:
import pandas as pd

# Load the "Entity: <english>, Santali: <santali>" lines from the translations file
file_path = '/kaggle/working/my_translations.txt'

with open(file_path, 'r', encoding='utf-8') as src:
    lines = list(src)

# Turn every well-formed line into an (English, Santali) tuple
data = []
for raw_line in lines:
    if not raw_line.strip():
        continue  # blank line
    parts = raw_line.split(', Santali:')
    if len(parts) != 2:
        continue  # marker missing or present more than once
    english_part, santali_part = parts
    data.append((english_part.replace('Entity:', '').strip(), santali_part.strip()))

# Build the two-column table and save it as CSV (without the index column)
df = pd.DataFrame(data, columns=['English', 'Santali'])
csv_file_path = 'my_translations.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-8')


In [6]:
# Load the "Entity: <english>, Santali: <santali>" lines from the translations file
file_path = '/kaggle/working/my_translations.txt'

with open(file_path, 'r', encoding='utf-8') as src:
    lines = list(src)

# Collect both sides of every well-formed line into two parallel lists
english_words, santali_words = [], []
for raw_line in lines:
    if not raw_line.strip():
        continue  # blank line
    parts = raw_line.split(', Santali:')
    if len(parts) != 2:
        continue  # marker missing or present more than once
    english_words.append(parts[0].replace('Entity:', '').strip())
    santali_words.append(parts[1].strip())

# English side: one word per line
english_file_path = 'english_words.txt'
with open(english_file_path, 'w', encoding='utf-8') as out:
    out.writelines(f"{word}\n" for word in english_words)

# Santali side: one word per line, in the same order as the English file
santali_file_path = 'santali_words.txt'
with open(santali_file_path, 'w', encoding='utf-8') as out:
    out.writelines(f"{word}\n" for word in santali_words)

# Cell result: the two paths that were written
english_file_path, santali_file_path


('english_words.txt', 'santali_words.txt')