In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# one full path per line (directories without files print nothing).
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/english-data/combined.eng_Latn
/kaggle/input/mapping2/2024-07-21.154041.shivani.A3.final


In [2]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.8.2-py3-none-any.whl.metadata (13 kB)
Downloading stanza-1.8.2-py3-none-any.whl (990 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.1/990.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: stanza
Successfully installed stanza-1.8.2


In [3]:
import stanza

# Initialize the NLP pipeline (tokenizer + named-entity recognizer only)
nlp = stanza.Pipeline('en', processors='tokenize,ner')

# File paths
input_file_path = '/kaggle/input/english-data/combined.eng_Latn'
output_file_path = 'named_entities.txt'

# Set to keep track of seen entities; keys are (entity text, line number),
# so an entity is reported at most once per input line.
seen_entities = set()

# Stream the corpus line by line and write one record per distinct entity on
# each line. Records have the form
#     Entity: <text>, Type: <label>, Line: <line number>
# The "Type:" field is included because the entity parser later in this
# notebook only recognises records that contain ", Type: ".
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
        open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line_number, line in enumerate(input_file, start=1):
        text = line.strip()
        if not text:
            # A blank line cannot contain entities; skip the model call.
            continue

        doc = nlp(text)

        for sentence in doc.sentences:
            for entity in sentence.ents:
                entity_key = (entity.text, line_number)
                if entity_key in seen_entities:
                    continue
                seen_entities.add(entity_key)
                output_file.write(
                    f"Entity: {entity.text}, Type: {entity.type}, Line: {line_number}\n"
                )


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/mwt/combined.pt:   0%|         …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/ner/ontonotes-ww-multi_charlm.p…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/pretrain/conll17.pt:   0%|     …

KeyboardInterrupt: 

In [6]:
# Rewrite named_entities.txt in place with exact duplicate records removed.
with open('named_entities.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# dict.fromkeys keeps the first occurrence of each record and preserves the
# original file order, so the rewritten file is identical on every run
# (iterating a set of strings yields a different order per interpreter run).
# Records that are empty after stripping are dropped.
unique_entities = dict.fromkeys(
    stripped for stripped in (line.strip() for line in lines) if stripped
)

with open('named_entities.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{entity}\n" for entity in unique_entities)


In [54]:
import re

def parse_named_entities(filename):
    """Read entity records from ``filename``.

    A record is a line of the form ``Entity: <text>, Type: <value>`` or
    ``Entity: <text>, Line: <value>``. Both field names are accepted because
    the extraction cell in this notebook writes ``Line:`` records while this
    parser originally required ``Type:``. Lines that match neither form are
    ignored.

    Args:
        filename: Path of a UTF-8 encoded text file with one record per line.

    Returns:
        A list of ``(text, value)`` tuples in file order, where ``value`` is
        everything after the ``Type: `` / ``Line: `` label on that line.
    """
    record_pattern = re.compile(r"Entity: (.*?), (?:Type|Line): (.*)")
    entities = []
    # The entity file is written as UTF-8, so read it back the same way
    # instead of relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip the newline first so a final record without a trailing
            # newline is still recognised.
            match = record_pattern.match(line.rstrip('\n'))
            if match:
                entities.append((match.group(1), match.group(2)))
    return entities

def parse_alignment_file(filename):
    """Pair each Santali sentence with the alignment line that follows it.

    The file is scanned line by line. Any line that does not start with
    ``NULL`` or ``({`` is remembered as the current Santali sentence (a later
    such line replaces an earlier one). When a line starting with ``NULL`` or
    ``({`` is reached and a non-empty Santali sentence is pending, the pair is
    recorded and the pending sentence is cleared.

    NOTE(review): the input is presumably a GIZA++ ``A3.final`` file, whose
    per-pair header line is overwritten by the sentence line that follows it
    -- confirm against the real data.

    Args:
        filename: Path of the UTF-8 encoded alignment file.

    Returns:
        A list of ``(santali_sentence, alignment_line)`` tuples, both stripped
        of surrounding whitespace, in file order.
    """
    sentences = []
    current_santali = None

    # Explicit UTF-8: the Santali text is non-ASCII and must not depend on
    # the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped.startswith(('NULL', '({')):
                # Candidate Santali sentence (change this condition if needed)
                current_santali = stripped
            elif ('NULL' in line or '({ })' in line) and current_santali:
                # Alignment line with a pending Santali sentence: emit the pair
                sentences.append((current_santali, stripped))
                current_santali = None
    return sentences

# One aligned source token: "<word> ({ <target positions> })"
_ALIGNED_TOKEN = re.compile(r'(\S+) \(\{([^}]*)\}\)')


def _aligned_source_tokens(alignment_line):
    """Split an alignment line into (word, [target positions]) pairs, without the leading NULL entry."""
    pairs = [
        (word, [int(pos) for pos in positions.split() if pos.isdigit()])
        for word, positions in _ALIGNED_TOKEN.findall(alignment_line)
    ]
    if pairs and pairs[0][0] == 'NULL':
        pairs = pairs[1:]
    return pairs


def get_santali_translation(entity, sentences):
    """Return the Santali words aligned to ``entity`` in the first sentence that contains it.

    The English side of each alignment line is rebuilt from its tokens, the
    entity is located in it by substring search, and only the target
    positions attached to the tokens covered by that occurrence are used.
    Positions are treated as 1-based indices into the Santali sentence, as in
    GIZA++ alignment output.

    Previously every ``{...}`` group of the sentence was collected and used
    as a 0-based index, which returned shifted words from the whole sentence;
    multi-word entities never matched because the ``({ ... })`` groups sit
    between the words on the alignment line.

    Args:
        entity: Entity text to look up; runs of whitespace are collapsed.
        sentences: Iterable of ``(santali_sentence, alignment_line)`` pairs.

    Returns:
        The aligned Santali words in sentence order, joined by single spaces,
        or ``None`` when no sentence yields an aligned word for the entity.
        Matching is a plain substring search, so a short entity can also
        match inside a longer word.
    """
    entity = ' '.join(entity.split())
    if not entity:
        return None

    for santali, english in sentences:
        tokens = _aligned_source_tokens(english)
        source_text = ' '.join(word for word, _ in tokens)
        start = source_text.find(entity)
        if start == -1:
            continue
        end = start + len(entity)

        # Collect the target positions of every token overlapping the match.
        positions = set()
        offset = 0
        for word, aligned in tokens:
            word_end = offset + len(word)
            if offset < end and word_end > start:
                positions.update(aligned)
            offset = word_end + 1  # +1 for the joining space

        words = santali.split()
        santali_words = [
            words[pos - 1] for pos in sorted(positions) if 1 <= pos <= len(words)
        ]
        if santali_words:
            return ' '.join(santali_words)
        # Entity present but unaligned here; try the next sentence.
    return None

def main(named_entities_file, alignment_file, output_file):
    """Look up a Santali translation for every parsed entity and write the results.

    One line is written per entity record, in input order, either
    ``Entity: <text>, Santali: <translation>`` or
    ``Entity: <text>, Santali: Not found``.

    Args:
        named_entities_file: Path passed to ``parse_named_entities``.
        alignment_file: Path passed to ``parse_alignment_file``.
        output_file: Path of the report to (over)write, encoded as UTF-8.
    """
    entities = parse_named_entities(named_entities_file)
    sentences = parse_alignment_file(alignment_file)

    # Cache per entity text so a repeated entity does not rescan every sentence.
    translations = {}

    # Explicit UTF-8: the Santali output is non-ASCII and must not depend on
    # the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as file:
        for entity, _ in entities:
            if entity not in translations:
                translations[entity] = get_santali_translation(entity, sentences)
            translation = translations[entity]
            if translation:
                file.write(f"Entity: {entity}, Santali: {translation}\n")
            else:
                file.write(f"Entity: {entity}, Santali: Not found\n")

# Example usage: translate the extracted entities with the alignment file
# from the Kaggle input directory and write the report to the working directory.
main(
    named_entities_file='named_entities.txt',
    alignment_file='/kaggle/input/mapping2/2024-07-21.154041.shivani.A3.final',
    output_file='my_translations.txt',
)
