In [1]:
%cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [2]:
import json

def read_json(filepath):
    """Reads a JSON file and returns the data."""
    with open(filepath, 'r') as f:
        return json.load(f)

def write_json(data, filepath):
    """Writes data to a JSON file."""
    with open(filepath, 'w') as f:
        json.dump(data, f, indent=4)

def read_text(filepath):
    """Reads a text file and returns the content."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

def write_text(content, filepath):
    """Writes content to a text file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

In [3]:
import os

corpus_folder = 'Dataset/Corpus'

parsed_maha = read_json(os.path.join(corpus_folder, 'parsed.json'))

In [4]:
len(parsed_maha)

72711

In [5]:
all_entities = read_json(os.path.join(corpus_folder, 'mahanama/data/kb/knowledge_base.json'))

In [None]:
all_entities["e11773"]

{'key': 'veda',
 'description': "Veda, mostly pl. ( °āḥ, the sacred knowledge, esp, the Vedic literature, ( °ārthaiḥ, ( caturbhiḥ, ( vyasya Vedaṃ, sc. Vyāsa, ( sāṅgopaniṣadāṃ, ( °ādhyātmaṃ, ( sarva- °vidāṃ varaḥ, ( nikhilān, (sg, ( Kārṣṇaṃ °ṃ.e. the Mhbhr, ( caturaḥ, ( caturbhyaḥ sarahasyebhyaḥ, ( °vidhiḥ, ( loka- °āśra- yeva vāk, ( °ārthaiḥ, ( °viduṣā, (do, ( caturaḥ, † ( °viduṣe, †† ( sarve, †† (do, ( °pāragaḥ, ( °dṛṣṭena karmaṇā, ( °oktena vidhānena, ( °-Vedāṅgavid, ( °-Vedāṅgapāragāt, ( °-Vedāṅgapāragaḥ, ( sarve, ( °-Vedāṅgapāragaḥ, (do, ( sāngān, ( °vidvāṃsaḥ, ( °vidāṃ varaḥ, ( °pāragaḥ, ( °vittamaṃ, ( °vidaḥ, ( °āśrayāḥ kathāḥ, ( sāṅgān setitihāsān, ( °adhyayanena, ( vivyāsaikaṃ caturdhā yo °ṃ °vidāṃ varaḥ, sc. Vyāsa, ( °-Vedāṅgapāragaiḥ, ( °e dhanu — ṣi ca, ( °ādhyayana, ( °aiḥ sammitaṃ, ( Kārṣṇaṃ °ṃ.e. the Mhbhr, ( °ānāṃ pāragaḥ, ( °aiḥ sammitaṃ, ( vivyāsa °ān, sc. Vyāsa, ( °ārthavid, ( sāṅgopaniṣadān, ( °vidaḥ, ( Dhanurvede °e ca; °vidaḥ, ( °, ( °vidaḥ, ( °pāragaiḥ, (quotation

In [6]:
len(all_entities)

12713

In [7]:
all_conllu_files = []

conllu_folder = os.path.join(corpus_folder, 'mahanama/data/mahanama_conllu')
for volume in os.listdir(conllu_folder):
        volume_path = os.path.join(conllu_folder, volume)
        if os.path.isdir(volume_path):
            for fname in os.listdir(volume_path):
                if fname.endswith(".conllu"):
                    fpath = os.path.join(volume_path, fname)
                    all_conllu_files.append(fpath)

len(all_conllu_files)

2110

In [8]:
import os
import re
from collections import defaultdict

version_map = {}

def parse_conllu_entities(path):
    version_map = {}
    entity_pattern = re.compile(r"Entity=\((e\d+)")
    results = defaultdict(list)
    current_sent = None
    current_mapping = None

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("# sent_id"):
                current_mapping = line.split("=")[1].strip()
            if line.startswith("# mnd_reference"):
                current_sent = line.split("=")[1].strip()
                version_map[current_sent] = current_mapping
                results[current_sent]
                if current_sent == 'vol-i_1_22':
                  print('vol-i_1_22')

            elif current_sent and not line.startswith("#") and line:
                cols = line.split("\t")
                if len(cols) >= 10:
                    misc = cols[-1]
                    matches = entity_pattern.findall(misc)
                    for m in matches:
                        results[current_sent].append(m)

    return results, version_map


In [9]:
file_results, version_map = parse_conllu_entities(os.path.join(corpus_folder, 'mahanama/data/mahanama_conllu', 'volume_1/Mbh_Volume_1_Chapter_1_Subchapter_1.conllu'))

vol-i_1_22


In [None]:
file_results

defaultdict(list,
            {'vol-i_1_1': ['e12207', 'e7587', 'e7587', 'e9540', 'e4679'],
             'vol-i_1_2': ['e9817',
              'e9817',
              'e9817',
              'e8267',
              'e7498',
              'e2579',
              'e9817'],
             'vol-i_1_3': ['e7498'],
             'vol-i_1_5': ['e9817'],
             'vol-i_1_7': ['e9817'],
             'vol-i_1_8': ['e9817'],
             'vol-i_1_9': ['e9817',
              'e4777',
              'e9563',
              'e4777',
              'e11327',
              'e6721'],
             'vol-i_1_10': ['e6276'],
             'vol-i_1_11': ['e6239', 'e7969'],
             'vol-i_1_12': ['e1743'],
             'vol-i_1_13': ['e8769'],
             'vol-i_1_14': ['e8769', 'e10362', 'e1782'],
             'vol-i_1_15': ['e11773'],
             'vol-i_1_16': ['e6721',
              'e4679',
              'e1697',
              'e4777',
              'e11327',
              'e11773',
              'e12416

In [10]:
all_results = defaultdict(list)
all_mappings = []

from tqdm import tqdm

for fpath in tqdm(all_conllu_files):
    file_results, version_map = parse_conllu_entities(fpath)

    for sent_id, entities in file_results.items():
        all_results[sent_id].extend(entities)
    all_mappings.append(version_map)

  5%|▌         | 115/2110 [00:03<00:23, 84.24it/s]

vol-i_1_22


100%|██████████| 2110/2110 [12:23<00:00,  2.84it/s]


In [11]:
chapters = read_json(os.path.join(corpus_folder, 'chapters.json'))

verses = {}
for k,c in chapters.items():
  for i,v in enumerate(c.split('\n')):
    verses[f'{k}.{i+1}'] = v

In [12]:
len(verses)

73617

In [None]:
write_json(verses, os.path.join(corpus_folder, 'verses.json'))

In [13]:
len(all_results)

73632

In [None]:
write_json(all_mappings, os.path.join(corpus_folder, 'all_mappings.json'))
write_json(all_entities, os.path.join(corpus_folder, 'entities_kb.json'))

In [14]:
total_len = 0

for ch in all_mappings:
  total_len += len(ch)

total_len

73632

In [15]:
import pandas as pd

map1 = pd.read_csv(os.path.join(corpus_folder, 'mnd_nama_map.csv'))

In [None]:
map1

Unnamed: 0,Volume,MNC_chapter,Parva,Chapter
0,vol-i,1,1,1
1,vol-i,2,1,2
2,vol-i,3,1,3
3,vol-i,4,1,4
4,vol-i,5,1,5
...,...,...,...,...
2105,vol-ix,312,18,2
2106,vol-ix,313,18,3
2107,vol-ix,314,18,4
2108,vol-ix,315,18,5


In [16]:
volume_chapter_map = {}
for index, row in map1.iterrows():
    volume_mnc_chapter = f"{row['Volume']}_{row['MNC_chapter']}"
    parva_chapter = f"{row['Parva']}.{row['Chapter']}"
    volume_chapter_map[volume_mnc_chapter] = parva_chapter


In [17]:
mapped_results = {}

for k,v in all_results.items():
  vol, ch, vid = k.split('_')
  mnc_ch = f'{vol}_{ch}'
  verse_id = f"{volume_chapter_map[mnc_ch]}.{vid}"
  mapped_results[verse_id] = v


In [19]:
len(all_results)

73632

In [20]:
len(mapped_results)

73590

In [21]:
absent = []
for v in verses.keys():
  if v not in list(mapped_results.keys()):
    absent.append(v)
absent

['1.34.11',
 '1.83.14',
 '7.9.20',
 '7.10.49',
 '7.21.42',
 '7.30.22',
 '7.80.48',
 '7.93.1',
 '7.121.13',
 '7.122.52',
 '7.124.11',
 '7.128.13',
 '7.137.38',
 '7.138.12',
 '7.145.16',
 '7.147.71',
 '7.156.147',
 '7.160.11',
 '7.162.50',
 '7.166.1',
 '7.179.1',
 '7.184.29',
 '7.185.1',
 '7.186.11',
 '7.200.1',
 '7.203.2',
 '12.219.8']

In [18]:
len(set(verses.keys()) - set(list(mapped_results.keys())))

27

In [22]:
mapped_results

{'1.157.1': ['e4777', 'e3873', 'e7969'],
 '1.157.2': ['e11327', 'e3873'],
 '1.157.3': [],
 '1.157.4': ['e6200'],
 '1.157.5': ['e1454'],
 '1.157.6': ['e4777'],
 '1.157.7': ['e1454', 'e6200'],
 '1.157.8': ['e6200'],
 '1.157.9': ['e6200', 'e6200', 'e1454', 'e3805'],
 '1.157.10': [],
 '1.157.11': [],
 '1.157.12': [],
 '1.157.13': ['e1454'],
 '1.157.14': ['e11327'],
 '1.157.15': ['e6200', 'e9812'],
 '1.157.16': [],
 '1.157.17': [],
 '1.157.18': [],
 '1.157.19': [],
 '1.157.20': ['e7600'],
 '1.157.21': [],
 '1.157.22': [],
 '1.157.23': [],
 '1.157.24': [],
 '1.157.25': [],
 '1.157.26': [],
 '1.157.27': [],
 '1.157.28': [],
 '1.157.29': ['e1741', 'e8386'],
 '1.157.30': [],
 '1.157.31': [],
 '1.157.32': [],
 '1.157.33': [],
 '1.157.34': [],
 '1.158.1': ['e11362'],
 '1.158.2': [],
 '1.158.3': [],
 '1.158.4': [],
 '1.158.5': [],
 '1.158.6': [],
 '1.158.7': [],
 '1.158.8': [],
 '1.158.9': [],
 '1.158.10': [],
 '1.158.11': [],
 '1.158.12': [],
 '1.158.13': [],
 '1.158.14': ['e11773'],
 '1.158.15':

In [23]:
write_json(mapped_results, os.path.join(corpus_folder, 'entities_verses.json'))

In [38]:
new_mappings = {}
for m in all_mappings:
  for k,v in m.items():
    vol, ch, vid = k.split('_')
    mnc_ch = f'{vol}_{ch}'
    verse_id = f"{volume_chapter_map[mnc_ch]}.{vid}"
    parts = v.split("_")
    result = f"{parts[1]}.{parts[3]}.{parts[4]}"

    new_mappings[verse_id] = result

    flag = 0
    if verse_id != result:
      flag = 1
      print(k, verse_id, result, v, flag)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
vol-iv_53_13 6.52.13 6.53.13 MBh_6_4_53_13
vol-iv_53_14 6.52.14 6.53.14 MBh_6_4_53_14
vol-iv_53_15 6.52.15 6.53.15 MBh_6_4_53_15
vol-iv_53_16 6.52.16 6.53.16 MBh_6_4_53_16
vol-iv_53_17 6.52.17 6.53.17 MBh_6_4_53_17
vol-iv_53_18 6.52.18 6.53.18 MBh_6_4_53_18
vol-iv_53_19 6.52.19 6.53.19 MBh_6_4_53_19
vol-iv_53_20 6.52.20 6.53.20 MBh_6_4_53_20
vol-iv_53_21 6.52.21 6.53.21 MBh_6_4_53_21
vol-iv_53_22 6.52.22 6.53.22 MBh_6_4_53_22
vol-iv_53_23 6.52.23 6.53.23 MBh_6_4_53_23
vol-iv_53_24 6.52.24 6.53.24 MBh_6_4_53_24
vol-iv_53_25 6.52.25 6.53.25 MBh_6_4_53_25
vol-iv_53_26 6.52.26 6.53.26 MBh_6_4_53_26
vol-iv_53_27 6.52.27 6.53.27 MBh_6_4_53_27
vol-iv_53_28 6.52.28 6.53.28 MBh_6_4_53_28
vol-iv_53_29 6.52.29 6.53.29 MBh_6_4_53_29
vol-iv_53_30 6.52.30 6.53.30 MBh_6_4_53_30
vol-iv_53_31 6.52.31 6.53.31 MBh_6_4_53_31
vol-iv_53_32 6.52.32 6.53.32 MBh_6_4_53_32
vol-iv_53_33 6.52.33 6.53.33 MBh_6_4_53_33
vol-iv_53_34 6.52.34 6.53.34 MBh

In [32]:
new_mappings

{'1.157.1': '1.157.1',
 '1.157.2': '1.157.2',
 '1.157.3': '1.157.3',
 '1.157.4': '1.157.4',
 '1.157.5': '1.157.5',
 '1.157.6': '1.157.6',
 '1.157.7': '1.157.7',
 '1.157.8': '1.157.8',
 '1.157.9': '1.157.9',
 '1.157.10': '1.157.10',
 '1.157.11': '1.157.11',
 '1.157.12': '1.157.12',
 '1.157.13': '1.157.13',
 '1.157.14': '1.157.14',
 '1.157.15': '1.157.15',
 '1.157.16': '1.157.16',
 '1.157.17': '1.157.17',
 '1.157.18': '1.157.18',
 '1.157.19': '1.157.19',
 '1.157.20': '1.157.20',
 '1.157.21': '1.157.21',
 '1.157.22': '1.157.22',
 '1.157.23': '1.157.23',
 '1.157.24': '1.157.24',
 '1.157.25': '1.157.25',
 '1.157.26': '1.157.26',
 '1.157.27': '1.157.27',
 '1.157.28': '1.157.28',
 '1.157.29': '1.157.29',
 '1.157.30': '1.157.30',
 '1.157.31': '1.157.31',
 '1.157.32': '1.157.32',
 '1.157.33': '1.157.33',
 '1.157.34': '1.157.34',
 '1.158.1': '1.158.1',
 '1.158.2': '1.158.2',
 '1.158.3': '1.158.3',
 '1.158.4': '1.158.4',
 '1.158.5': '1.158.5',
 '1.158.6': '1.158.6',
 '1.158.7': '1.158.7',
 '1.158

In [40]:
import pandas as pd

new_mappings = {}
rows = []

for m in all_mappings:
    for k, v in m.items():
        vol, ch, vid = k.split('_')
        mnc_ch = f'{vol}_{ch}'
        verse_id = f"{volume_chapter_map[mnc_ch]}.{vid}"
        parts = v.split("_")
        result = f"{parts[1]}.{parts[3]}.{parts[4]}"

        new_mappings[verse_id] = result

        flag = 0
        if verse_id != result:
            flag = 1

        rows.append({
            "key": k,
            "verse_id": verse_id,
            "result": result,
            "original_value": v,
            "flag": flag
        })

# Create DataFrame and save as CSV
df = pd.DataFrame(rows)
df.to_csv("mismatched_mappings.csv", index=False)
print("Saved mismatched mappings to mismatched_mappings.csv")


Saved mismatched mappings to mismatched_mappings.csv


In [51]:
len(set(verses.keys()) - set(parsed_maha_dict.keys()))

21153

In [49]:
parsed_maha_dict = {}
for p in parsed_maha:
  parsed_maha_dict[p['id'].replace('M.', '')] = p['words']

In [50]:
parsed_maha_dict

{'1.1.0': [{'word': 'nArAyaRam',
   'lemma': 'nArAyaRa',
   'features': {'pos': 'n', 'g': 'm', 'c': '2', 'n': 's'}},
  {'word': 'namaskftya', 'lemma': 'namaskf', 'features': {'pos': 'vi'}},
  {'word': 'naram',
   'lemma': 'nara',
   'features': {'pos': 'n', 'g': 'm', 'c': '2', 'n': 's'}},
  {'word': 'ca', 'lemma': 'ca', 'features': {'pos': 'i'}},
  {'word': 'eva', 'lemma': 'eva', 'features': {'pos': 'i'}},
  {'word': 'narottamam',
   'lemma': 'narottama',
   'features': {'pos': 'n', 'g': 'm', 'c': '2', 'n': 's'}},
  {'word': 'devIm',
   'lemma': 'devI',
   'features': {'pos': 'n', 'g': 'f', 'c': '2', 'n': 's'}},
  {'word': 'sarasvatIm',
   'lemma': 'sarasvatI',
   'features': {'pos': 'n', 'g': 'f', 'c': '2', 'n': 's'}},
  {'word': 'ca', 'lemma': 'ca', 'features': {'pos': 'i'}},
  {'word': 'eva', 'lemma': 'eva', 'features': {'pos': 'i'}},
  {'word': 'tato', 'lemma': 'tatas', 'features': {'pos': 'i'}},
  {'word': 'jayam',
   'lemma': 'jaya',
   'features': {'pos': 'n', 'g': 'm', 'c': '2'

In [67]:
parsed_joined_verses = {}

for k,v in parsed_maha_dict.items():
    verse_joined = ""
    for _word in v:
      word = _word['word'] + " "
      verse_joined += word
    parsed_joined_verses[k] = verse_joined

In [55]:
import re
from collections import defaultdict

def parse_conllu_entities(path):
    version_map = {}
    entity_pattern = re.compile(r"Entity=\((e\d+)")
    results = defaultdict(list)
    text_map = {}  # <-- new dict: mnd_reference -> # text value
    current_sent = None
    current_mapping = None

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("# sent_id"):
                current_mapping = line.split("=", 1)[1].strip()
            elif line.startswith("# mnd_reference"):
                current_sent = line.split("=", 1)[1].strip()
                version_map[current_sent] = current_mapping
                results[current_sent]  # initialize
                if current_sent == 'vol-i_1_22':
                    print('vol-i_1_22')

            elif line.startswith("# text") and current_sent:
                # capture the full text of the sentence
                text_value = line.split("=", 1)[1].strip()
                text_map[current_sent] = text_value

            elif current_sent and not line.startswith("#") and line:
                cols = line.split("\t")
                if len(cols) >= 10:
                    misc = cols[-1]
                    matches = entity_pattern.findall(misc)
                    for m in matches:
                        results[current_sent].append(m)

    return results, version_map, text_map


In [60]:
all_results = defaultdict(list)
all_mappings = []
all_text = {}
from tqdm import tqdm

for fpath in tqdm(all_conllu_files):
    file_results, version_map, text_map= parse_conllu_entities(fpath)

    for sent_id, entities in file_results.items():
        all_results[sent_id].extend(entities)

    all_text.update(text_map)
    all_mappings.append(version_map)

  5%|▌         | 109/2110 [00:00<00:11, 176.80it/s]

vol-i_1_22


100%|██████████| 2110/2110 [00:11<00:00, 190.97it/s]


In [61]:
all_text

{'vol-i_157_1': 'janamejaya uvAca ekacakrAM gatAste tu kuntIputrA mahAraTAH . ata UrDvaM dvijaSrezWa kimakurvata pARqavAH ..',
 'vol-i_157_2': 'vESampAyana uvAca ekacakrAM gatAste tu kuntIputrA mahAraTAH . UzurnAticiraM kAlaM brAhmaRasya niveSane ..',
 'vol-i_157_3': 'ramaRIyAni paSyanto vanAni viviDAni ca . pArTivAnapi coddeSAn saritaSca sarAMsi ca .. cesarBekzaM tadA te tu sarva eva viSAmpate . baBUvurnAgarARAM ca svErguREH priyadarSanAH ..',
 'vol-i_157_4': 'nivedayanti sma tadA kuntyA BEkzaM sadA niSi . tayA viBaktAn BAgAMste BuYjate sma pfTak pfTak ..',
 'vol-i_157_5': 'arDaM te BuYjate vIrAH saha mAtrA paraMtapAH . arDaM sarvasya BEkzasya BImo BuNkte mahAbalaH ..',
 'vol-i_157_6': "taTA tu tezAM vasatAM tasmin rAzwre mahAtmanAm . aticakrAma sumahAn kAlo'Ta BaratarzaBa ..",
 'vol-i_157_7': 'tataH kadAcid BEkzAya gatAste puruzarzaBAH . saMgatyA BImasenastu tatrAste pfTayA saha ..',
 'vol-i_157_8': 'aTArtijaM mahASabdaM brAhmaRasya niveSane . BfSamutpatitaM GoraM kuntI SuSrAva BArat

In [64]:
all_text_mapped = {}
for k, v in all_text.items():
    vol, ch, vid = k.split('_')
    mnc_ch = f'{vol}_{ch}'
    verse_id = f"{volume_chapter_map[mnc_ch]}.{vid}"

    all_text_mapped[verse_id] = v


In [65]:
all_text_mapped

{'1.157.1': 'janamejaya uvAca ekacakrAM gatAste tu kuntIputrA mahAraTAH . ata UrDvaM dvijaSrezWa kimakurvata pARqavAH ..',
 '1.157.2': 'vESampAyana uvAca ekacakrAM gatAste tu kuntIputrA mahAraTAH . UzurnAticiraM kAlaM brAhmaRasya niveSane ..',
 '1.157.3': 'ramaRIyAni paSyanto vanAni viviDAni ca . pArTivAnapi coddeSAn saritaSca sarAMsi ca .. cesarBekzaM tadA te tu sarva eva viSAmpate . baBUvurnAgarARAM ca svErguREH priyadarSanAH ..',
 '1.157.4': 'nivedayanti sma tadA kuntyA BEkzaM sadA niSi . tayA viBaktAn BAgAMste BuYjate sma pfTak pfTak ..',
 '1.157.5': 'arDaM te BuYjate vIrAH saha mAtrA paraMtapAH . arDaM sarvasya BEkzasya BImo BuNkte mahAbalaH ..',
 '1.157.6': "taTA tu tezAM vasatAM tasmin rAzwre mahAtmanAm . aticakrAma sumahAn kAlo'Ta BaratarzaBa ..",
 '1.157.7': 'tataH kadAcid BEkzAya gatAste puruzarzaBAH . saMgatyA BImasenastu tatrAste pfTayA saha ..',
 '1.157.8': 'aTArtijaM mahASabdaM brAhmaRasya niveSane . BfSamutpatitaM GoraM kuntI SuSrAva BArata ..',
 '1.157.9': 'rorUyamARAMs

In [68]:
parsed_joined_verses

{'1.1.0': 'nArAyaRam namaskftya naram ca eva narottamam devIm sarasvatIm ca eva tato jayam udIrayet ',
 '1.1.1': 'lomaharzaRa putraH ugraSravAH sUtaH pOrARiko nEmiza araRye SOnakasya kula patyuH dvAdaSa vArzike sattre ',
 '1.1.2': 'samAsInAn aByagacCad brahmarzIn saMSita vratAn vinaya avanataH BUtvA kadAcit sUta nandanaH ',
 '1.1.3': 'tam ASramam anuprAptam nEmiza araRya vAsinaH citrAH Srotum kaTAs tatra parivavrus tapasvinaH ',
 '1.1.4': 'aBivAdya munIMs tAMs tu sarvAn eva kftAYjaliH apfcCat sa tapaH vfdDim as ca eva aBinanditaH ',
 '1.1.5': 'aTa tezu upavizwezu sarvezu eva tapasvizu nirdizwam Asanam Beje vinayAt lomaharzaRaH ',
 '1.1.6': 'suKa AsInam tatas tam tu viSrAntam upalakzya ca aTa apfcCat fzis tatra kaScit prastAvayan kaTAH ',
 '1.1.7': 'kuta Agamyate sOte kva ca ayam vihftas tvayA kAlaH kamala pattra akza SaMsa etat pfcCato mama ',
 '1.1.8': 'sUta uvAca janamejayasya rAjarzeH sarpasattre mahAtmanaH samIpe pArTiva indrasya samyak pArikzitasya ca ',
 '1.1.9': 'kfzRadvEpAyana 

In [114]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import Levenshtein  # pip install python-Levenshtein

def align_dicts(verses_A, verses_B, window=10, threshold=0.5):
    """
    Align two verse dictionaries using Levenshtein similarity ratio
    with a sequential sliding window.
    If ratio > threshold (default 0.6), verses are considered matched.
    """

    # Convert dicts to ordered lists
    A_items = list(verses_A.items())
    B_items = list(verses_B.items())

    A_ids, A_texts = zip(*A_items)
    B_ids, B_texts = zip(*B_items)

    results = []
    current_b_idx = 0

    for i in tqdm(range(len(A_texts)), desc="Aligning verses (Levenshtein)"):
        best_j = None
        best_sim = -1

        # Define sliding window in B
        start = max(0, current_b_idx - window)
        end = min(len(B_texts), current_b_idx + window)

        # Compare verse_A[i] to each verse_B[j] within window
        for j in range(start, end):
            sim = Levenshtein.ratio(A_texts[i], B_texts[j])
            if sim > best_sim:
                best_sim = sim
                best_j = j

        # Record match or no-match
        if best_sim >= threshold:
            results.append({
                "A_id": A_ids[i],
                "A_text": A_texts[i],
                "checked_range": f"{start} , {end}",
                "B_id": B_ids[best_j],
                "B_text": B_texts[best_j],
                "similarity": best_sim,
                "status": "matched"
            })
        else:
            results.append({
                "A_id": A_ids[i],
                "A_text": A_texts[i],
                "checked_range": f"{start} , {end}",
                "B_id": None,
                "B_text": None,
                "similarity": best_sim,
                "status": "unmatched"
            })
        current_b_idx = current_b_idx + 1
    return pd.DataFrame(results)


In [75]:
def sort_verses_dict(verse_dict):
    """
    Sorts a dict whose keys are 'vol.ch.verse' like '12.284.20'
    numerically by each part.
    """
    def parse_key(key):
        # Split on dots and convert each part to int safely
        return tuple(int(x) for x in key.split('.'))

    sorted_items = sorted(verse_dict.items(), key=lambda x: parse_key(x[0]))
    return dict(sorted_items)



In [76]:
all_text_mapped_sorted = sort_verses_dict(all_text_mapped)
parsed_joined_verses_sorted = sort_verses_dict(parsed_joined_verses)

In [78]:
parsed_joined_verses_sorted

{'1.1.0': 'nArAyaRam namaskftya naram ca eva narottamam devIm sarasvatIm ca eva tato jayam udIrayet ',
 '1.1.1': 'lomaharzaRa putraH ugraSravAH sUtaH pOrARiko nEmiza araRye SOnakasya kula patyuH dvAdaSa vArzike sattre ',
 '1.1.2': 'samAsInAn aByagacCad brahmarzIn saMSita vratAn vinaya avanataH BUtvA kadAcit sUta nandanaH ',
 '1.1.3': 'tam ASramam anuprAptam nEmiza araRya vAsinaH citrAH Srotum kaTAs tatra parivavrus tapasvinaH ',
 '1.1.4': 'aBivAdya munIMs tAMs tu sarvAn eva kftAYjaliH apfcCat sa tapaH vfdDim as ca eva aBinanditaH ',
 '1.1.5': 'aTa tezu upavizwezu sarvezu eva tapasvizu nirdizwam Asanam Beje vinayAt lomaharzaRaH ',
 '1.1.6': 'suKa AsInam tatas tam tu viSrAntam upalakzya ca aTa apfcCat fzis tatra kaScit prastAvayan kaTAH ',
 '1.1.7': 'kuta Agamyate sOte kva ca ayam vihftas tvayA kAlaH kamala pattra akza SaMsa etat pfcCato mama ',
 '1.1.8': 'sUta uvAca janamejayasya rAjarzeH sarpasattre mahAtmanaH samIpe pArTiva indrasya samyak pArikzitasya ca ',
 '1.1.9': 'kfzRadvEpAyana 

In [115]:
df = align_dicts(all_text_mapped_sorted, parsed_joined_verses_sorted, window=50, threshold=0.5)
df.to_csv("Temp/alignment_5.csv", index=False)
num_unmatched = (df["status"] == "unmatched").sum()
print(f"Unmatched verses: {num_unmatched}")


Aligning verses (Levenshtein): 100%|██████████| 73590/73590 [00:09<00:00, 8025.04it/s]


Unmatched verses: 67838


In [90]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

In [86]:
import math
from collections import Counter

def cosine_characters(v1, v2):
    c1, c2 = Counter(v1), Counter(v2)
    chars = set(c1) | set(c2)
    dot = sum(c1[ch]*c2[ch] for ch in chars)
    mag1 = math.sqrt(sum(v**2 for v in c1.values()))
    mag2 = math.sqrt(sum(v**2 for v in c2.values()))
    return dot / (mag1*mag2) if mag1 and mag2 else 0.0


In [96]:
import Levenshtein
print(Levenshtein.ratio(all_text_mapped_sorted['1.1.3'], parsed_joined_verses_sorted['1.1.5']))


0.33720930232558144


In [100]:
# Count unmatched
num_unmatched = (df["status"] == "unmatched").sum()
print(f"Unmatched verses: {num_unmatched}")


Unmatched verses: 73573


In [110]:
import pandas as pd
from collections import Counter

def count_verses_per_chapter(keys, output_csv="chapter_counts.csv"):
    """
    Given a list of keys like '1.2.3' (vol.ch.verse),
    count how many verses each chapter has and save to CSV.
    """
    chapter_counts = Counter()

    for k in keys:
        try:
            vol, ch, verse = k.split(".")
            chapter_key = f"{vol}.{ch}"
            chapter_counts[chapter_key] += 1
        except ValueError:
            # Skip malformed keys
            continue

    df = pd.DataFrame([
        {"Volume_Chapter": k, "Verse_Count": v}
        for k, v in sorted(chapter_counts.items(),
                           key=lambda x: [int(i) for i in x[0].split(".")])
    ])

    df.to_csv(output_csv, index=False)
    print(f"Saved chapter counts to {output_csv}")
    return df


In [112]:
df_1 = count_verses_per_chapter(list(all_text_mapped_sorted.keys()), output_csv = "Temp/mndutt_chapters.csv")
df_2 = count_verses_per_chapter(list(parsed_joined_verses_sorted.keys()), output_csv = "Temp/parse_chapters.csv")

Saved chapter counts to Temp/mndutt_chapters.csv
Saved chapter counts to Temp/parse_chapters.csv


In [113]:
df_1.shape, df_2.shape

((2108, 2), (1976, 2))

In [117]:
import re

def parse_conllu_to_dict(filepath):
    """
    Parse a .conllu file and return a dict:
    { mnd_ref: {"entities": [...], "speaker": "..." } }
    """
    result = {}
    current_ref = None
    current_entities = set()
    current_speaker = ""

    with open(filepath, encoding="utf-8") as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        line = line.strip()

        # Start of new verse block
        if line.startswith("# sent_id"):
            # Save previous verse (if any)
            if current_ref:
                result[current_ref] = {
                    "entities": sorted(current_entities),
                    "speaker": current_speaker
                }

            # Reset for next verse
            current_ref = None
            current_entities = set()
            current_speaker = ""

        # Extract mnd_reference
        elif line.startswith("# mnd_reference"):
            current_ref = line.split("=", 1)[1].strip()

        # Parse token lines
        elif re.match(r"^\d+\t", line):
            cols = line.split("\t")
            token_id = cols[0]
            token = cols[1]
            misc = cols[-1] if len(cols) > 8 else ""

            # Extract entity info
            if "Entity=" in misc:
                ent_matches = re.findall(r"Entity=\(([^)]+)\)", misc)
                for e in ent_matches:
                    # keep only the entity ID part
                    entity_id = e.split("|")[0] if "|" in e else e
                    current_entities.add(entity_id.strip())

            # Detect "uvAca" and capture first token as speaker
            if token == "uvAca":
                # search for line with token_id == 1
                for back in range(i - 1, -1, -1):
                    prev_line = lines[back].strip()
                    if re.match(r"^1\t", prev_line):
                        first_cols = prev_line.split("\t")
                        current_speaker = first_cols[1]
                        break

    # Append last verse
    if current_ref:
        result[current_ref] = {
            "entities": sorted(current_entities),
            "speaker": current_speaker
        }

    return result


In [118]:
all_text = {}
from tqdm import tqdm

for fpath in tqdm(all_conllu_files):
    result = parse_conllu_to_dict(fpath)

    all_text.update(result)


100%|██████████| 2110/2110 [00:15<00:00, 136.83it/s]


In [119]:
all_text_mapped = {}
for k, v in all_text.items():
    vol, ch, vid = k.split('_')
    mnc_ch = f'{vol}_{ch}'
    verse_id = f"{volume_chapter_map[mnc_ch]}.{vid}"

    all_text_mapped[verse_id] = v


In [121]:
all_text_mapped_sorted = sort_verses_dict(all_text_mapped)
write_json(all_text_mapped_sorted, os.path.join(corpus_folder, 'entities_speakers_verses.json'))