In [1]:
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [2]:
import json

# Load the validation questions and the corpus JSON files (paths are relative
# to the current working directory).
# The encoding is passed explicitly: without it open() uses the platform
# locale, which may not be UTF-8 and can mis-decode non-ASCII JSON text.
with open('Dataset/Validation/mahabharata_questions.json', 'r', encoding='utf-8') as f:
    mahabharata_questions = json.load(f)

with open('Dataset/Corpus/entities_speakers_verses.json', 'r', encoding='utf-8') as f:
    entities_speakers_verses = json.load(f)

with open('Dataset/Corpus/verses.json', 'r', encoding='utf-8') as f:
    verses = json.load(f)

with open('Dataset/Corpus/chapters.json', 'r', encoding='utf-8') as f:
    chapters = json.load(f)

with open('Dataset/Corpus/entities_kb.json', 'r', encoding='utf-8') as f:
    entities_kb = json.load(f)

### Normalize script

In [6]:
pip install indic-transliteration

Collecting indic-transliteration
  Downloading indic_transliteration-2.3.75-py3-none-any.whl.metadata (1.4 kB)
Collecting backports.functools-lru-cache (from indic-transliteration)
  Downloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting roman (from indic-transliteration)
  Downloading roman-5.1-py3-none-any.whl.metadata (4.2 kB)
Downloading indic_transliteration-2.3.75-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.6/159.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Downloading roman-5.1-py3-none-any.whl (5.8 kB)
Installing collected packages: roman, backports.functools-lru-cache, indic-transliteration
Successfully installed backports.functools-lru-cache-2.0.0 indic-transliteration-2.3.75 roman-5.1


In [4]:
def build_cluster_map(kb):
    """
    Map every entity ID in ``kb`` to the ID of its cluster head.

    An entry whose ``"cluster_head"`` value is truthy is a head and maps to
    itself. Any other entry maps to the head that lists it under
    ``"aliases"`` (when several heads list it, the one iterated last wins),
    or to ``None`` when no head lists it.
    """
    # Pass 1: for each head, record which alias IDs it owns.
    head_of_alias = {}
    for head_id, head_entry in kb.items():
        if not head_entry.get("cluster_head"):
            continue
        head_of_alias.update(dict.fromkeys(head_entry.get("aliases", []), head_id))

    # Pass 2: resolve every KB entry; being a head takes priority over
    # being listed as somebody else's alias.
    return {
        eid: eid if entry.get("cluster_head") else head_of_alias.get(eid)
        for eid, entry in kb.items()
    }


In [5]:
# Entity ID -> cluster-head ID lookup built from the loaded knowledge base.
entity_map = build_cluster_map(entities_kb)


In [6]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from collections import defaultdict

# Create chapter-wise aggregator: "<vol>.<chap>" -> {"entities": set, "speakers": set}
chapter_data = defaultdict(lambda: {"entities": set(), "speakers": set()})

for key, val in entities_speakers_verses.items():
    # Verse keys must have exactly three dot-separated parts; anything else
    # raises ValueError on this unpacking.
    vol, chap, verse = key.split('.')
    chap_key = f"{vol}.{chap}"

    # Add entities. `or []` also covers an explicit JSON null, which
    # .get(..., []) alone would pass through and fail to iterate.
    for e in val.get("entities") or []:
        # Extract name part after '--'
        parts = e.split('--')
        if len(parts) == 2:
            eid, name = parts
            # Note: the target scheme is ITRANS, despite the *_iast variable name.
            name_iast = transliterate(name, sanscript.SLP1, sanscript.ITRANS).lower()
            chapter_data[chap_key]["entities"].add(f"{eid}--{name_iast}")
        else:
            # Not exactly one '--' separator: keep the raw string untouched.
            chapter_data[chap_key]["entities"].add(e)

    # Add speaker if not empty. `or ""` keeps a JSON null from crashing .strip().
    # NOTE(review): entity names are lowercased after transliteration but
    # speaker names are not - confirm whether that difference is intended.
    speaker = (val.get("speaker") or "").strip()
    if speaker:
        speaker_iast = transliterate(speaker, sanscript.SLP1, sanscript.ITRANS)
        chapter_data[chap_key]["speakers"].add(speaker_iast)

# Convert sets to sorted lists (deterministic order, JSON-serializable)
for chap_key in chapter_data:
    chapter_data[chap_key]["entities"] = sorted(chapter_data[chap_key]["entities"])
    chapter_data[chap_key]["speakers"] = sorted(chapter_data[chap_key]["speakers"])

# Result: a plain dict, so later lookups of unknown chapters raise KeyError
# instead of silently creating empty entries.
chapter_data = dict(chapter_data)

In [None]:
# Inspect the per-chapter aggregation (displays the whole dict, so the output is large).
chapter_data

{'1.1': {'entities': ['e10091-person--subala',
   'e10104-person--subhadra',
   'e10113-person--subhraj',
   'e10216-person--suhotra',
   'e10242-person--sukratu',
   'e1025-person--avikshit',
   'e10351-person--supratika',
   'e10362-person--sura',
   'e10478-person--surya',
   'e10920-location--tridiva',
   'e11049-person--ushinara',
   'e11092-misc--udyogaparvan',
   'e11176-misc--upanishad',
   'e11255-person--uttara',
   'e11327-person--vaishampayana',
   'e11672-person--vasu',
   'e11755-person--vayu',
   'e11773-misc--veda',
   'e11782-misc--veda~nga',
   'e118-person--ashvatthaman',
   'e11803-person--vena',
   'e1185-person--balhika',
   'e11856-person--vichitravirya',
   'e11959-person--vishvedevah',
   'e11999-person--vidura',
   'e12036-person--vijaya',
   'e12062-person--vikramin',
   'e12167-person--virataparvan',
   'e12207-person--vishnu',
   'e12218-person--vishvagashva',
   'e12237-person--vitihotra',
   'e123-person--ashvin',
   'e12416-person--vyasa',
   'e12459-per

In [7]:
import re

def standardize_entities(chapter_data, cluster_map):
    """
    Rewrite each chapter's entity strings so they carry their cluster-head ID.

    ``chapter_data`` maps a chapter ID to a dict with ``"entities"`` and
    ``"speakers"`` lists. Every entity string of the form ``e<digits>`` plus
    an optional ``-...`` suffix has its ID replaced by ``cluster_map[id]``.
    IDs missing from ``cluster_map`` are kept as they are; IDs mapped to a
    falsy value such as ``None`` are dropped, as are strings that do not
    match the pattern. Duplicates are removed, keeping the first occurrence.
    Speakers are passed through unchanged.
    """
    entity_pattern = re.compile(r"^(e\d+)(-.+)?$")
    result = {}

    for chap_id, data in chapter_data.items():
        rewritten = []
        for ent in data.get("entities", []):
            match = entity_pattern.match(ent)
            if match is None:
                continue

            eid = match.group(1)
            suffix = match.group(2) or ""

            # Unknown IDs fall back to themselves; known IDs with no head
            # (falsy mapping) are skipped.
            head = cluster_map.get(eid, eid)
            if not head:
                continue
            rewritten.append(f"{head}{suffix}")

        result[chap_id] = {
            # dict.fromkeys keeps first-seen order while removing repeats
            "entities": list(dict.fromkeys(rewritten)),
            "speakers": data.get("speakers", []),
        }

    return result


In [8]:
# Map every chapter's entities onto their cluster heads, then print one
# chapter ('6.101') as a spot check.
standardized = standardize_entities(chapter_data, entity_map)
print(standardized['6.101'])


{'entities': ['e10478-person--surya', 'e1130-person--bala', 'e12384-person--vrritra', 'e12553-person--yama', 'e12635-misc--yuga', 'e13-person--abhimanyu', 'e1335-person--bharata', 'e1497-person--bhishma', 'e2376-person--shambara', 'e3078-person--danava', 'e3508-person--dhrritarashtra', 'e3652-person--draupadeya', 'e3676-person--drona', 'e3805-person--duryodhana', 'e386-person--alambusha', 'e4618-person--indra', 'e4869-person--jayadratha', 'e5903-person--krripa', 'e7257-person--maya', 'e7646-misc--nishachara', 'e7969-person--pandava', 'e801-person--arjuna', 'e8633-person--prativindhya', 'e9460-person--sa~njaya', 'e9973-person--soma'], 'speakers': ['sa~njaya']}


In [None]:
# Inspect the standardized per-chapter data (displays the whole dict, so the output is large).
standardized

{'1.1': {'entities': ['e10091-person--subala',
   'e10104-person--subhadra',
   'e10113-person--subhraj',
   'e10216-person--suhotra',
   'e10242-person--sukratu',
   'e1025-person--avikshit',
   'e10351-person--supratika',
   'e10362-person--sura',
   'e10478-person--surya',
   'e10920-location--tridiva',
   'e11049-person--ushinara',
   'e11092-misc--udyogaparvan',
   'e11176-misc--upanishad',
   'e11255-person--uttara',
   'e11327-person--vaishampayana',
   'e11672-person--vasu',
   'e11755-person--vayu',
   'e11773-misc--veda',
   'e11782-misc--veda~nga',
   'e118-person--ashvatthaman',
   'e11803-person--vena',
   'e1185-person--balhika',
   'e11856-person--vichitravirya',
   'e11959-person--vishvedevah',
   'e11999-person--vidura',
   'e12036-person--vijaya',
   'e12062-person--vikramin',
   'e12167-person--virataparvan',
   'e12207-person--vishnu',
   'e12218-person--vishvagashva',
   'e12237-person--vitihotra',
   'e123-person--ashvin',
   'e12416-person--vyasa',
   'e12459-per

In [None]:
# Save the standardized per-chapter entities/speakers as indented JSON.
with open('Dataset/Corpus/chapter_entities_speakers.json', 'w') as out_file:
    json.dump(standardized, out_file, indent=4)

In [3]:
!pip install RapidFuzz

Collecting RapidFuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/3.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m49.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RapidFuzz
Successfully installed RapidFuzz-3.14.3


In [9]:
from rapidfuzz import fuzz, process

from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Build a map from normalized key text → entity ID

from collections import defaultdict

# Lowercased ITRANS form of each KB entry's "key" -> list of entity IDs sharing it.
entity_index = defaultdict(list)
for entity_id, kb_entry in entities_kb.items():
    normalized_key = transliterate(kb_entry["key"], sanscript.SLP1, sanscript.ITRANS).lower()
    entity_index[normalized_key].append(entity_id)

def extract_query_entities(query_word, entity_index, threshold=90):
    """
    Fuzzy-match ``query_word`` against the keys of ``entity_index``.

    The five best candidates under ``fuzz.token_sort_ratio`` are requested
    and only those scoring at least ``threshold`` are kept.

    Returns three parallel lists: the ``entity_index`` values of the kept
    keys, the kept keys themselves, and their scores.
    """
    candidates = process.extract(
        query_word,
        list(entity_index.keys()),
        scorer=fuzz.token_sort_ratio,
        limit=5,
    )
    # Each candidate is (matched key, score, extra); the third field is unused.
    kept = [(name, score) for name, score, _ in candidates if score >= threshold]

    matched_entities = [entity_index[name] for name, _ in kept]
    matches_names = [name for name, _ in kept]
    scores = [score for _, score in kept]
    return matched_entities, matches_names, scores



In [10]:
# Demo: look up each whitespace-separated word of a sample query on its own
# and print whatever clears the default match threshold.
query = "Tell me about Vishnu and Garuda in battle"
for query_word in query.split():
    query_entities, matches_names, scores = extract_query_entities(
        query_word.lower(), entity_index
    )
    for entity_ids, matched_name, match_score in zip(query_entities, matches_names, scores):
        print(entity_ids, query_word, matched_name, match_score)


['e12207', 'e12208', 'e12209'] Vishnu vishnu 100.0
['e4059', 'e4060', 'e4061', 'e4062'] Garuda garuda 100.0


In [12]:
# Save the normalized-name -> entity-ID index as indented JSON.
with open('Dataset/Corpus/entity_index.json', 'w') as out_file:
    json.dump(entity_index, out_file, indent=4)

In [22]:
# Spot-check a slice of the index (positions 8500-8799 in insertion order).
for k in list(entity_index)[8500:8800]:
    print(k, entity_index[k])

vishvarupadhrrik e11944
vishvasambhava e11945
vishvasattama e11946
vishvasena e11947
vishvasrrij e1194820
vishvatman e11949
vishvatomukha e11950
vishvavasa e11951
vishvavasu e11954
vishvavasumokshana e11955
vishvavati e11956
vishvayoni e11957
vishvayu e11958
vishvedevah e11959
vishvesha e11960
vishveshvara e11961
vishveshvarasthana e11962
vishvedeva e11964
vishvopakhyana e11965
vidanda e11966
vidabha e11967
vidarana e11968
vidarbha e11971
vidarbhadhipanandini e11972
vidarbhadhipati e11973
vidarbhapati e11974
vidarbharaj e11975
vidarbharaja e11978
vidarbharajan e11979
vidarbharajatanaya e11980
vidarbhatanaya e11981
videha e11983
videharaja e11985
videharajaduhitrri e11986
videharajan e11987
vidhana e11988
vidharma e11989
vidhatrri e11991
vidheyatman e11992
vidhi e11993
vidisha e11994
vidishah e11995
vidigbhanu e11996
vidula e11997
vidulaputrashasana e11998
vidura e12000
viduragamana e12001
viduragamanaparvan e12002
viduraparinaya e12003
viduratha e12005
vidurathasuta e12006
vidvat e1200

In [None]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from collections import defaultdict

# Shape of the input, for reference:
# entities_speakers_verses = {
#     "1.1.1": {"entities": ["E1--rAma", "E2--sItA"], "speaker": "vAlmIki"},
#     "1.1.2": {"entities": ["E1--rAma", "E3--hanumAn"], "speaker": "vAlmIki"},
#     ...
# }

# Per-chapter occurrence counts: chap_id -> {entity: count}
chapter_data = defaultdict(lambda: defaultdict(int))

for key, val in entities_speakers_verses.items():
    # Verse keys split into exactly three dot-separated parts.
    vol, chap, verse = key.split('.')
    chap_key = f"{vol}.{chap}"

    for e in val.get("entities", []):
        parts = e.split('--')
        if len(parts) != 2:
            # Not exactly one '--' separator: count the raw string as-is.
            entity_key = e
        else:
            # Transliterate only the name part (SLP1 -> ITRANS, lowercased).
            eid, name = parts
            name_iast = transliterate(name, sanscript.SLP1, sanscript.ITRANS).lower()
            entity_key = f"{eid}--{name_iast}"

        # One more occurrence of this entity in this chapter.
        chapter_data[chap_key][entity_key] += 1

# Freeze the nested defaultdicts into plain dicts.
chapter_data = {chap_id: dict(counts) for chap_id, counts in chapter_data.items()}




In [None]:
chapter_data

{'1.1': {'e12207-person--vishnu': 5,
  'e4679-person--itihasa': 1,
  'e7587-person--nara': 2,
  'e9540-person--sarasvati': 1,
  'e2579-person--shaunaka': 1,
  'e7498-person--naimisha': 2,
  'e8267-misc--pauranika': 1,
  'e9817-person--sauti': 11,
  'e11327-person--vaishampayana': 4,
  'e4777-person--janamejaya': 3,
  'e6721-misc--mahabharata': 16,
  'e6276-location--kurukshetra': 1,
  'e6239-person--kuru': 3,
  'e7969-person--pandava': 15,
  'e1743-person--brahman': 5,
  'e8769-misc--purana': 9,
  'e10362-person--sura': 2,
  'e1782-person--brahmarshi': 2,
  'e11773-misc--veda': 13,
  'e12416-person--vyasa': 9,
  'e1697-person--brahma': 1,
  'e4679-misc--itihasa': 5,
  'e9365-misc--samhita': 1,
  'e4618-person--indra': 6,
  'e5907-person--krrishna': 23,
  'e8806-person--purusha': 2,
  'e12635-misc--yuga': 4,
  'e2661-person--shiva': 3,
  'e11959-person--vishvedevah': 1,
  'e123-person--ashvin': 1,
  'e1741-person--brahman': 9,
  'e177-person--aditya': 1,
  'e3015-person--daksha': 1,
  '

In [None]:
import re
from collections import defaultdict

def standardize_entities(chapter_data, cluster_map):
    """
    Re-key per-chapter entity counts by cluster-head ID, merging aliases.

    Args:
        chapter_data: ``{chap_id: {entity_key: count}}`` where an entity key
            looks like ``e2304-person--shakuni``.
        cluster_map: ``{entity_id: head_id}``; the value may be ``None`` for
            an entity that belongs to no cluster.

    Returns:
        ``{chap_id: {standardized_key: count}}``. Counts of keys that
        collapse onto the same standardized key are summed. Keys that do not
        match the expected pattern are skipped.
    """
    # <id>, optional '-<type>' part, optional '--<name>' part
    entity_pattern = re.compile(r"^(e\d+)(-[^-]+)?(--.+)?$", re.IGNORECASE)
    standardized = {}

    for chap_id, entities in chapter_data.items():
        new_entities = defaultdict(int)

        for ent, count in entities.items():
            # Match things like e2304-person--shakuni
            m = entity_pattern.match(ent)
            if not m:
                continue

            eid, mid_part, suffix = m.groups()
            mid_part = mid_part or ""   # e.g. '-person'
            suffix = suffix or ""       # e.g. '--shakuni'

            # Map to cluster head, falling back to the entity itself both
            # when the ID is unknown and when it is mapped to None.
            # (.get(eid, eid) would return the stored None and produce keys
            # such as 'None-person--shakuni'.)
            head = cluster_map.get(eid) or eid

            # Build standardized entity key
            standardized_key = f"{head}{mid_part}{suffix}"

            # Accumulate occurrences of everything that maps to this key
            new_entities[standardized_key] += count

        standardized[chap_id] = dict(new_entities)

    return standardized


In [None]:
# Re-key the per-chapter counts by cluster head, then display the result
# (the whole dict, so the output is large).
standardized = standardize_entities(chapter_data, entity_map)
standardized

{'1.1': {'e12207-person--vishnu': 5,
  'e4679-person--itihasa': 1,
  'e7587-person--nara': 2,
  'e9540-person--sarasvati': 1,
  'e2579-person--shaunaka': 1,
  'e7498-person--naimisha': 2,
  'e8267-misc--pauranika': 1,
  'e9817-person--sauti': 11,
  'e11327-person--vaishampayana': 4,
  'e4777-person--janamejaya': 3,
  'e6721-misc--mahabharata': 16,
  'e6276-location--kurukshetra': 1,
  'e6239-person--kuru': 3,
  'e7969-person--pandava': 15,
  'e1743-person--brahman': 5,
  'e8769-misc--purana': 9,
  'e10362-person--sura': 2,
  'e1782-person--brahmarshi': 2,
  'e11773-misc--veda': 13,
  'e12416-person--vyasa': 9,
  'e1697-person--brahma': 1,
  'e4679-misc--itihasa': 5,
  'e9365-misc--samhita': 1,
  'e4618-person--indra': 6,
  'e5907-person--krrishna': 23,
  'e8806-person--purusha': 2,
  'e12635-misc--yuga': 4,
  'e2661-person--shiva': 3,
  'e11959-person--vishvedevah': 1,
  'e123-person--ashvin': 1,
  'e1741-person--brahman': 9,
  'e177-person--aditya': 1,
  'e3015-person--daksha': 1,
  '

In [14]:
# Check which cluster head is recorded for entity e12207.
entity_map['e12207']

'e12207'

In [None]:
# Save the standardized per-chapter entity counts as indented JSON.
with open('Dataset/Corpus/chapter_entities.json', 'w') as out_file:
    json.dump(standardized, out_file, indent=4)