----

# Installations

In [1]:
!pip install faiss-cpu rouge_score neo4j langchain_community sentence-transformers transformers

Defaulting to user installation because normal site-packages is not writeable




--------

# Please enter your Groq API key and your Neo4j credentials

In [None]:
import requests
import re
from dateutil import parser
from datetime import datetime
import json
import os
import openai
from neo4j import GraphDatabase
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import unicodedata
import numpy as np
from numpy.linalg import norm
from kaggle_secrets import UserSecretsClient
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import time
from datetime import datetime, timedelta
from collections import defaultdict, deque
import pickle
from sentence_transformers import SentenceTransformer, util
import calendar
import faiss
import spacy
import ast


# spaCy English pipeline, loaded once for the whole notebook.
nlp = spacy.load("en_core_web_sm")

# Make sure the NLTK 'punkt' tokenizer data is installed.
# nltk.data.find() reports a missing resource by raising LookupError, so that
# is the only exception that should trigger the download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Credentials are read from the Kaggle secrets store so they never appear in
# the notebook itself. One client is enough for all three lookups.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("NEO4J_PASSWORD")
secret_value_1 = user_secrets.get_secret("NEO4J_USERNAME")
secret_value = user_secrets.get_secret("GROQ_API_KEY")

# Groq chat-completion settings.
GROQ_API_KEY = secret_value
MODEL = "llama3-70b-8192"

# Sentence embedding model shared by the notebook.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Neo4j connection.
NEO4J_URI = "neo4j+s://5d3576c6.databases.neo4j.io"  # Or your Neo4j Aura URI
NEO4J_USER = secret_value_1
NEO4J_PASSWORD = secret_value_0
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


----

# Main code for the Manual Graph class

In [37]:
class Node:
    """One subject of a daily graph together with its outgoing edges.

    Attributes:
        subject: the subject text this node represents.
        edges: maps a predicate to the list of objects reached through it.
        predicate_embeddings: maps a predicate to its embedding vector.
    """

    def __init__(self, subject):
        self.predicate_embeddings = {}
        self.edges = defaultdict(list)
        self.subject = subject

    def add_edge(self, predicate, object_, embedder):
        """Record ``subject --predicate--> object_``.

        The predicate is embedded with ``embedder.encode`` only the first time
        it is seen on this node; later edges reuse the stored vector.
        """
        self.edges[predicate].append(object_)
        known = self.predicate_embeddings
        if predicate in known:
            return
        known[predicate] = embedder.encode(predicate, convert_to_numpy=True)

class Hash:
    """Date-addressed table of daily knowledge graphs with embedding-based search.

    Every date maps to one slot of ``arr``. A slot is either ``None`` (nothing
    stored for that day) or a dict ``subject -> Node``. Subjects and predicates
    are embedded so that queries can be matched by similarity score instead of
    exact text.

    Attributes:
        start_index: ``date_to_value`` of the first date the table covers.
        arr: the slot list, ``capacity`` entries long.
        size: number of ``insert_news`` calls that completed.
        embedder: object whose ``encode(text, convert_to_numpy=True)`` returns a vector.
        subject_embeddings: subject text -> embedding, across all dates.
        subject_index: FAISS index over ``subject_embeddings`` or ``None``
            (not built yet / FAISS unavailable).

    The instance attribute names are part of the pickle written by ``save``;
    keep them stable so previously saved graphs continue to load.
    """

    capacity = 10000            # number of date slots allocated per instance
    similarity_threshold = 0.5  # default minimum score for a match

    def __init__(self, initial_date, embedder=None):
        """Create an empty table whose first slot is ``initial_date`` (mm/dd/yyyy).

        Args:
            initial_date: earliest date the table can hold.
            embedder: optional embedding model to reuse; when omitted a new
                ``SentenceTransformer('all-MiniLM-L6-v2')`` is loaded.
        """
        self.start_index = self.date_to_value(initial_date)
        self.arr = [None] * self.capacity
        self.size = 0
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2') if embedder is None else embedder
        self.subject_embeddings = {}  # subject -> embedding mapping
        self.subject_index = None     # FAISS index, built lazily by _build_subject_index
        self._subject_embeddings_updated = False  # True when the index is stale

    def date_to_value(self, date):
        """Map an ``mm/dd/yyyy`` string to an integer that grows with the date.

        Every month is given 31 values and every year 372, so consecutive
        calendar days are not always consecutive integers (02/28 and 03/01 of
        a non-leap year are 4 apart).

        Raises:
            ValueError: if ``date`` does not match ``%m/%d/%Y``.
        """
        date_obj = datetime.strptime(date, "%m/%d/%Y")
        return date_obj.year * 372 + date_obj.month * 31 + date_obj.day

    def _slot_index(self, date):
        """Return the position of ``date`` in ``arr``, or ``None`` if it is outside the table."""
        index = self.date_to_value(date) - self.start_index
        if 0 <= index < len(self.arr):
            return index
        return None

    def _graph_for(self, date):
        """Return the graph stored for ``date``; ``None`` when empty or out of range."""
        index = self._slot_index(date)
        return None if index is None else self.arr[index]

    def insert_news(self, date, triplets):
        """Add ``triplets`` to the graph of ``date``.

        Args:
            date: ``mm/dd/yyyy`` string.
            triplets: iterable of dicts with 'subject', 'predicate', 'object'.

        Raises:
            IndexError: if ``date`` is before the start date or beyond the
                table's capacity. (A negative index used to wrap around
                silently and overwrite an unrelated date.)
        """
        index = self._slot_index(date)
        if index is None:
            raise IndexError(f"Date {date} is outside the range covered by this Hash")

        if self.arr[index] is None:
            self.arr[index] = {}  # New graph: subject -> Node
        graph = self.arr[index]

        for triplet in triplets:
            subject = triplet['subject']
            predicate = triplet['predicate']
            object_ = triplet['object']

            if subject not in graph:
                graph[subject] = Node(subject)
                # Embed each distinct subject once, across all dates.
                if subject not in self.subject_embeddings:
                    self.subject_embeddings[subject] = self.embedder.encode(subject, convert_to_numpy=True)
                    self._subject_embeddings_updated = True

            graph[subject].add_edge(predicate, object_, self.embedder)

        self.size += 1

    def _build_subject_index(self):
        """Rebuild the FAISS subject index if subjects were added since the last build.

        Sets ``subject_index`` to ``None`` when FAISS cannot be imported or
        there is nothing to index; callers then use brute-force search.
        """
        if not self._subject_embeddings_updated:
            return

        if not self.subject_embeddings:
            self.subject_index = None
            self._subject_embeddings_updated = False
            return

        try:
            import faiss
        except ImportError:
            print("FAISS not available, using brute-force search")
            self.subject_index = None
            self._subject_embeddings_updated = False
            return

        embeddings = np.array(list(self.subject_embeddings.values())).astype('float32')
        self.subject_index = faiss.IndexFlatIP(embeddings.shape[1])
        self.subject_index.add(embeddings)
        self._subject_embeddings_updated = False
        print("Subject index rebuilt")

    def find_similar_subjects(self, query_subject, threshold=None):
        """Return ``(subject, score)`` pairs with score >= ``threshold``, best first.

        Uses the FAISS index when one is available and brute-force
        ``util.cos_sim`` otherwise. ``threshold`` defaults to
        ``similarity_threshold``.
        """
        if threshold is None:
            threshold = self.similarity_threshold

        if not self.subject_embeddings:
            return []

        query_embedding = self.embedder.encode(query_subject, convert_to_numpy=True)

        # Refresh first, THEN decide which path to take: the refresh itself may
        # discover that FAISS is missing and clear the index.
        self._build_subject_index()
        index = self.subject_index

        # hasattr guards against objects unpickled from older versions, where
        # the attribute started life as an empty dict.
        if index is not None and hasattr(index, "search"):
            query_matrix = np.array([query_embedding]).astype('float32')
            D, I = index.search(query_matrix, len(self.subject_embeddings))

            # Index rows were added in dict order, so row i is the i-th key.
            subjects = list(self.subject_embeddings.keys())
            results = []
            for i, score in zip(I[0], D[0]):
                if i == -1 or score < threshold:
                    continue
                results.append((subjects[i], score))
            return sorted(results, key=lambda x: -x[1])

        # Fallback: compare against every stored subject.
        results = []
        for subject, embedding in self.subject_embeddings.items():
            score = util.cos_sim(query_embedding, embedding).item()
            if score >= threshold:
                results.append((subject, score))
        return sorted(results, key=lambda x: -x[1])

    def _filtered_graph(self, graph, question_type):
        """Return a copy of ``graph`` restricted to the triplets kept for ``question_type``.

        "Date" keeps what ``filter_triplets_with_dates_or_numbers`` returns and
        "Person" what ``filter_triplets_with_names_or_locations`` returns.
        """
        triplets = [
            {"subject": subject, "predicate": predicate, "object": obj}
            for subject, node in graph.items()
            for predicate, objects in node.edges.items()
            for obj in objects
        ]
        if question_type == "Date":
            triplets = filter_triplets_with_dates_or_numbers(triplets)
        elif question_type == "Person":
            triplets = filter_triplets_with_names_or_locations(triplets)

        filtered_graph = {}
        for triplet in triplets:
            subject = triplet['subject']
            predicate = triplet['predicate']
            obj = triplet['object']
            source_node = graph[subject]

            node = filtered_graph.get(subject)
            if node is None:
                node = filtered_graph[subject] = Node(subject)
                if subject in self.subject_embeddings:
                    node.predicate_embeddings = source_node.predicate_embeddings.copy()

            node.edges[predicate].append(obj)
            if predicate not in node.predicate_embeddings:
                # Reuse the stored vector; encode only if it is really missing.
                embedding = source_node.predicate_embeddings.get(predicate)
                if embedding is None:
                    embedding = self.embedder.encode(predicate, convert_to_numpy=True)
                node.predicate_embeddings[predicate] = embedding

        return filtered_graph

    @staticmethod
    def _result(subject, subject_score, predicate, predicate_score, obj, path, hops):
        """Build one search-result record."""
        return {
            'subject': subject,
            'subject_score': subject_score,
            'predicate': predicate,
            'predicate_score': predicate_score,
            'object': obj,
            'path': path,
            'hops': hops,
            'total_score': subject_score + predicate_score
        }

    def search(self, date, subject_query, predicate_query,question_type, max_hops=4,threshold=None):
        """Search one day's graph with soft matching on subject and predicate.

        Args:
            date: ``mm/dd/yyyy`` string.
            subject_query: subject text to match by similarity.
            predicate_query: predicate text to match exactly and by similarity.
            question_type: "Date" or "Person" restricts the graph first; any
                other value searches the whole graph.
            max_hops: breadth-first expansion depth (0 disables it).
            threshold: minimum subject and predicate score; defaults to
                ``similarity_threshold``.

        Returns:
            Result dicts sorted by ``total_score`` descending. If none reaches
            the threshold, the single best result is returned instead. An empty
            list means nothing is stored for ``date`` (including dates outside
            the table) or no subject matched. A direct edge can appear more
            than once: as exact match, as soft match and as first hop.
        """
        if threshold is None:
            threshold = self.similarity_threshold

        graph = self._graph_for(date)
        if graph is None:
            return []

        if question_type in ["Date", "Person"]:
            graph = self._filtered_graph(graph, question_type)

        # threshold=0 keeps every non-negative match; filtering happens at the end.
        similar_subjects = self.find_similar_subjects(subject_query, threshold=0)
        if not similar_subjects:
            return []

        results = []
        query_pred_embedding = self.embedder.encode(predicate_query, convert_to_numpy=True)

        for matched_subject, subject_score in similar_subjects:
            if matched_subject not in graph:
                continue

            node = graph[matched_subject]

            # Exact predicate match gets a perfect predicate score.
            if predicate_query in node.edges:
                for obj in node.edges[predicate_query]:
                    results.append(self._result(
                        matched_subject, subject_score, predicate_query, 1.0, obj, None, 0))

            # Soft matching over every predicate of this node.
            for pred, emb in node.predicate_embeddings.items():
                pred_score = util.cos_sim(query_pred_embedding, emb).item()
                # Only predicates that still have objects produce results.
                if pred in node.edges:
                    for obj in node.edges[pred]:
                        results.append(self._result(
                            matched_subject, subject_score, pred, pred_score, obj, None, 0))

            # Breadth-first multi-hop expansion starting from this node.
            if max_hops > 0:
                visited = set()
                queue = deque()
                queue.append((node, [], 0))

                while queue:
                    current_node, path, hop_count = queue.popleft()

                    if current_node.subject in visited or hop_count >= max_hops:
                        continue

                    visited.add(current_node.subject)

                    for pred, objects in current_node.edges.items():
                        pred_score = util.cos_sim(
                            query_pred_embedding, current_node.predicate_embeddings[pred]).item()

                        for obj in objects:
                            results.append(self._result(
                                matched_subject, subject_score, pred, pred_score, obj,
                                path + [pred], hop_count + 1))

                        # Objects that are themselves subjects become the next hop.
                        for obj in objects:
                            if obj in graph:
                                queue.append((graph[obj], path + [pred], hop_count + 1))

        # Sort all results by total score descending
        results.sort(key=lambda x: -x['total_score'])

        threshold_met_results = [
            r for r in results
            if r['subject_score'] >= threshold and r['predicate_score'] >= threshold
        ]

        if threshold_met_results:
            return threshold_met_results
        elif results:  # Return top result even if below threshold
            return [results[0]]
        return []

    def search_month(self, month, year, subject, predicate_query,question_type, max_hops=4,threshold=None):
        """Run ``search`` for every day of ``month``/``year`` and concatenate the results."""
        if threshold is None:
            threshold = self.similarity_threshold

        _, num_days = calendar.monthrange(year, month)
        all_results = []

        for day in range(1, num_days + 1):
            date_str = f"{month:02d}/{day:02d}/{year}"
            daily_results = self.search(date_str, subject, predicate_query,question_type, max_hops,threshold)
            all_results.extend(daily_results)

        return all_results

    def print_graph(self, date):
        """Print every triplet stored for ``date`` (mm/dd/yyyy)."""
        graph = self._graph_for(date)

        if not graph:
            print("No news for this date.")
            return

        for subject, node in graph.items():
            print(f"Subject: {subject}")
            for predicate, objects in node.edges.items():
                for obj in objects:
                    print(f"  --[{predicate}]--> {obj}")

    def print_graphs_10_days(self, start_date):
        """Print the graphs of the ten calendar days starting at ``start_date``.

        The slot is looked up from each calendar date itself. Stepping the slot
        index by one per day would drift after any month shorter than 31 days,
        because ``date_to_value`` reserves 31 slots for every month.
        """
        base_date = datetime.strptime(start_date, "%m/%d/%Y")

        print(f"Graphs for 10 days starting from {start_date}")
        print("=" * 60)

        for day_offset in range(10):
            current_date = base_date + timedelta(days=day_offset)
            date_str = current_date.strftime("%m/%d/%Y")

            current_index = self._slot_index(date_str)
            if current_index is None:
                continue  # out of bounds

            graph = self.arr[current_index]

            print(f"Date: {date_str}")
            print("-" * 60)

            if graph is None:
                print("  No news for this date.")
                print("-" * 60)
                continue

            for subject, node in graph.items():
                print(f"  Subject: {subject}")
                for predicate, objects in node.edges.items():
                    for obj in objects:
                        print(f"    - [{predicate}] -> {obj}")
            print("-" * 60)

    def save(self, filepath):
        """Pickle this object (including its embedder) to ``filepath``."""
        with open(filepath, "wb") as f:
            pickle.dump(self, f)
        print(f"Hash object saved to {filepath}.")

    @staticmethod
    def load(filepath):
        """Unpickle and return the object stored at ``filepath``.

        SECURITY: unpickling can execute arbitrary code; only load files you
        created yourself or otherwise trust.
        """
        with open(filepath, "rb") as f:
            obj = pickle.load(f)
        print(f"Hash object loaded from {filepath}.")
        return obj

--------------

# If you have triplets and want to build the Manual Graph, insert your data here

Expected DF format  



Sequential_Date       |      triplets   



Note: `Sequential_Date` must be formatted as mm-dd-yyyy; it is converted to mm/dd/yyyy before being inserted into the graph.

                  

In [None]:
date_knowledge_graphs = {}
bad_indexes = []

# Graph that receives the triplets. Creating it here lets the cell run on a
# fresh kernel and keeps a re-run from inserting every row a second time.
sohailo = Hash("01/01/2000")

# Accepted spellings of Sequential_Date; both are stored as mm/dd/yyyy.
DATE_INPUT_FORMATS = ('%m-%d-%Y', '%m/%d/%Y')


def _to_graph_date(raw_date):
    """Return ``raw_date`` as an mm/dd/yyyy string, trying each accepted input format."""
    for fmt in DATE_INPUT_FORMATS:
        try:
            return datetime.strptime(raw_date, fmt).strftime('%m/%d/%Y')
        except ValueError:
            continue
    raise ValueError(f"Unrecognised date {raw_date!r}; expected mm-dd-yyyy or mm/dd/yyyy")


for index, row in triplets_with_dates.iterrows():
    # Reset per row so the error report below never shows a previous row's data.
    triplets = None
    try:
        # 1. Convert date format
        formatted_date = _to_graph_date(row['Sequential_Date'])

        # 2. Process triplets (stored either as a list or as its string repr)
        raw_triplets = row['triplets']
        if isinstance(raw_triplets, str):
            try:
                triplets = ast.literal_eval(raw_triplets.strip())
            except Exception as e:
                bad_indexes.append(index)
                print(f"Row {index}: Parse error - {str(e)}")
                continue
        else:
            triplets = raw_triplets

        # Validate triplets structure: a list of dicts carrying all three keys
        is_valid = (
            isinstance(triplets, list) and
            all(
                isinstance(t, dict) and
                {'subject', 'predicate', 'object'}.issubset(t.keys())
                for t in triplets
            )
        )

        if not is_valid:
            print(f"Row {index}: Invalid triplets format")
            print(f"Sample: {str(triplets)[:200]}")
            bad_indexes.append(index)
            continue

        # Create inverted triplets (subject and object swapped)
        inverted_triplets = [
            {
                'subject': triplet['object'],
                'predicate': triplet['predicate'],
                'object': triplet['subject']
            }
            for triplet in triplets
        ]

        # Only the inverted triplets are inserted into the graph.
        sohailo.insert_news(formatted_date, inverted_triplets)  # Inverted

    except Exception as e:
        print(f"\nCritical error in row {index}: {str(e)}")
        bad_indexes.append(index)
        if triplets is not None:
            print(f"Triplets type: {type(triplets)}")
            print(f"Triplets content: {str(triplets)[:200]}...")

print(f"\nSuccessfully processed {len(triplets_with_dates) - len(bad_indexes)} rows")
print(f"Failed to process {len(bad_indexes)} rows")

In [None]:
# Remove the rows recorded in bad_indexes. errors="ignore" keeps the cell
# re-runnable: on a second run those labels are already gone and drop() would
# otherwise raise KeyError.
triplets_with_dates = triplets_with_dates.drop(index=bad_indexes, errors="ignore")

In [None]:
# Write the frame (after the bad rows were dropped above) to a CSV file,
# using a path relative to the current working directory.
triplets_with_dates.to_csv("cleaned date data.csv")

# Save the graph to a .pkl file

In [None]:
# Save the graph that the build loop populated (`sohailo`). A freshly
# constructed Hash would be empty, so the pickle would contain no triplets.
kg2 = sohailo

In [None]:
# Pickle the whole Hash object to magdySquad.pkl (see Hash.save).
kg2.save("magdySquad.pkl") 

------

# If you already have your graph saved as a .pkl file, just load it; do not build and save again

In [38]:
# Location of the previously saved graph; change this to where your .pkl lives.
MANUAL_GRAPH_PATH = "C:\\Users\\ADMIN\\Downloads\\finalSquad_100.pkl"

# Hash.load is a staticmethod, so no throwaway Hash instance (and no extra
# embedding-model load) is needed to call it.
Manual_Graph = Hash.load(MANUAL_GRAPH_PATH)
kg2 = Manual_Graph  # alias kept for cells that still refer to kg2

Hash object loaded from C:\Users\ADMIN\Downloads\finalSquad_100.pkl.


In [7]:
# Sanity check: the object returned by load() should be a Hash instance.
type(Manual_Graph)

__main__.Hash

In [8]:
# Print the stored graphs for the ten days beginning 1/4/2018.
Manual_Graph.print_graphs_10_days('1/4/2018')

Graphs for 10 days starting from 1/4/2018
Date: 01/04/2018
------------------------------------------------------------
  Subject: individual and group identity
    - [is] -> psychologists
    - [is] -> sociologists
    - [is] -> anthropologists
  Subject: a structural representation of the individual's existential experience
    - [is] -> Weinreich's Identity Structure Analysis (ISA)
    - [is] -> Weinreich's Identity Structure Analysis
  Subject: organised in relatively stable structures over time
    - [are] -> the relationships
  Subject: in which self relates to other agents and institutions
    - [is] -> the socio-cultural milieu
  Subject: constructs drawn from the salient discourses of the individual, the group and cultural norms
    - [uses] -> the individual
  Subject: a methodology that maps how these are used by the individual
    - [provides] -> the practical operationalisation of ISA
  Subject: self and other agents and institutions
    - [appraises] -> the individual
   

---

# Dates need to be normalized, e.g. "Jan 1 2025" becomes 01/01/2025

In [9]:
# Month name (full or abbreviated) followed by a four-digit year,
# e.g. "March 2022", "Mar 2022", "Sept. 2021".
_MONTH_YEAR_PATTERN = re.compile(
    r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|'
    r'sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.?,?\s+(\d{4})',
    re.IGNORECASE,
)
# First three letters of each month, in calendar order (index + 1 == month number).
_MONTH_PREFIXES = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def normalize_date(date_text):
    """Normalize a free-form date mention to ``mm/dd/yyyy``.

    Returns a dict with:
        normalized_date: the ``mm/dd/yyyy`` string, or ``None`` on failure.
        type: 3 for a bare year (normalized to 01/01 of that year),
              1 for month + year (normalized to the 1st of that month),
              0 for anything else that could be parsed,
              5 when normalization failed.
        original: the stripped input (the raw input on failure).
        error: the exception text (only on failure).

    The function never raises; every error is reported through type 5.
    """
    try:
        original_text_for_output = date_text.strip()
        # Drop ordinal suffixes: "10th January" -> "10 January".
        cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', original_text_for_output)

        # Year only. No parser call is needed; constructing a datetime merely
        # validates the year (ValueError for an impossible one such as 0000).
        if re.fullmatch(r'\d{4}', cleaned):
            datetime(int(cleaned), 1, 1)
            return {"normalized_date": f"01/01/{cleaned}", "type": 3, "original": original_text_for_output}

        # Month + year, including abbreviated month names.
        month_year = _MONTH_YEAR_PATTERN.fullmatch(cleaned)
        if month_year:
            month = _MONTH_PREFIXES.index(month_year.group(1)[:3].lower()) + 1
            year = month_year.group(2)
            datetime(int(year), month, 1)  # validates the year
            return {"normalized_date": f"{month:02d}/01/{year}", "type": 1, "original": original_text_for_output}

        # Everything else: month-first fuzzy parse; components missing from the
        # text are taken from the 1900-01-01 default.
        dt = parser.parse(cleaned, fuzzy=True, dayfirst=False, default=datetime(1900, 1, 1))
        return {"normalized_date": dt.strftime("%m/%d/%Y"), "type": 0, "original": original_text_for_output}
    except Exception as e:
        # Deliberately broad: callers rely on a type-5 result instead of an exception.
        return {"normalized_date": None, "type": 5, "original": date_text, "error": str(e)}

In [10]:
if __name__ == '__main__':
    # Sample inputs covering each category normalize_date distinguishes.
    test_dates = [
        "2023",               # bare year
        "March 2022",         # month and year
        "10th January 2021",  # day-month-year with an ordinal suffix
        "05/15/2020",         # numeric, month first
        "15/05/2020",         # numeric, day first (15 cannot be a month)
        "2nd Feb 2019",       # abbreviated month with an ordinal suffix
        "July 4 2000",        # month-day-year
        "Invalid Date",       # not a date
        "Tomorrow",           # relative expression, no explicit date
        "1999-12-31",         # ISO format
        "01-02-2023",         # ambiguous; read month-first without a dayfirst hint
    ]

    print("Testing normalize_date function (outputting MM/DD/YYYY for full dates):")
    for date_str in test_dates:
        normalized = normalize_date(date_str)
        print(f"Original: '{date_str}' -> Normalized: {normalized}")

    print("\n--- Specific Test for DD/MM/YYYY style input with dayfirst=False (default) ---")
    # With dayfirst=False the first number is taken as the month whenever that
    # is possible; "14/03/2024" can only be read as March 14th.
    ambiguous_date_1 = "02/03/2024"  # expected 02/03/2024 (Feb 3rd)
    ambiguous_date_2 = "14/03/2024"  # expected 03/14/2024 (March 14th)

    for ambiguous in (ambiguous_date_1, ambiguous_date_2):
        print(f"Original: '{ambiguous}' -> Normalized: {normalize_date(ambiguous)}")


Testing normalize_date function (outputting MM/DD/YYYY for full dates):
Original: '2023' -> Normalized: {'normalized_date': '01/01/2023', 'type': 3, 'original': '2023'}
Original: 'March 2022' -> Normalized: {'normalized_date': '03/01/2022', 'type': 1, 'original': 'March 2022'}
Original: '10th January 2021' -> Normalized: {'normalized_date': '01/10/2021', 'type': 0, 'original': '10th January 2021'}
Original: '05/15/2020' -> Normalized: {'normalized_date': '05/15/2020', 'type': 0, 'original': '05/15/2020'}
Original: '15/05/2020' -> Normalized: {'normalized_date': '05/15/2020', 'type': 0, 'original': '15/05/2020'}
Original: '2nd Feb 2019' -> Normalized: {'normalized_date': '02/02/2019', 'type': 0, 'original': '2nd Feb 2019'}
Original: 'July 4 2000' -> Normalized: {'normalized_date': '07/04/2000', 'type': 0, 'original': 'July 4 2000'}
Original: 'Invalid Date' -> Normalized: {'normalized_date': None, 'type': 5, 'original': 'Invalid Date', 'error': 'String does not contain a date: Invalid Da

In [11]:
# Basic regex for detecting dates, numbers, and number words
DATE_REGEX = r"\b(\d{1,2}[ \-/])?(January|February|March|April|May|June|July|August|September|October|November|December)[ \-/]\d{2,4}\b|\b\d{4}\b|\b\d{1,4}[ ]?B\.?C\.?\b"
NUMERIC_REGEX = r"\b\d+\b|\bmillion\b|\bbillion\b|\bthousand\b|\bhundred\b|\bfew\b|\bseveral\b|\bone\b|\btwo\b|\bthree\b|\bfour\b|\bfive\b|\bsix\b|\bseven\b|\beight\b|\bnine\b|\bten\b"

# Combine both into one for checking subject or object
COMBINED_REGEX = f"({DATE_REGEX})|({NUMERIC_REGEX})"

# Compiled once and matched case-insensitively. Lower-casing the text instead
# would make the capitalised parts of DATE_REGEX (month names, "B.C.")
# unmatchable, so a value such as "500BC" would be missed.
_COMBINED_PATTERN = re.compile(COMBINED_REGEX, re.IGNORECASE)


def filter_triplets_with_dates_or_numbers(triplets):
    """Return the triplets whose subject or object mentions a date or a quantity.

    Args:
        triplets: iterable of dicts with at least 'subject' and 'object'.
            Values are converted with ``str()`` before matching.

    Returns:
        A new list with the matching triplet dicts, in their original order.
    """
    return [
        triplet
        for triplet in triplets
        if _COMBINED_PATTERN.search(str(triplet["subject"]))
        or _COMBINED_PATTERN.search(str(triplet["object"]))
    ]
# Demo: run the filter on a hand-written sample and print the triplets it keeps.
triplets = [
  {"subject": "crowd", "predicate": "jeer", "object": "Royal March"},
  {"subject": "jeering of Royal March", "predicate": "occur", "object": "14 June 1925"},
  {"subject": "jeering of Royal March", "predicate": "be", "object": "spontaneous reaction against Primo de Rivera's dictatorship"},
  {"subject": "ground", "predicate": "be closed for", "object": "six months"},
  {"subject": "Gamper", "predicate": "relinquish", "object": "presidency of the club"},
  {"subject": "event", "predicate": "coincide with", "object": "transition to professional football"},
  {"subject": "directors of Barcelona", "predicate": "claim", "object": "to operate a professional football club"},
  {"subject": "claim", "predicate": "occur", "object": "1926"},
  {"subject": "club", "predicate": "hold", "object": "testimonial match for Paulino Alcántara"},
  {"subject": "testimonial match", "predicate": "occur", "object": "3 July 1927"},
  {"subject": "opponent in testimonial match", "predicate": "be", "object": "Spanish national team"},
  {"subject": "Josep Canudas", "predicate": "drop", "object": "ball onto the pitch"},
  {"subject": "Josep Canudas", "predicate": "be", "object": "local journalist and pilot"},
  {"subject": "ball drop", "predicate": "occur from", "object": "airplane"},
  {"subject": "victory", "predicate": "occur", "object": "1928"},
  {"subject": "victory", "predicate": "be celebrated with", "object": "poem titled 'Oda a Platko'"},
  {"subject": "'Oda a Platko'", "predicate": "be written by", "object": "Rafael Alberti"},
  {"subject": "Rafael Alberti", "predicate": "be", "object": "member of Generation of '27"},
  {"subject": "poem", "predicate": "be inspired by", "object": "heroic performance of Barcelona goalkeeper Franz Platko"},
  {"subject": "Barcelona", "predicate": "win", "object": "inaugural Spanish League"},
  {"subject": "win", "predicate": "occur", "object": "23 June 1929"},
  {"subject": "Gamper", "predicate": "commit", "object": "suicide"},
  {"subject": "suicide", "predicate": "occur", "object": "30 July 1930"},
  {"subject": "depression", "predicate": "be caused by", "object": "personal and financial problems"},
  {"subject": "depression", "predicate": "precede", "object": "Gamper's suicide"}
]

# Only triplets whose subject or object contains a date or number should remain.
filtered = filter_triplets_with_dates_or_numbers(triplets)
for t in filtered:
    print(t)


{'subject': 'jeering of Royal March', 'predicate': 'occur', 'object': '14 June 1925'}
{'subject': 'ground', 'predicate': 'be closed for', 'object': 'six months'}
{'subject': 'claim', 'predicate': 'occur', 'object': '1926'}
{'subject': 'testimonial match', 'predicate': 'occur', 'object': '3 July 1927'}
{'subject': 'victory', 'predicate': 'occur', 'object': '1928'}
{'subject': 'Rafael Alberti', 'predicate': 'be', 'object': "member of Generation of '27"}
{'subject': 'win', 'predicate': 'occur', 'object': '23 June 1929'}
{'subject': 'suicide', 'predicate': 'occur', 'object': '30 July 1930'}


In [12]:
def classify_question(question: str) -> str:
    """Classify a question as "Person", "Date" or "Other" from its wording.

    Matching is case-insensitive and the rules are tried in this order:
      1. any standalone "who"                        -> "Person"
      2. when / what year(s) / on what day|date|year /
         how long / how many / how much              -> "Date"
      3. a question starting with "in <4 digits>" that does not ask
         "what incident|event|happened|caused"       -> "Date"
      4. everything else                             -> "Other"
    """
    q = question.lower()

    # Person-focused questions. Any "who ..." phrasing is covered by this one
    # test, so no separate check for "who wrote/founded/..." is needed.
    if re.search(r"\bwho\b", q):
        return "Person"

    # Date- and quantity-focused questions.
    if re.search(r"\b(on what (day|date|year)|what year|when|how long|how many|how much|what years)\b", q):
        return "Date"
    if re.match(r"^in \d{4}", q) and not re.search(r"\bwhat (incident|event|happened|caused)\b", q):
        return "Date"

    # Default fallback
    return "Other"

# Sample questions used to eyeball the classifier's output.
questions = [
    "Between what year's did Victorian aboriginal groups dispossessed?",
    'In 1844 how many Aborigines resident in squalid camps in Melbourne?',
    'Who appointed five Aboriginal Protectors for the Aborigines of Victoria in 1839?',
    'Who were the people that were power political and economic force in Victoria in 1845?',
    'How many Aborigines were said to be resident in squalid camps in Melbourne in January 1844?',
    'How many Aboriginal Protectors for the Aborigines of Victoria were appointed in 1839?',
    'By what year did fewer than 240 wealthy Europeans hold all the pastoral licenses?',
    'During what years were Victorian Aboriginal groups largely displaced from their land?',
]
for q in questions:
    label = classify_question(q)
    print(f"{q} -> {label}")

Between what year's did Victorian aboriginal groups dispossessed? -> Date
In 1844 how many Aborigines resident in squalid camps in Melbourne? -> Date
Who appointed five Aboriginal Protectors for the Aborigines of Victoria in 1839? -> Person
Who were the people that were power political and economic force in Victoria in 1845? -> Person
How many Aborigines were said to be resident in squalid camps in Melbourne in January 1844? -> Date
How many Aboriginal Protectors for the Aborigines of Victoria were appointed in 1839? -> Date
By what year did fewer than 240 wealthy Europeans hold all the pastoral licenses? -> Date
During what years were Victorian Aboriginal groups largely displaced from their land? -> Date


# First step: extract the entities and dates from the user's question

In [13]:
def extract_entities_dates_predicate_groq(question):
    """Ask the Groq chat-completions API for the entities, dates and predicate of a question.

    Args:
        question (str): The user's natural-language question.

    Returns:
        dict: A dictionary with three keys:
            * "entities": list of non-empty, stripped strings (may be empty).
            * "dates": non-empty list holding whatever ``normalize_date`` returned
              for each extracted date phrase; when nothing usable was extracted it
              holds the single placeholder
              ``{"normalized_date": None, "type": 4, "original": None}``.
            * "predicate": the model's "predicate" value, or None.

        Any failure inside the request/parse step (network error, timeout, HTTP
        error status, malformed JSON, unexpected payload shape) is reported as
        empty entities, the placeholder date and a None predicate instead of
        being raised.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    # Placeholder used whenever no date could be extracted or normalised.
    no_date = {"normalized_date": None, "type": 4, "original": None}

    # Updated system prompt for clarity on 'entities' being a list of strings
    system_prompt_content = """You are an expert at extracting structured information. Your goal is to populate a JSON object with 'entities', 'dates', and a 'predicate' based on the user's question.

Follow these rules precisely:
1.  **Identify Subjects for "entities"**:
    * Mentally break down the question to identify the main subject(s) of the core action or description. These subjects will be your "entities".
    * Subjects can be proper nouns (e.g., "Obama", "Paris"), or general noun phrases (e.g., "the man", "the red car", "companies", "the project manager").
    * If a question asks about "who" or "what" performed an action and the actor/subject is specified in the question, list that actor/subject.
    * If the subject is implicit (e.g., in "what happened?"), or if the question asks "who" or "what" and the subject is the answer being sought, the "entities" list should typically be empty.
2.  **Extract "dates"**:
    * Extract ALL date mentions in any format (e.g., "2nd March 2020", "May 2012", "2020", "yesterday", "last Monday", "first day", "Q2 2023").
    * Preserve original textual representation of dates, including ordinals (e.g., "2nd", "24th", "1st"). This should be a list of strings.
3.  **Determine "predicate"**:
    * Identify the main verb or action phrase that describes the central event or the relationship involving the primary subject(s). This will be the "predicate". This should be a single string.
    * For questions about roles or states (e.g., "who is the CEO of X"), the predicate might be a phrase like "is CEO of" or simply "is".
4.  **Output Format**:
    * Return a single, valid JSON object.
    * The JSON object must have exactly these three keys:
        * "entities": A list of strings. This field **must always be a list**, even if it contains only one subject (e.g., `["subject1"]`) or is empty (e.g., `[]`). Each string in the list should be a distinct, non-empty entity/subject.
        * "dates": A list of strings. Each string should be an extracted date phrase.
        * "predicate": A single string (or null if not clearly applicable).
"""

    user_prompt_content = f"""Extract from this question: entities, dates, and predicate.
Examples:
"what happened on 2nd March 2020?" → {{"entities": [], "dates": ["2nd March 2020"], "predicate": "happened"}}
"what did Obama say in May 2012?" → {{"entities": ["Obama"], "dates": ["May 2012"], "predicate": "say"}}
"What tasks did the project manager assign last Monday?" → {{"entities": ["the project manager"], "dates": ["last Monday"], "predicate": "assign"}}
"When was the old bridge closed for repairs?" → {{"entities": ["the old bridge"], "dates": ["When"], "predicate": "closed for repairs"}} 

Question: "{question}"
"""

    payload = {
        "model": "llama-3.1-8b-instant",
        "messages": [
            {
                "role": "system",
                "content": system_prompt_content
            },
            {
                "role": "user",
                "content": user_prompt_content
            }
        ],
        "temperature": 0,
        "max_tokens": 400,
        "response_format": {"type": "json_object"}
    }

    try:
        # A timeout keeps a stalled connection from blocking the notebook forever;
        # a timeout error is handled like any other failure below.
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        data = response.json()
        content = data['choices'][0]['message']['content']

        try:
            extracted = json.loads(content)
        except json.JSONDecodeError:
            # Fall back to the outermost {...} span when the model wrapped the JSON in extra text.
            match = re.search(r'\{.*\}', content, re.DOTALL)
            if match:
                extracted = json.loads(match.group())
            else:
                raise ValueError("No valid JSON found in response")

        if not isinstance(extracted, dict):
            raise ValueError("Response JSON is not an object")

        # Robust handling for 'entities' to ensure it's a list of non-empty strings
        raw_entities_value = extracted.get("entities")
        if isinstance(raw_entities_value, str):
            raw_entities_value = [raw_entities_value]
        elif not isinstance(raw_entities_value, list):
            raw_entities_value = []
        final_entities = [
            item.strip()
            for item in raw_entities_value
            if isinstance(item, str) and item.strip()
        ]

        # Same robustness for 'dates': a bare string must be treated as ONE date
        # phrase, not iterated character by character.
        raw_dates_value = extracted.get("dates")
        if isinstance(raw_dates_value, str):
            raw_dates_value = [raw_dates_value]
        elif not isinstance(raw_dates_value, list):
            raw_dates_value = []

        final_dates = []
        for date_text in raw_dates_value:
            if date_text:
                # Named `normalized` (not `norm`) so numpy.linalg.norm is not shadowed.
                normalized = normalize_date(str(date_text))
                if normalized:
                    final_dates.append(normalized)

        if not final_dates:
            final_dates.append(no_date)

        return {
            "entities": final_entities,
            "dates": final_dates,
            "predicate": extracted.get("predicate", None)
        }

    except Exception:
        # Deliberate best-effort: callers receive the "nothing extracted" result
        # instead of an exception.
        return {
            "entities": [],
            "dates": [no_date],
            "predicate": None
        }

---

# Defining the basic Neo4j search methods needed in test cases 3 and 4

In [14]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Both arguments are converted with ``np.array``. A constant 1e-8 is added
    to the product of the norms so the division stays defined when either
    vector has zero length (the result is then 0 rather than NaN).
    """
    vec_a, vec_b = np.array(a), np.array(b)
    denominator = norm(vec_a) * norm(vec_b) + 1e-8  # Prevent division by zero
    return np.dot(vec_a, vec_b) / denominator


def normalize_text(text):
    """Return ``text`` with accents (combining marks) removed.

    The string is NFKD-decomposed and every character that
    ``unicodedata.combining`` reports as a combining mark is dropped.
    Characters that are not combining marks are kept as they are.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))


def extract_entities_llm(query):
    """Ask the LLM for the key entity/keyword names in ``query`` and clean up its reply.

    Args:
        query (str): The user's question.

    Returns:
        str: The model's reply with parenthesised tags, leading ``*`` bullets
        and quote characters removed and whitespace collapsed. An empty string
        is returned when ``query_groq_llm`` does not hand back text.
    """
    prompt = (
        "Extract only the most relevant entity or keyword from the following question.\n"
        "Return just the name(s), without any extra text or explanation.\n\n"
        f"Question: {query}\n\nEntities:"
    )
    raw_entities = query_groq_llm(prompt)
    if not isinstance(raw_entities, str):
        # query_groq_llm returns the parsed response body (not a string) when the
        # reply carries no completion; calling .strip() on that would raise.
        return ""

    # Clean output: remove (tags), bullets, quotes, and extra whitespace
    cleaned = re.sub(r"\s*\([^)]*\)", "", raw_entities.strip())       # Remove (type)
    # MULTILINE so a bullet is stripped from every line, not only the first one.
    cleaned = re.sub(r"^\s*\*+", "", cleaned, flags=re.MULTILINE)     # Remove bullets like "* "
    cleaned = re.sub(r"[\"'`]", "", cleaned)                          # Remove quotes
    cleaned = re.sub(r"\s+", " ", cleaned).strip()                    # Normalize spaces
    return cleaned




def query_groq_llm(prompt):
    """Send ``prompt`` as a single user message to the Groq chat-completions API.

    Args:
        prompt (str): Text sent as the only message of the conversation.

    Returns:
        str | dict: The content of the first choice's message when the response
        contains one; otherwise the parsed JSON body as received, so the caller
        can inspect it.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=60,
    )
    response_json = response.json()  # parsed once and reused below

    try:
        return response_json["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # No completion in the body (missing key, empty "choices", unexpected shape):
        # hand the raw payload back, as before.
        return response_json


def answer_query(query):
    """Answer ``query`` with the LLM, grounded on triplets retrieved from Neo4j.

    Steps:
        1. Get entity names from ``extract_entities_llm`` (a comma-separated
           string) and strip accents from each with ``normalize_text``.
        2. Embed every entity with the module-level ``embedder``.
        3. Fetch every ``(s)-[r]->(o)`` pattern whose two nodes both carry an
           ``embedding`` property.
        4. Keep the triplets whose subject or object embedding has a cosine
           similarity above 0.6 with at least one entity embedding.
        5. Ask the LLM to answer using only those triplets.

    Args:
        query (str): The user's question.

    Returns:
        Whatever ``query_groq_llm`` returns for the final prompt.
    """
    # 1. Extract and normalise the entity names
    raw_entities = extract_entities_llm(query)
    entities = [normalize_text(e.strip()) for e in raw_entities.split(",") if e.strip()]
    #print(f"[DEBUG] Extracted Entities: {entities}")

    # 2. Generate embeddings for each entity
    entity_embeddings = [embedder.encode(e).tolist() for e in entities]

    # 3. Fetch all candidate triplets with their embeddings.
    #    Skipped when there is no entity to compare against: nothing could match anyway.
    candidates = []
    if entities:
        with driver.session() as session:
            candidates = session.run(
                """
            MATCH (s)-[r]->(o)
            WHERE s.embedding IS NOT NULL AND o.embedding IS NOT NULL
            RETURN s.name AS subject, 
                   r.type AS predicate, 
                   o.name AS object,
                   s.embedding AS subj_emb,
                   o.embedding AS obj_emb ,
                   r.date  AS  date
            """
            ).data()
    #print(f"candidates length --> {len(candidates)}")

    # 4. Calculate similarities in Python; the set removes duplicate triplets
    all_triplets = set()
    for emb in entity_embeddings:
        for record in candidates:
            try:
                subj_sim = cosine_similarity(emb, record["subj_emb"])
                obj_sim = cosine_similarity(emb, record["obj_emb"])

                if subj_sim > 0.6 or obj_sim > 0.6:  # Similarity threshold
                    triplet = f"({record['subject']}) -[{record['predicate']}]-> ({record['object']} Date:{record['date']})"
                    all_triplets.add(triplet)
            except Exception as e:
                print(f"Error processing record: {str(e)}")
                continue

    # 5. Prepare context. Sorted so the prompt is the same on every run for the
    #    same graph (set iteration order is not stable between kernel sessions).
    if not all_triplets:
        triplet_str = "No relevant triplets found."
    else:
        triplet_str = "\n".join(sorted(all_triplets))

    #print(f"[DEBUG] Context Triplets:\n{triplet_str}")

    final_prompt = f"""Answer this query: {query}
    Using ONLY the following verified information from our knowledge graph:
    {triplet_str}
    
    If no relevant information exists, respond "I don't have verified information about this."
    """

    return query_groq_llm(final_prompt)

# Quick test of the Neo4j connection to check that it works without problems

In [53]:
user_query = "what happend between Palestines and Israel ?"
answer_query(user_query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'A challenging query!\n\nAfter carefully searching through the provided knowledge graph, I was able to extract some information related to the query "what happened between Palestinians and Israel". Here are the relevant points:\n\n1. **Israel and Palestinian territories share land borders**: (The country) -[shares]-> (land borders with the Palestinian territories Date:None)\n\n2. **Palestinian territories are partially controlled by Israel**: (Palestinian territories) -[are partially controlled by]-> (Israel Date:None)\n\n3. **Palestinian territories are claimed by the State of Palestine**: (Palestinian territories) -[are claimed by]-> (State of Palestine Date:None)\n\n4. **Jerusalem is a recognized capital by Israel, but internationally unrecognized**: (Jerusalem) -[is]-> (self-designated capital of Israel Date:None), (Israeli sovereignty over Jerusalem) -[is]-> (internationally unrecognized Date:None)\n\n5. **Israel is a sovereign state in Western Asia**: (Israel) -[is]-> (a sovereig

----------

# The Main Method that is going to handle all the 4 cases 

1.  Case 1: "What happened on 1-1-2025?" (full date) --> use the manual graph search strategy
2.  Case 2: "What happened in Jan 2025?" (month and year) --> use the manual graph search strategy, but we have to loop over all the days in that month
3.  Case 3: "What happened in 2025?" (year only) --> use Neo4j semantic search
4.  Case 4: "What happened?" (no date) --> use Neo4j semantic search

---

In [35]:
def answer_user_question(question, kg):
    """Answer a question, choosing the retrieval strategy from the kind of date it mentions.

    The first extracted date's ``type`` selects the route:
        * 0 (full date)  -> ``kg.search`` on that date, best match used.
        * 1 (month/year) -> ``kg.search_month`` for that month, first result used.
        * anything else (e.g. 3 = year only, 4 = no date) -> ``answer_query``
          (Neo4j semantic search).

    Args:
        question (str): The user's question.
        kg: Graph object exposing ``search`` and ``search_month``.

    Returns:
        str for the graph routes (an LLM-phrased answer or a "Sorry, ..."
        message); for the semantic route, whatever ``answer_query`` returns.
    """
    # --- Step 1: Extract structured info
    question_type = classify_question(question)
    print(question_type)
    extracted = extract_entities_dates_predicate_groq(question)
    # NOTE(review): the extractor pads "dates" with a placeholder, so in practice this
    # guard rejects questions with no extracted entity, even date-less ones that
    # the semantic search could handle - confirm that this is intended.
    if not extracted["entities"] or not extracted["dates"]:
        return "Sorry, I couldn't understand the question."

    subject = extracted["entities"][0]
    predicate = extracted["predicate"]
    date_info = extracted["dates"][0]
    print("subject ",subject)
    print("verb ",predicate)
    print("date ",date_info)

    # --- Step 2: Determine which case we are going to answer (e.g. 1-2-3-4)
    date_type = date_info['type']
    if date_type == 0:
        print(date_info["normalized_date"])
        matches = kg.search(date_info["normalized_date"], subject, predicate, question_type)
        print("triplet: ",matches)
        if not matches:  # covers both an empty list and None
            return "Sorry, I couldn't find the answer in the news data."
        print(matches[:5])
        triplet = matches[0]
    elif date_type == 1:
        # normalized_date is MM/DD/YYYY; the day component is not needed here
        month, _, year = date_info['normalized_date'].split('/')
        month_result = kg.search_month(int(month), int(year), subject, predicate, question_type)
        # Guard before indexing: an empty result must not raise IndexError.
        triplet = month_result[0] if month_result else None
        print(triplet)
        if triplet is None:
            return "Sorry, I couldn't find the answer in the news data."
    else:
        # Year-only and date-less questions both go to the Neo4j semantic search.
        return answer_query(question)

    # --- Step 3: Use LLM to generate the final user answer
    return generate_answer_from_triplet(question, triplet)

def generate_answer_from_triplet(user_question, triplet):
    """Turn one knowledge-graph triplet into a short natural-language sentence via the LLM.

    Args:
        user_question (str): The original question. It is not included in the
            prompt; the parameter is kept so existing callers keep working.
        triplet: Mapping with the keys 'subject', 'predicate' and 'object'.

    Returns:
        str: The content of the first completion choice.

    Raises:
        KeyError: If ``triplet`` lacks one of the three keys, or the response
            body has no completion.
        requests.HTTPError: If the API answers with an error status.
    """
    subject, predicate, obj = triplet['subject'] , triplet['predicate'] , triplet['object']

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        # Shared notebook-level model name (same value as the literal used before).
        "model": MODEL,
        "messages": [
            {
                "role": "system",
                "content": "You are an assistant that ONLY generates short direct answers based on the given fact."
            },
            {
                "role": "user",
                "content": f"""
Using the information:

Subject: {subject}
Predicate: {predicate}
Object: {obj}

Write a very short natural sentence. Do NOT add explanations, do NOT say you are sorry. Only say the main information.

Example format:
"[subject] [predicate] [object]"

Answer:
"""
            }
        ],
        "temperature": 0.2,
        "max_tokens": 100
    }

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    data = response.json()

    return data['choices'][0]['message']['content']

----------

# We are going to test the 4 cases on 3 questions; their ground-truth (GT) answers are, respectively:

1. Poultry is the second most widely eaten type of meat globally



2. allowing people to self-identify as more than one ethnicity


3. subjects had more difficulty recalling collections of letters that were acoustically similar

----

# Case 1  MM/DD/YYYY

In [20]:
print("Predicted anwer using temporal graph rag : ",answer_user_question("How popular is poultry as a consumable among humans in 16 March 2018 ?",Manual_Graph))
print("ground truth answer : Poultry is the second most widely eaten type of meat globally")

Other
subject  poultry
verb  is
date  {'normalized_date': '03/16/2018', 'type': 0, 'original': '16 March 2018'}
03/16/2018
triplet:  [{'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': None, 'hops': 0, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': None, 'hops': 0, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': ['is'], 'hops': 1, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'Poultry', 'path': ['is', 'is'], 'hops': 2, 'total_score':

In [22]:
print("Predicted anwer using temporal graph rag : ",answer_user_question("How had the Census Bureau changed its collection of data in 3 March 2018 ?",Manual_Graph))
print("ground truth answer : The Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity")

Other
subject  the Census Bureau
verb  changed its collection of data
date  {'normalized_date': '03/03/2018', 'type': 0, 'original': '3 March 2018'}
03/03/2018
triplet:  [{'subject': 'Census Bureau', 'subject_score': np.float32(0.97950697), 'predicate': 'changed', 'predicate_score': 0.4662553071975708, 'object': 'its data collection', 'path': None, 'hops': 0, 'total_score': np.float32(1.4457623)}]
[{'subject': 'Census Bureau', 'subject_score': np.float32(0.97950697), 'predicate': 'changed', 'predicate_score': 0.4662553071975708, 'object': 'its data collection', 'path': None, 'hops': 0, 'total_score': np.float32(1.4457623)}]
Predicted anwer using temporal graph rag :  The Census Bureau changed its data collection.
ground truth answer : The Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity


In [23]:
print("Predicted anwer using temporal graph rag : ",answer_user_question('What did conrad find about test subjects in 2 March 2018?',Manual_Graph))
print("ground truth answer : subjects had more difficulty recalling collections of letters that were acoustically similar")

Other
subject  conrad
verb  find
date  {'normalized_date': '03/02/2018', 'type': 0, 'original': '2 March 2018'}
03/02/2018
triplet:  [{'subject': 'Conrad (1964)', 'subject_score': np.float32(0.82585865), 'predicate': 'found', 'predicate_score': 0.642058789730072, 'object': 'test subjects had more difficulty recalling collections of letters that were acoustically similar', 'path': None, 'hops': 0, 'total_score': np.float32(1.4679174)}, {'subject': 'Conrad (1964)', 'subject_score': np.float32(0.82585865), 'predicate': 'found', 'predicate_score': 0.642058789730072, 'object': 'test subjects had more difficulty recalling collections of letters that were acoustically similar', 'path': ['found'], 'hops': 1, 'total_score': np.float32(1.4679174)}, {'subject': 'Conrad (1964)', 'subject_score': np.float32(0.82585865), 'predicate': 'found', 'predicate_score': 0.642058789730072, 'object': 'Conrad (1964)', 'path': ['found', 'found'], 'hops': 2, 'total_score': np.float32(1.4679174)}]
[{'subject': 'Co

------

# Case 2: month/year — we need to loop over all the days of the month and call the search function for each (months have between 28 and 31 days)

In [None]:
print("Predicted anwer using temporal graph rag : ",answer_user_question("How popular is poultry as a consumable among humans in March 2018 ?",Manual_Graph))
print("ground truth answer : Poultry is the second most widely eaten type of meat globally")

Other
subject  poultry
verb  is
date  {'normalized_date': '03/16/2018', 'type': 0, 'original': '16 March 2018'}
03/16/2018
triplet:  [{'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': None, 'hops': 0, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': None, 'hops': 0, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'second most widely eaten type of meat globally', 'path': ['is'], 'hops': 1, 'total_score': np.float32(1.9999999)}, {'subject': 'Poultry', 'subject_score': np.float32(0.9999999), 'predicate': 'is', 'predicate_score': 1.0, 'object': 'Poultry', 'path': ['is', 'is'], 'hops': 2, 'total_score':

In [None]:
print("Predicted anwer using temporal graph rag : ",answer_user_question("How had the Census Bureau changed its collection of data in March 2018 ?",Manual_Graph))
print("ground truth answer : The Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity")

Other
subject  the Census Bureau
verb  changed its collection of data
date  {'normalized_date': '03/03/2018', 'type': 0, 'original': '3 March 2018'}
03/03/2018
triplet:  [{'subject': 'Census Bureau', 'subject_score': np.float32(0.97950697), 'predicate': 'changed', 'predicate_score': 0.4662553071975708, 'object': 'its data collection', 'path': None, 'hops': 0, 'total_score': np.float32(1.4457623)}]
[{'subject': 'Census Bureau', 'subject_score': np.float32(0.97950697), 'predicate': 'changed', 'predicate_score': 0.4662553071975708, 'object': 'its data collection', 'path': None, 'hops': 0, 'total_score': np.float32(1.4457623)}]
Predicted anwer using temporal graph rag :  The Census Bureau changed its data collection.
ground truth answer : The Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity


In [44]:
print("Predicted anwer using temporal graph rag : ",answer_user_question('What did conrad find about test subjects in March 2018?',Manual_Graph))
print("ground truth answer : subjects had more difficulty recalling collections of letters that were acoustically similar")

Other
subject  conrad
verb  find
date  {'normalized_date': '03/01/2018', 'type': 1, 'original': 'March 2018'}
{'subject': 'Conrad (1964)', 'subject_score': np.float32(0.82585865), 'predicate': 'found', 'predicate_score': 0.642058789730072, 'object': 'test subjects had more difficulty recalling collections of letters that were acoustically similar', 'path': None, 'hops': 0, 'total_score': np.float32(1.4679174)}
Predicted anwer using temporal graph rag :  Conrad found test subjects had more difficulty recalling collections of letters that were acoustically similar.
ground truth answer : subjects had more difficulty recalling collections of letters that were acoustically similar


---

# Case 3 just year  --> do semantic search using neo4j

In [60]:
answer_user_question("How popular is poultry as a consumable among humans in 2018?",Manual_Graph)

Other
subject  poultry
verb  is popular
date  {'normalized_date': '01/01/2018', 'type': 3, 'original': '2018'}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'According to the provided verified information, poultry is the second most widely eaten type of meat globally, implying a high level of consumption among humans. No specific data or statistics for 2018 are available in the provided knowledge graph.'

In [61]:
answer_user_question("How had the Census Bureau changed its collection of data in 2018?",Manual_Graph)

Other
subject  the Census Bureau
verb  changed its collection of data
date  {'normalized_date': '01/01/2018', 'type': 3, 'original': '2018'}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'According to the provided knowledge graph, the answer to the query is:\n\nThe Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity.\n\nThis information is verified and extracted from the knowledge graph.'

In [64]:
answer_user_question("What did conrad find about test subjects in 2018?",Manual_Graph)

Other
subject  conrad
verb  find
date  {'normalized_date': '01/01/2018', 'type': 3, 'original': '2018'}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

"I don't have verified information about this.\n\nThe provided knowledge graph only contains information about a 1964 study, where Conrad found that test subjects had more difficulty recalling collections of letters that were acoustically similar. There is no information about Conrad's findings in 2018."

---

# Case 4 no date at all 

In [65]:
answer_user_question("How popular is poultry as a consumable among humans?",Manual_Graph)

Other
subject  poultry
verb  is
date  {'normalized_date': None, 'type': 4, 'original': None}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'According to our verified knowledge graph, poultry is the "second most widely eaten type of meat globally". This suggests that poultry is a very popular consumable among humans.'

In [66]:
answer_user_question("How had the Census Bureau changed its collection of data?",Manual_Graph)

Other
subject  the Census Bureau
verb  changed its collection of data
date  {'normalized_date': None, 'type': 4, 'original': None}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Based on the provided knowledge graph, the Census Bureau has changed its collection of data in the following ways:\n\n* The Census Bureau changed its data collection by allowing people to self-identify as more than one ethnicity.\n* The Census 2000 asked about race differently than previous censuses.\n* The Census Bureau included more than a dozen ethnic/racial categories on the census by 1990.\n* The Census Bureau changed its data collection, and as a result, the data from the Census 2000 is not directly comparable with data from the 1990 census or earlier censuses.\n\nThese changes reflect changing social ideas about ethnicity and a wide variety of immigrants who came to the United States.'

In [67]:
answer_user_question("What did conrad find about test subjects?",Manual_Graph)

Other
subject  conrad
verb  find about
date  {'normalized_date': None, 'type': 4, 'original': None}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'According to the provided knowledge graph, Conrad found that test subjects had more difficulty recalling collections of letters that were acoustically similar.'

------

# Calculating Metrics 

In [None]:
def evaluate_rag(dataset_path, answer_query_func, kg=None, max_questions=41,
                 delay_seconds=2, output_path="qa_results.csv"):
    """
    Evaluates a RAG model using a given dataset.

    Args:
        dataset_path (str): Path to the CSV dataset file. It must contain either
            'question'/'answer' or 'Question'/'Answer' columns.
        answer_query_func (function): Called as ``answer_query_func(question, kg)``;
            its result is converted to a string and used as the generated answer.
        kg: Graph object passed as the second argument to ``answer_query_func``.
            When None, the notebook-level variable ``sohailo`` is used if it
            exists (the name this function read before the parameter was added);
            otherwise None is passed through.
        max_questions (int | None): Maximum number of rows evaluated, taken from
            the top of the file. None evaluates every row. The default of 41
            matches the previous hard-coded cap.
        delay_seconds (float): Pause before each call to ``answer_query_func``.
        output_path (str): CSV file that receives the question / generated answer /
            ground truth table.

    Returns:
        dict: A dictionary containing the computed evaluation scores
        ("Average BLEU-4" and "Average ROUGE-L (F-measure)"), or None when the
        dataset cannot be found or lacks the expected columns.
    """
    print(f"Loading dataset: {dataset_path}")
    try:
        dataset = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: Dataset not found at {dataset_path}")
        return None

    # Determine column names dynamically
    for question_col, answer_col in (("question", "answer"), ("Question", "Answer")):
        if question_col in dataset.columns and answer_col in dataset.columns:
            break
    else:
        print(f"Error: Could not find 'question'/'answer' or 'Question'/'Answer' columns in {dataset_path}")
        print(f"Available columns: {dataset.columns.tolist()}")
        return None

    if kg is None:
        # Backward compatibility with callers that rely on the notebook-level graph.
        kg = globals().get("sohailo")

    subset = dataset if max_questions is None else dataset.head(max_questions)
    print(f"Processing {len(subset)} of {len(dataset)} questions...")

    questions = []
    ground_truth_answers = []
    generated_answers = []
    # Iterate through the dataset and generate answers
    for question, ground_truth in zip(subset[question_col], subset[answer_col]):
        questions.append(question)
        ground_truth_answers.append(str(ground_truth))  # Ensure ground truth is string

        try:
            time.sleep(delay_seconds)
            generated_answers.append(str(answer_query_func(question, kg)))  # Ensure generated answer is string
        except Exception as e:
            # Best effort: a failed question is scored as an empty answer.
            # str() keeps this message from failing on a non-string question (e.g. NaN).
            print(f"Error generating answer for question: {str(question)[:50]}... - {e}")
            generated_answers.append("")  # Append empty string on error

    # Save the raw question/answer pairs so individual results can be inspected
    results_table = pd.DataFrame({
        "Question": questions,
        "Generated Answer": generated_answers,
        "Ground Truth": ground_truth_answers
    })
    results_table.to_csv(output_path, index=False)
    print(f"Saved to {output_path}")

    # --- Compute Evaluation Metrics ---

    # BLEU-4 Score
    # BLEU requires tokenized sentences
    # Smoothing function is used to handle cases where n-grams are not found
    smoothie = SmoothingFunction().method4
    bleu4_scores = [
        sentence_bleu(
            [nltk.word_tokenize(ref)],  # BLEU expects a list of references (even if only one)
            nltk.word_tokenize(gen),
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=smoothie,
        )
        for ref, gen in zip(ground_truth_answers, generated_answers)
    ]
    average_bleu4 = sum(bleu4_scores) / len(bleu4_scores) if bleu4_scores else 0

    # ROUGE-L Score
    # ROUGE expects strings
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL_scores = [
        scorer.score(ref, gen)['rougeL'].fmeasure  # Using f-measure for ROUGE-L
        for ref, gen in zip(ground_truth_answers, generated_answers)
    ]
    average_rougeL = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0

    # You can add other metrics here if needed

    return {
        "Average BLEU-4": average_bleu4,
        "Average ROUGE-L (F-measure)": average_rougeL,
    }

In [None]:
# Evaluate the pipeline on the first QA dataset and print each averaged metric.
dataset_path_1 = '/kaggle/input/sohail-test-data/all_qa.csv'
print("--- Evaluating Dataset 1 ---")
results_dataset1 = evaluate_rag(dataset_path_1, answer_user_question)
# The messages reuse dataset_path_1 so they cannot drift from the path actually evaluated.
if results_dataset1:
    print(f"\n--- Results for {dataset_path_1} ---")
    for metric, score in results_dataset1.items():
        print(f"{metric}: {score:.4f}")
else:
    print(f"\nEvaluation failed for {dataset_path_1}")

In [None]:
# Evaluate the pipeline on the second QA dataset and print each averaged metric.
dataset_path_2 = '/kaggle/input/yara-qa/test_data_2.csv'
print("\n--- Evaluating Dataset 2 ---")
results_dataset2 = evaluate_rag(dataset_path_2, answer_user_question)

# The messages reuse dataset_path_2 so they cannot drift from the path actually evaluated.
if results_dataset2:
    print(f"\n--- Results for {dataset_path_2} ---")
    for metric, score in results_dataset2.items():
        print(f"{metric}: {score:.4f}")
else:
    print(f"\nEvaluation failed for {dataset_path_2}")

------