In [None]:
# Install required packages
!pip install -q pymupdf scikit-learn gtts indic-nlp-library sentence-transformers

# Import libraries
import fitz  # PyMuPDF
import re
import os
from gtts import gTTS
from IPython.display import Audio, display, HTML
from indicnlp import loader
from indicnlp.tokenize import sentence_tokenize
from sentence_transformers import SentenceTransformer, util

# Setup Indic NLP: make sure a local checkout of indic_nlp_resources exists,
# publish its location through the INDIC_RESOURCES_PATH environment variable,
# then run the library loader.
INDIC_RESOURCES_PATH = "/content/indic_nlp_resources"
if not os.path.exists(INDIC_RESOURCES_PATH):
    # Notebook shell escape; the clone only runs when the directory is missing.
    !git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git {INDIC_RESOURCES_PATH}
# Presumably loader.load() picks the path up from this variable — TODO confirm
# against the indic-nlp-library version in use.
os.environ['INDIC_RESOURCES_PATH'] = INDIC_RESOURCES_PATH
loader.load()

# Helper functions
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string: the ``get_text()`` output of each page joined with
        no separator.
    """
    # Open as a context manager so the document is closed even if text
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def preprocess_text(text):
    """Reduce *text* to Devanagari content with normalised spacing.

    Every character outside the Devanagari block (U+0900-U+097F), whitespace,
    ``.``, ``,`` and ``-`` is removed. Whitespace runs are then collapsed to a
    single space and the ends are stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty when nothing allowed remains.
    """
    # Drop disallowed characters first: removing a token that sat between two
    # spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\u0900-\u097F\s\.,\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def split_sentences(text, lang='mr'):
    """Split *text* into sentences using the Indic NLP sentence tokenizer.

    Args:
        text: The string to split.
        lang: Language code forwarded to the tokenizer (defaults to ``'mr'``).

    Returns:
        Whatever ``sentence_tokenize.sentence_split`` returns for the input.
    """
    pieces = sentence_tokenize.sentence_split(text, lang)
    return pieces

def answer_query(query, threshold=0.4):
    """Return the corpus sentence most similar to *query*.

    Reads the module-level ``model``, ``sentences`` and
    ``sentence_embeddings`` objects, which must already be defined.

    Args:
        query: The user's question as raw text; it is cleaned with
            ``preprocess_text`` before encoding.
        threshold: Minimum cosine similarity the best match must reach.
            Defaults to 0.4.

    Returns:
        The best-matching sentence with whitespace normalised, or a fixed
        Marathi fallback message when the cleaned query is empty, the corpus
        is empty, or the best score is below *threshold*.
    """
    query = preprocess_text(query)
    if not query:
        # NOTE(review): this wording reads like "please think validly" rather
        # than "please ask a valid question" — confirm the intended message.
        return "कृपया वैध विचार करा."
    if not sentences:
        # With an empty corpus there are no scores to take an argmax over.
        return "माहिती सापडली नाही."
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
    best_idx = cos_scores.argmax().item()
    if cos_scores[best_idx].item() < threshold:
        return "माहिती सापडली नाही."
    response = sentences[best_idx]
    # NOTE(review): the pattern contains a non-Devanagari glyph that
    # preprocess_text removes, so it looks unable to match sentences built
    # from preprocessed text — confirm whether it is still needed.
    response = re.sub(r'अ\. 􀀗 \.', '', response)
    response = re.sub(r'\s+', ' ', response).strip()
    return response

def speak_marathi(text, filename="/content/response.mp3"):
    """Synthesise *text* as Marathi speech and display it in the notebook.

    The audio is saved to *filename*, then a download link and an
    auto-playing audio widget are displayed.

    Args:
        text: Text to speak. Blank or whitespace-only text is ignored.
        filename: Where the MP3 is written. Defaults to
            ``/content/response.mp3``.

    Returns:
        None.
    """
    from html import escape  # local import: only needed for the link markup

    # gTTS refuses empty text, so skip synthesis instead of crashing the
    # caller's loop.
    if not text or not text.strip():
        return

    # Convert text to speech (TTS)
    tts = gTTS(text=text, lang='mr')
    tts.save(filename)

    # Provide a download link for the MP3 file; the path is escaped because
    # it is interpolated into an HTML attribute.
    link_target = escape(filename, quote=True)
    display(HTML(f'<a href="{link_target}" download>Download the audio response</a>'))

    # Attempt to play the audio in the notebook (this may work in some environments)
    display(Audio(filename, autoplay=True))

# Load and process the PDF (upload first!)
# The names assigned below are module-level globals; answer_query reads
# `sentences`, `model` and `sentence_embeddings` directly.
pdf_path = "/content/Extra-Ordinary_CENTRAL-SECTION_Part-4-Marathi-.pdf"  # Change if needed
print("Extracting text...")
raw_text = extract_text_from_pdf(pdf_path)
clean_text = preprocess_text(raw_text)
sentences = split_sentences(clean_text)
print(f"Total sentences: {len(sentences)}")

# Load sentence transformer and embed every corpus sentence once, so each
# query only needs its own embedding computed.
print("Encoding sentences...")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

# Interactive assistant with accuracy feedback.
# Each round reads a question, prints and speaks the answer, then optionally
# takes the expected answer so a running accuracy can be reported.
correct_count = 0  # answers judged correct
total_count = 0    # answers that were actually verified (skips excluded)
print("\nमराठी व्हॉईस असिस्टंट तयार आहे! प्रश्न विचारा (बाहेर पडण्यासाठी 'exit' टाइप करा):")
while True:
    query = input("\nतुमचा प्रश्न: ")
    if query.strip().lower() == 'exit':
        print("कायमचा संपवला.")
        break
    response = answer_query(query)
    print("उत्तर:", response)
    speak_marathi(response)

    # Optional accuracy check (for manual verification)
    expected = input("✅ योग्य उत्तर काय असायला हवं होतं? (Enter द्या वगळण्यासाठी): ").strip()
    # Clean the expected answer up front: if nothing survives cleaning, the
    # substring test below would be trivially true, so treat it as a skip.
    expected_clean = preprocess_text(expected) if expected else ""
    if expected_clean:
        # Only verified rounds count towards the denominator.
        total_count += 1
        response_clean = preprocess_text(response)
        if expected_clean in response_clean:
            print("बरोबर उत्तर दिलं.")
            correct_count += 1
        else:
            print("चुकीचं उत्तर.")
            print(f"अपेक्षित: {expected}")
    else:
        print("पडताळणी वगळली.")

    # Accuracy is undefined until at least one answer has been verified.
    if total_count:
        accuracy = (correct_count / total_count) * 100
        print(f"संपूर्ण अचूकता: {accuracy:.2f}% ({correct_count}/{total_count})")


Extracting text...
Total sentences: 20
Encoding sentences...
