### 🧠 Multilingual Insurance Document Translator
#### Goal: Translate a single English insurance document (PDF) into multiple languages in one go.

✅ Features

📄 Extract text from PDF.

🌍 Select multiple target languages.

🤖 Translate using Hugging Face Transformers.

📝 Generate one translated PDF per language.


#### 1. Install Required Packages

In [1]:
pip install transformers torch sentencepiece fpdf pymupdf


Note: you may need to restart the kernel to use updated packages.


#### 2. Script to Translate English PDF to Multiple Languages


In [2]:
import os
import fitz  # PyMuPDF
from fpdf import FPDF
import textwrap
from googletrans import Translator

# Constants
FONT_PATH = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSans-Regular.ttf"
FONT_FAMILY = "NotoSans"
DEFAULT_LANGUAGE = "Hindi"

# Define supported languages with Google Translate codes
SUPPORTED_LANGUAGES = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Gujarati": "gu",
    "Kannada": "kn",
    "Bengali": "bn",
    "Punjabi": "pa",
    "Marathi": "mr",
    "Malayalam": "ml",
    "Urdu": "ur",
    "Odia": "or"
}

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

# Step 2: Translate text to multiple languages
def translate_text(text, target_languages):
    translator = Translator()
    translations = {}
    for language in target_languages:
        try:
            lang_code = SUPPORTED_LANGUAGES[language]
            print(f"🔁 Translating to {language}...")
            translated = translator.translate(text, dest=lang_code)
            translations[language] = translated.text
        except Exception as e:
            print(f"❌ Error translating to {language}: {e}")
    return translations

# Step 3: PDF generator that preserves alignment
class UnicodePDF(FPDF):
    def __init__(self, font_path, font_family, title=""):
        super().__init__()
        if not os.path.isfile(font_path):
            raise RuntimeError(f"❌ Font file not found: {font_path}")
        self.font_family = font_family
        self.title = title  # Set title before add_page
        self.add_font(font_family, '', font_path, uni=True)
        self.add_page()
        self.set_font(font_family, '', 12)
        self.set_left_margin(10)
        self.set_right_margin(10)

    def header(self):
        self.set_font(self.font_family, '', 16)
        self.cell(0, 10, self.title, ln=True, align='C')
        self.ln(10)
        self.set_font(self.font_family, '', 12)

    def add_multiline_text(self, text):
        for paragraph in text.split("\n"):
            wrapped = textwrap.fill(paragraph.strip(), width=90)
            self.multi_cell(0, 8, wrapped)
            self.ln(2)

    def save(self, output_path):
        self.output(output_path)

# Step 4: Create the translated PDF
def create_unicode_pdf(translated_text, language, output_path):
    try:
        pdf = UnicodePDF(FONT_PATH, FONT_FAMILY, f"Translated Insurance Policy - {language}")
        pdf.add_multiline_text(translated_text)
        pdf.save(output_path)
        print(f"✅ PDF saved for {language}: {output_path}")
    except RuntimeError as e:
        print(e)

# Step 5: Orchestrate the process
def main(pdf_path, selected_languages):
    text = extract_text_from_pdf(pdf_path)
    translations = translate_text(text, selected_languages)
    for language, translated_text in translations.items():
        output_path = f"Translated_Insurance_Policy_{language}.pdf"
        create_unicode_pdf(translated_text, language, output_path)

# Step 6: Example usage
if __name__ == "__main__":
    selected_languages = ["Hindi", "Tamil", "Telugu"] 
    valid_languages = [lang for lang in selected_languages if lang in SUPPORTED_LANGUAGES]

    if valid_languages:
        pdf_path = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Dataset/Health_Insurance_Policy.pdf"
        main(pdf_path, valid_languages)
    else:
        print("❌ No valid languages selected.")


🔁 Translating to Hindi...
🔁 Translating to Tamil...
🔁 Translating to Telugu...
✅ PDF saved for Hindi: Translated_Insurance_Policy_Hindi.pdf
✅ PDF saved for Tamil: Translated_Insurance_Policy_Tamil.pdf
✅ PDF saved for Telugu: Translated_Insurance_Policy_Telugu.pdf


#### Summarizer

In [3]:
from transformers import pipeline
from googletrans import Translator
import PyPDF2

# Initialize translation and summarization tools
translator = Translator()
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Step 1: Extract text from translated language PDF
def extract_pdf_text(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        full_text = ""
        for page in reader.pages:
            text = page.extract_text()
            if text:
                full_text += text
        return full_text

# Step 2: Translate to English
def translate_to_english(text):
    translated = translator.translate(text, dest='en')
    return translated.text

# Step 3: Summarize in English
def summarize_text(text):
    max_chunk = 1000
    summary = ""
    for i in range(0, len(text), max_chunk):
        chunk = text[i:i+max_chunk]
        result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summary += result[0]['summary_text'] + "\n"
    return summary

# Step 4: Optional - Translate summary back to original language (e.g., Hindi)
def translate_back_to_language(text, dest_lang='hi'):
    translated = translator.translate(text, dest=dest_lang)
    return translated.text

# Full pipeline
def summarize_translated_pdf(pdf_path, original_lang_code='hi'):
    original_text = extract_pdf_text(pdf_path)
    print("🔍 Original Language Detected Text:\n", original_text[:300])  # Preview

    english_text = translate_to_english(original_text)
    english_summary = summarize_text(english_text)

    final_summary = translate_back_to_language(english_summary, dest_lang=original_lang_code)
    return final_summary

# Run on Hindi translated PDF
pdf_path = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Translated_Insurance_Policy_Hindi.pdf"
summary = summarize_translated_pdf(pdf_path, original_lang_code='hi')

print("\n📄 हिंदी सारांश:\n", summary)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


FileNotFoundError: [Errno 2] No such file or directory: 'D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Translated_Insurance_Policy_Hindi.pdf'

#### BLEU Score

In [4]:
import os
import fitz  # PyMuPDF
from fpdf import FPDF
import textwrap
from googletrans import Translator
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk

# Download required tokenizer
nltk.download('punkt')

# Constants
FONT_PATH = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSans-Regular.ttf"
FONT_FAMILY = "NotoSans"
DEFAULT_LANGUAGE = "Hindi"

# Supported languages
SUPPORTED_LANGUAGES = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Gujarati": "gu",
    "Kannada": "kn",
    "Bengali": "bn",
    "Punjabi": "pa",
    "Marathi": "mr",
    "Malayalam": "ml",
    "Urdu": "ur",
    "Odia": "or"
}

# Reference translations for BLEU score evaluation
REFERENCE_TRANSLATIONS = {
    "Hindi": "यह एक स्वास्थ्य बीमा पॉलिसी है।",
    "Tamil": "இது ஒரு சுகாதார காப்பீட்டு கொள்கை.",
    "Telugu": "ఇది ఆరోగ్య బీమా పాలసీ."
    # Add more human reference translations here as needed
}

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

# Step 2: Translate text
def translate_text(text, target_languages):
    translator = Translator()
    translations = {}
    for language in target_languages:
        try:
            lang_code = SUPPORTED_LANGUAGES[language]
            print(f"🔁 Translating to {language}...")
            translated = translator.translate(text, dest=lang_code)
            translations[language] = translated.text
        except Exception as e:
            print(f"❌ Error translating to {language}: {e}")
    return translations

# Step 3: Unicode PDF generator
class UnicodePDF(FPDF):
    def __init__(self, font_path, font_family, title=""):
        super().__init__()
        if not os.path.isfile(font_path):
            raise RuntimeError(f"❌ Font file not found: {font_path}")
        self.font_family = font_family
        self.title = title
        self.add_font(font_family, '', font_path, uni=True)
        self.add_page()
        self.set_font(font_family, '', 12)
        self.set_left_margin(10)
        self.set_right_margin(10)

    def header(self):
        self.set_font(self.font_family, '', 16)
        self.cell(0, 10, self.title, ln=True, align='C')
        self.ln(10)
        self.set_font(self.font_family, '', 12)

    def add_multiline_text(self, text):
        for paragraph in text.split("\n"):
            wrapped = textwrap.fill(paragraph.strip(), width=90)
            self.multi_cell(0, 8, wrapped)
            self.ln(2)

    def save(self, output_path):
        self.output(output_path)

# Step 4: Save translated PDF
def create_unicode_pdf(translated_text, language, output_path):
    try:
        pdf = UnicodePDF(FONT_PATH, FONT_FAMILY, f"Translated Insurance Policy - {language}")
        pdf.add_multiline_text(translated_text)
        pdf.save(output_path)
        print(f"✅ PDF saved for {language}: {output_path}")
    except RuntimeError as e:
        print(e)

# Step 5: Compute BLEU Score
def compute_bleu(candidate_text, reference_text):
    candidate_tokens = word_tokenize(candidate_text)
    reference_tokens = [word_tokenize(reference_text)]
    score = sentence_bleu(reference_tokens, candidate_tokens,
                          weights=(0.25, 0.25, 0.25, 0.25),
                          smoothing_function=SmoothingFunction().method4)
    return score

# Step 6: Main orchestrator
def main(pdf_path, selected_languages):
    text = extract_text_from_pdf(pdf_path)
    translations = translate_text(text, selected_languages)
    
    for language, translated_text in translations.items():
        output_path = f"Translated_Insurance_Policy_{language}.pdf"
        create_unicode_pdf(translated_text, language, output_path)

        # BLEU score evaluation
        reference_text = REFERENCE_TRANSLATIONS.get(language)
        if reference_text:
            bleu_score = compute_bleu(translated_text, reference_text)
            print(f"🟦 BLEU Score for {language}: {bleu_score:.4f}")
        else:
            print(f"⚠️ No reference translation found for {language}. Skipping BLEU score.")

# Step 7: Run the script
if __name__ == "__main__":
    selected_languages = ["Hindi", "Tamil", "Telugu"]  # Choose from SUPPORTED_LANGUAGES
    valid_languages = [lang for lang in selected_languages if lang in SUPPORTED_LANGUAGES]

    if valid_languages:
        pdf_path = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Dataset/Health_Insurance_Policy.pdf"
        main(pdf_path, valid_languages)
    else:
        print("❌ No valid languages selected.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sures\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔁 Translating to Hindi...
🔁 Translating to Tamil...
🔁 Translating to Telugu...
✅ PDF saved for Hindi: Translated_Insurance_Policy_Hindi.pdf
🟦 BLEU Score for Hindi: 0.0022
✅ PDF saved for Tamil: Translated_Insurance_Policy_Tamil.pdf
🟦 BLEU Score for Tamil: 0.0027
✅ PDF saved for Telugu: Translated_Insurance_Policy_Telugu.pdf
🟦 BLEU Score for Telugu: 0.0026


In [7]:
import os
import shutil
import tempfile
import PyPDF2
import re
from googletrans import Translator
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from reportlab.lib.pagesizes import A4
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, KeepTogether
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.lib.units import inch

import nltk
nltk.download('punkt')

# ========== Font Mapping ==========
language_fonts = {
    "en": "fonts/NotoSans-Regular.ttf",
    "hi": "fonts/NotoSansDevanagari-Regular.ttf",
    "ta": "fonts/NotoSansTamil-Regular.ttf",
    "te": "fonts/NotoSansTelugu-Regular.ttf"
}

# ========== BLEU Reference Translations ==========
REFERENCE_TRANSLATIONS = {
    "Hindi": "यह एक स्वास्थ्य बीमा पॉलिसी है।",
    "Tamil": "இது ஒரு சுகாதார காப்பீட்டு கொள்கை.",
    "Telugu": "ఇది ఆరోగ్య బీమా పాలసీ."
}

# ========== Extract Text ==========
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        return " ".join(page.extract_text() or "" for page in reader.pages)

# ========== Translate ==========
def translate_text(text, lang_code):
    translator = Translator()
    return translator.translate(text, dest=lang_code).text

# ========== Summarize ==========
def summarize_text(text, sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return " ".join(str(sentence) for sentence in summary)

# ========== BLEU Score ==========
def compute_bleu(candidate, reference):
    candidate_tokens = word_tokenize(candidate)
    reference_tokens = [word_tokenize(reference)]
    return sentence_bleu(reference_tokens, candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=SmoothingFunction().method4)

# ========== PDF Generation ==========
def generate_pdf(text, font_path, output_path, title="Translated Insurance Policy"):
    font_name = os.path.splitext(os.path.basename(font_path))[0]
    pdfmetrics.registerFont(TTFont(font_name, font_path))

    doc = SimpleDocTemplate(output_path, pagesize=A4,
                            rightMargin=40, leftMargin=40,
                            topMargin=60, bottomMargin=40)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='Justify', fontName=font_name, fontSize=12, leading=16,
                              alignment=TA_JUSTIFY, spaceAfter=10))

    elements = [Paragraph(title, ParagraphStyle(name='Title', fontName=font_name, fontSize=16,
                                                alignment=1, spaceAfter=20))]

    normalized_text = re.sub(r'\n+', '\n', text)
    normalized_text = re.sub(r'\s{2,}', ' ', normalized_text)
    paragraphs = [p.strip() for p in normalized_text.split('\n') if len(p.strip()) > 20]

    for para in paragraphs:
        elements.append(KeepTogether([Paragraph(para, styles['Justify']), Spacer(1, 0.15 * inch)]))

    doc.build(elements)

# ========== Main Logic ==========
def process_policy(pdf_path, selected_languages):
    if not os.path.exists(pdf_path):
        print("❌ PDF not found.")
        return

    print("📄 Extracting text...")
    extracted_text = extract_text_from_pdf(pdf_path)

    print("🧠 Summarizing text...")
    english_summary = summarize_text(extracted_text)

    print("\n🔤 Summary in English:\n", english_summary)

    for lang_name in selected_languages:
        lang_code = {
            "Hindi": "hi",
            "Tamil": "ta",
            "Telugu": "te"
        }.get(lang_name)

        if not lang_code:
            print(f"⚠️ Language '{lang_name}' not supported.")
            continue

        print(f"\n🌐 Translating to {lang_name}...")

        translated_text = translate_text(extracted_text, lang_code)
        translated_summary = translate_text(english_summary, lang_code)
        full_translated_content = f"{translated_summary}\n\n{translated_text}"

        # BLEU score (if reference available)
        ref = REFERENCE_TRANSLATIONS.get(lang_name)
        if ref:
            bleu = compute_bleu(translated_text, ref)
            print(f"🔵 BLEU Score for {lang_name}: {bleu:.4f}")
        else:
            print(f"ℹ️ No reference translation available for {lang_name}, skipping BLEU.")

        font_path = language_fonts[lang_code]
        output_pdf = f"Translated_Insurance_Policy_{lang_name}.pdf"
        generate_pdf(full_translated_content, font_path, output_pdf, title=f"Insurance Policy - {lang_name}")
        print(f"✅ PDF saved: {output_pdf}")

# ========== Run ==========
if __name__ == "__main__":
    # Example
    policy_path = "Dataset/Health_Insurance_Policy.pdf"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sures\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import os
import re
import shutil
import tempfile
import PyPDF2
from googletrans import Translator
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from reportlab.lib.pagesizes import A4
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, KeepTogether
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.lib.units import inch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')

# Font mapping per language code
language_fonts = {
    "en": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSans-Regular.ttf",
    "ta": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "hi": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansDevanagari-Regular.ttf",
    "te": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTelugu-Regular.ttf",
    "kn": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansKannada-Regular.ttf",
    "ml": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansMalayalam-Regular.ttf",
    "bn": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansBengali-Regular.ttf",
    "gu": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansGujarati-Regular.ttf",
    "ur": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansArabic-Regular.ttf"
}

# Reference translations for BLEU evaluation
REFERENCE_TRANSLATIONS = {
    "Hindi": "यह एक स्वास्थ्य बीमा पॉलिसी है।",
    "Tamil": "இது ஒரு சுகாதார காப்பீட்டுத் திட்டம் பற்றிய ஆவணமாகும்.",
    "Telugu": "ఇది ఆరో


SyntaxError: unterminated string literal (detected at line 41) (3150045508.py, line 41)

In [9]:
import os
import re
import shutil
import tempfile
import PyPDF2
from googletrans import Translator
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from reportlab.lib.pagesizes import A4
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, KeepTogether
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.lib.units import inch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')

# Font mapping per language code
language_fonts = {
    "en": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSans-Regular.ttf",
    "ta": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "hi": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansDevanagari-Regular.ttf",
    "te": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTelugu-Regular.ttf",
    "kn": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansKannada-Regular.ttf",
    "ml": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansMalayalam-Regular.ttf",
    "bn": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansBengali-Regular.ttf",
    "gu": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansGujarati-Regular.ttf",
    "ur": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansArabic-Regular.ttf"
}

# Reference translations for BLEU evaluation
REFERENCE_TRANSLATIONS = {
    "Hindi": "यह एक स्वास्थ्य बीमा पॉलिसी है।",
    "Tamil": "இது ஒரு சுகாதார காப்பீட்டுத் திட்டம் பற்றிய ஆவணமாகும்.",
    "Telugu": "ఇది ఆరోగ్య బీమా పాలసీకి సంబంధించిన డాక్యుమెంట్."
}

language_map = {
    "English": "en",
    "Tamil": "ta",
    "Hindi": "hi",
    "Telugu": "te",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Bengali": "bn",
    "Gujarati": "gu",
    "Urdu": "ur",
}

# Extract text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ''.join(page.extract_text() or "" for page in reader.pages)
    return text

# Translate text
def translate_text(text, dest_language):
    translator = Translator()
    return translator.translate(text, dest=dest_language).text

# Summarize text
def summarize_text(text, language="english", sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join(str(sentence) for sentence in summary)

# Generate PDF
def generate_translated_pdf(text, font_path, output_path, lang_title="Translated Insurance Policy"):
    font_name = os.path.splitext(os.path.basename(font_path))[0]
    pdfmetrics.registerFont(TTFont(font_name, font_path))

    doc = SimpleDocTemplate(output_path, pagesize=A4,
                            rightMargin=40, leftMargin=40, topMargin=60, bottomMargin=40)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='Justify', fontName=font_name, fontSize=12,
                              leading=16, alignment=TA_JUSTIFY, spaceAfter=10))

    elements = []

    title_style = ParagraphStyle(name='Title', fontName=font_name, fontSize=16, alignment=1, spaceAfter=20)
    elements.append(Paragraph(lang_title, title_style))

    normalized_text = re.sub(r'\n+', '\n', text)
    normalized_text = re.sub(r'\s{2,}', ' ', normalized_text)
    paragraphs = [p.strip() for p in normalized_text.split('\n') if len(p.strip()) > 20]

    for para in paragraphs:
        elements.append(KeepTogether([
            Paragraph(para, styles['Justify']),
            Spacer(1, 0.15 * inch)
        ]))

    doc.build(elements)

# Compute BLEU score
def compute_bleu(candidate, reference):
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)

# Main processing function
def process_policy_pdf(pdf_path, selected_languages):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("\n📝 English Summary:")
    english_summary = summarize_text(extracted_text)
    print(english_summary)

    for lang_name in selected_languages:
        lang_code = language_map.get(lang_name)
        if not lang_code:
            print(f"\n❌ Skipping unsupported language: {lang_name}")
            continue

        print(f"\n🔁 Translating to {lang_name}...")
        translated_text = translate_text(extracted_text, lang_code)
        translated_summary = translate_text(english_summary, lang_code)
        full_translated = f"{translated_summary}\n\n{translated_text}"

        font_path = language_fonts.get(lang_code, language_fonts['en'])
        os.makedirs("Translated_Policies", exist_ok=True)
        output_path = os.path.join("Translated_Policies", f"Translated_Insurance_Policy_{lang_name}.pdf")
        generate_translated_pdf(full_translated, font_path, output_path, f"Translated Insurance Policy ({lang_name})")

        print(f"✅ Saved PDF: {output_path}")

        # BLEU score
        ref = REFERENCE_TRANSLATIONS.get(lang_name)
        if ref:
            bleu = compute_bleu(translated_summary, ref)
            print(f"🔵 BLEU Score for {lang_name}: {bleu:.4f}")
        else:
            print(f"⚠️ No reference translation for BLEU in {lang_name}.")

# Run example
if __name__ == "__main__":
    sample_pdf = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Dataset/Health_Insurance_Policy.pdf"
    selected_languages = ["Hindi", "Tamil", "Telugu"]
    process_policy_pdf(sample_pdf, selected_languages)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sures\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



📝 English Summary:
Coverage Summary This policy covers the following medical expenses: - Hospitalization (minimum 24 hours) - Pre-Hospitalization (30 days prior) - Post-Hospitalization (60 days post) - Daycare procedures (up to 500 listed procedures) - Emergency Ambulance (up to Rs.2,000 per hospitalization) - COVID-19 Treatment 3. Submit the following documents: - Discharge summary - Hospital bills - Doctor's prescription - ID proof 3. Renewal Terms - Policy must be renewed annually to avoid a lapse in coverage.

🔁 Translating to Hindi...
✅ Saved PDF: Translated_Policies\Translated_Insurance_Policy_Hindi.pdf
🔵 BLEU Score for Hindi: 0.0051

🔁 Translating to Tamil...
✅ Saved PDF: Translated_Policies\Translated_Insurance_Policy_Tamil.pdf
🔵 BLEU Score for Tamil: 0.0060

🔁 Translating to Telugu...
✅ Saved PDF: Translated_Policies\Translated_Insurance_Policy_Telugu.pdf
🔵 BLEU Score for Telugu: 0.0056


In [12]:
import os
import re
import shutil
import tempfile
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from googletrans import Translator
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, KeepTogether
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.lib.units import inch

nltk.download('punkt')

# Fonts
language_fonts = {
    "en": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "hi": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "ta": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "te": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
}

language_map = {
    "English": "en",
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
}

# Step 1: Extract text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

# Step 2: Summarize text
def summarize_text(text, language="english", sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return " ".join(str(sentence) for sentence in summary)

# Step 3: Translate text
def translate_text(text, target_lang):
    translator = Translator()
    return translator.translate(text, dest=target_lang).text

# Step 4: Back-translate to English
def back_translate(text, src_lang):
    translator = Translator()
    return translator.translate(text, src='auto', dest='en').text

# Step 5: Generate aligned PDF
def generate_pdf(text, font_path, output_path, title="Translated Insurance Policy"):
    font_name = os.path.splitext(os.path.basename(font_path))[0]
    pdfmetrics.registerFont(TTFont(font_name, font_path))
    
    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=40, leftMargin=40, topMargin=60, bottomMargin=40)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(
        name='Justify',
        fontName=font_name,
        fontSize=12,
        leading=16,
        alignment=TA_JUSTIFY,
        spaceAfter=10
    ))

    elements = []
    elements.append(Paragraph(title, ParagraphStyle(name='Title', fontName=font_name, fontSize=16, alignment=1, spaceAfter=20)))
    
    clean_text = re.sub(r'\n+', '\n', text)
    paragraphs = [p.strip() for p in clean_text.split('\n') if len(p.strip()) > 20]
    
    for para in paragraphs:
        elements.append(KeepTogether([Paragraph(para, styles['Justify']), Spacer(1, 0.15 * inch)]))
    
    doc.build(elements)

# Step 6: Compute BLEU Score
def compute_bleu(reference_text, translated_back_text):
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    translated_tokens = nltk.word_tokenize(translated_back_text.lower())
    smoothie = SmoothingFunction().method4
    score = sentence_bleu([reference_tokens], translated_tokens, smoothing_function=smoothie)
    return score

# Main Process
def main():
    input_pdf = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Dataset/Health_Insurance_Policy.pdf"
    selected_languages = ["Hindi", "Tamil", "Telugu"]

    print("🔍 Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(input_pdf)

    print("\n📝 English Summary:")
    summary = summarize_text(extracted_text)
    print(summary)

    for lang in selected_languages:
        lang_code = language_map[lang]
        font_path = language_fonts[lang_code]

        print(f"\n🔁 Translating to {lang}...")
        translated = translate_text(summary, lang_code)
        translated_pdf_path = f"Translated_Policies/Translated_Insurance_Policy_{lang}.pdf"
        os.makedirs("Translated_Policies", exist_ok=True)

        full_text = f"{translated}\n\n{extracted_text}"
        generate_pdf(full_text, font_path, translated_pdf_path, title=f"Translated Insurance Policy ({lang})")
        print(f"✅ Saved PDF: {translated_pdf_path}")

        # Back-translate and calculate BLEU
        translated_back = back_translate(translated, lang_code)
        bleu = compute_bleu(summary, translated_back)
        print(f"🔵 BLEU Score for {lang}: {bleu:.4f}")

if __name__ == "__main__":
    main()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sures\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔍 Extracting text from PDF...

📝 English Summary:
Coverage Summary This policy covers the following medical expenses: - Hospitalization (minimum 24 hours) - Pre-Hospitalization (30 days prior) - Post-Hospitalization (60 days post) - Daycare procedures (up to 500 listed procedures) - Emergency Ambulance (up to Rs.2,000 per hospitalization) - COVID-19 Treatment 3. Submit the following documents: - Discharge summary - Hospital bills - Doctor's prescription - ID proof 3. Renewal Terms - Policy must be renewed annually to avoid a lapse in coverage.

🔁 Translating to Hindi...
✅ Saved PDF: Translated_Policies/Translated_Insurance_Policy_Hindi.pdf
🔵 BLEU Score for Hindi: 0.3292

🔁 Translating to Tamil...
✅ Saved PDF: Translated_Policies/Translated_Insurance_Policy_Tamil.pdf
🔵 BLEU Score for Tamil: 0.2734

🔁 Translating to Telugu...
✅ Saved PDF: Translated_Policies/Translated_Insurance_Policy_Telugu.pdf
🔵 BLEU Score for Telugu: 0.2416


#### ROUGE score

In [13]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Downloading absl_py-2.2.2-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml): started
  Building wheel for rouge_score (pyproject.toml): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=25027 sha256=1e711cb1b7cd181778527015cd3faa4d1ce8ed439db89857a7a28497dde4f405
  Stored in directory: c:\users\sures\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef54687

In [16]:
import os
import re
import shutil
import tempfile
from nltk.tokenize import word_tokenize
from googletrans import Translator
from rouge_score import rouge_scorer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, KeepTogether
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.lib.units import inch

# Fonts mapping
language_fonts = {
    "en": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "hi": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "ta": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
    "te": "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/fonts/NotoSansTamil-Regular.ttf",
}

# Language code mapping
language_map = {
    "English": "en",
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
}

# Step 1: Extract text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

# Step 2: Summarize text
def summarize_text(text, language="english", sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return " ".join(str(sentence) for sentence in summary)

# Step 3: Translate text
def translate_text(text, target_lang):
    translator = Translator()
    return translator.translate(text, dest=target_lang).text

# Step 4: Back-translate to English
def back_translate(text, src_lang):
    translator = Translator()
    return translator.translate(text, src='auto', dest='en').text

# Step 5: Generate PDF
def generate_pdf(text, font_path, output_path, title="Translated Insurance Policy"):
    font_name = os.path.splitext(os.path.basename(font_path))[0]
    pdfmetrics.registerFont(TTFont(font_name, font_path))
    
    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=40, leftMargin=40, topMargin=60, bottomMargin=40)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(
        name='Justify',
        fontName=font_name,
        fontSize=12,
        leading=16,
        alignment=TA_JUSTIFY,
        spaceAfter=10
    ))

    elements = []
    elements.append(Paragraph(title, ParagraphStyle(name='Title', fontName=font_name, fontSize=16, alignment=1, spaceAfter=20)))
    
    clean_text = re.sub(r'\n+', '\n', text)
    paragraphs = [p.strip() for p in clean_text.split('\n') if len(p.strip()) > 20]
    
    for para in paragraphs:
        elements.append(KeepTogether([Paragraph(para, styles['Justify']), Spacer(1, 0.15 * inch)]))
    
    doc.build(elements)

# Step 6: Compute ROUGE Score
def compute_rouge(reference_text, translated_back_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, translated_back_text)
    return scores

# Main Process
def main():
    input_pdf = "D:/AI-Powered Intelligent Insurance Risk Assessment and Customer Insights System/Dataset/Health_Insurance_Policy.pdf"
    selected_languages = ["Hindi", "Tamil", "Telugu"]

    print("🔍 Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(input_pdf)

    print("\n📝 English Summary:")
    summary = summarize_text(extracted_text)
    print(summary)

    for lang in selected_languages:
        lang_code = language_map[lang]
        font_path = language_fonts[lang_code]

        print(f"\n🔁 Translating to {lang}...")
        translated = translate_text(summary, lang_code)
        translated_pdf_path = f"Translated_Policies/Translated_Insurance_Policy_{lang}.pdf"
        os.makedirs("Translated_Policies", exist_ok=True)

        full_text = f"{translated}\n\n{extracted_text}"
        generate_pdf(full_text, font_path, translated_pdf_path, title=f"Translated Insurance Policy ({lang})")
        print(f"✅ Saved PDF: {translated_pdf_path}")

        # Back-translate and calculate ROUGE
        translated_back = back_translate(translated, lang_code)
        rouge_scores = compute_rouge(summary, translated_back)
        
        print(f"🔵 ROUGE Score for {lang}:")
        print(f" ROUGE-1: {rouge_scores['rouge1']}")
        print(f" ROUGE-2: {rouge_scores['rouge2']}")
        print(f" ROUGE-L: {rouge_scores['rougeL']}")

if __name__ == "__main__":
    main()


🔍 Extracting text from PDF...

📝 English Summary:
Coverage Summary This policy covers the following medical expenses: - Hospitalization (minimum 24 hours) - Pre-Hospitalization (30 days prior) - Post-Hospitalization (60 days post) - Daycare procedures (up to 500 listed procedures) - Emergency Ambulance (up to Rs.2,000 per hospitalization) - COVID-19 Treatment 3. Submit the following documents: - Discharge summary - Hospital bills - Doctor's prescription - ID proof 3. Renewal Terms - Policy must be renewed annually to avoid a lapse in coverage.

🔁 Translating to Hindi...
✅ Saved PDF: Translated_Policies/Translated_Insurance_Policy_Hindi.pdf
🔵 ROUGE Score for Hindi:
 ROUGE-1: Score(precision=0.8095238095238095, recall=0.4857142857142857, fmeasure=0.6071428571428571)
 ROUGE-2: Score(precision=0.6341463414634146, recall=0.37681159420289856, fmeasure=0.4727272727272728)
 ROUGE-L: Score(precision=0.7857142857142857, recall=0.4714285714285714, fmeasure=0.5892857142857143)

🔁 Translating to Ta