In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m204.8/232.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
!pip install transformers



In [4]:
!pip install gTTS


Collecting gTTS
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.3


In [5]:
import PyPDF2
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, M2M100ForConditionalGeneration, M2M100Tokenizer
from gtts import gTTS  # Import gTTS for text-to-speech
import os

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    with open(pdf_file_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extraction must happen while the underlying file is still open.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

# Step 2: Load the Legal Pegasus Model and Tokenizer
def load_pegasus_model():
    """Load the ``nsi319/legal-pegasus`` tokenizer and model.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with ``from_pretrained``
        from the same checkpoint name.
    """
    checkpoint = "nsi319/legal-pegasus"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        PegasusTokenizer.from_pretrained(checkpoint),
        PegasusForConditionalGeneration.from_pretrained(checkpoint),
    )

# Step 3: Generate the Summary
def summarize_text(text, tokenizer, model, max_length=400):
    """Summarize ``text`` with a sequence-to-sequence model using beam search.

    Args:
        text: Document text to summarize. The tokenizer is called with
            ``truncation=True``, so input beyond its limit is dropped.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask``; it must also provide ``decode``.
        model: Model exposing ``generate``.
        max_length: Upper bound on the generated summary length, forwarded to
            ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    encoded = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = model.generate(
        encoded.input_ids,
        # Pass the mask explicitly so padded positions are never attended to.
        attention_mask=encoded.attention_mask,
        # Honour the caller's limit instead of a fixed constant.
        max_length=max_length,
        num_beams=5,
        # With beam search, a positive length penalty favours longer sequences.
        length_penalty=2.0,
        # Values above 1.0 discourage repeating already-generated tokens.
        repetition_penalty=2.5,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Step 4: Load the M2M100 Model and Tokenizer for translation
def load_translation_model_m2m100(source_lang="en", target_lang="hi"):
    """Load the ``facebook/m2m100_418M`` tokenizer and model for one language pair.

    Args:
        source_lang: Language code assigned to the tokenizer's ``src_lang``.
        target_lang: Language code assigned to the tokenizer's ``tgt_lang``.

    Returns:
        A ``(tokenizer, model)`` tuple; the tokenizer already carries the
        requested source and target language codes.
    """
    checkpoint = "facebook/m2m100_418M"
    m2m_tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
    m2m_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)

    # Record the translation direction on the tokenizer (source first, then target).
    for attribute, lang_code in (("src_lang", source_lang), ("tgt_lang", target_lang)):
        setattr(m2m_tokenizer, attribute, lang_code)

    return m2m_tokenizer, m2m_model

# Step 5: Translate the summarized text
def translate_text_m2m100(summary, source_lang="en", target_lang="hi"):
    """Translate ``summary`` from ``source_lang`` to ``target_lang`` with M2M100.

    The tokenizer and model are obtained from
    ``load_translation_model_m2m100`` on every call.

    Args:
        summary: Text to translate; tokenized with ``truncation=True``.
        source_lang: Source language code.
        target_lang: Target language code; its language id is forced as the
            first generated token.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    m2m_tokenizer, m2m_model = load_translation_model_m2m100(source_lang, target_lang)

    encoded = m2m_tokenizer(summary, return_tensors="pt", truncation=True)

    # Forcing the target-language id as the first token selects the output language.
    target_bos_id = m2m_tokenizer.get_lang_id(target_lang)
    generated = m2m_model.generate(**encoded, forced_bos_token_id=target_bos_id)

    return m2m_tokenizer.decode(generated[0], skip_special_tokens=True)

# Step 6: Convert text to speech using gTTS
def text_to_speech(text, lang="en", file_name="output.mp3"):
    """Synthesize ``text`` to an MP3 file with gTTS and try to play it.

    Args:
        text: Text to speak.
        lang: Language code passed to gTTS.
        file_name: Path of the MP3 file to write.

    The file is played through the ``mpg321`` command-line player via the
    shell. The player's exit status is not checked, so a missing player does
    not raise; the file is still saved.
    """
    import shlex  # local import: only needed to build the shell command safely

    tts = gTTS(text=text, lang=lang, slow=False)
    tts.save(file_name)
    # Quote the path so spaces or shell metacharacters in the file name are
    # passed to the player as a single literal argument.
    os.system(f"mpg321 {shlex.quote(file_name)}")  # Swap in another player if mpg321 is unavailable
    print(f"Audio saved as {file_name}")

# Step 7: Main function to generate both English and translated summary with TTS
def generate_legal_summary_with_translation_and_tts(pdf_file_path, target_language="hi"):
    """Run the full pipeline: PDF text -> English summary -> translation -> audio.

    Args:
        pdf_file_path: Path of the PDF document to summarize.
        target_language: Language code for the translated summary and its audio.

    Returns:
        A ``(english_summary, translated_summary)`` tuple of strings.

    Side effects:
        Writes ``english_summary.mp3`` and
        ``translated_summary_<target_language>.mp3`` through ``text_to_speech``.
    """
    # Summarize the extracted document text in English with Legal Pegasus.
    document_text = extract_text_from_pdf(pdf_file_path)
    pegasus_tokenizer, pegasus_model = load_pegasus_model()
    english_summary = summarize_text(document_text, pegasus_tokenizer, pegasus_model)

    # Translate the English summary into the requested language.
    translated_summary = translate_text_m2m100(
        english_summary,
        source_lang="en",
        target_lang=target_language,
    )

    # Produce one audio file per summary, English first.
    speech_jobs = (
        (english_summary, "en", "english_summary.mp3"),
        (translated_summary, target_language, f"translated_summary_{target_language}.mp3"),
    )
    for spoken_text, lang_code, mp3_name in speech_jobs:
        text_to_speech(spoken_text, lang=lang_code, file_name=mp3_name)

    return english_summary, translated_summary

# Step 8: User Input for PDF file path and language code (Indian languages only)
if __name__ == "__main__":
    # Language codes offered to the user, mapped to their display names.
    supported_languages = {
        "hi": "Hindi",
        "bn": "Bengali",
        "ta": "Tamil",
        "te": "Telugu",
        "gu": "Gujarati",
        "mr": "Marathi",
        "pa": "Punjabi",
        "ur": "Urdu",
        "kn": "Kannada",
        "ml": "Malayalam",
        "or": "Oriya",
    }

    # Colab-only helper: opens an upload dialog and returns the uploaded files
    # keyed by file name.
    from google.colab import files
    uploaded = files.upload()

    # Each uploaded file gets its own language prompt and its own summary run.
    for file_name in uploaded:
        try:
            # Show the menu of language codes before prompting.
            print("\nSupported Indian Languages for Translation:")
            for code, lang in supported_languages.items():
                print(f"{code}: {lang}")

            target_language = input("Enter the target language code (e.g., 'hi' for Hindi): ").strip()

            # Reject unknown codes up front; the file is skipped, not re-prompted.
            if target_language not in supported_languages:
                print("Invalid language code. Please try again.")
                continue

            # Summarize, translate and synthesize audio for this file.
            english_summary, translated_summary = generate_legal_summary_with_translation_and_tts(
                file_name,
                target_language=target_language,
            )

            print("\nSummary in English:\n")
            print(english_summary)

            print(f"\nTranslated Summary ({supported_languages[target_language]}):\n")
            print(translated_summary)

        except Exception as e:
            # Report the failure for this file and carry on with any remaining files.
            print(f"An error occurred: {e}")


Saving HackOdisha1.pdf to HackOdisha1 (1).pdf

Supported Indian Languages for Translation:
hi: Hindi
bn: Bengali
ta: Tamil
te: Telugu
gu: Gujarati
mr: Marathi
pa: Punjabi
ur: Urdu
kn: Kannada
ml: Malayalam
or: Oriya
Enter the target language code (e.g., 'hi' for Hindi): bn


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Audio saved as english_summary.mp3
Audio saved as translated_summary_bn.mp3

Summary in English:

On August 22, 2024, the Supreme Court of India (S.C.I.A.) upheld a lower court's decision to dismiss an appeal filed by two employees against their denial of pensionary benefits under the 6th Central Pay Commission ( CPC) Rules, 2008 for being temporary employees of a scheme managed by contributory pooling of funds. The S.C.I.A. ruled that the appellants met the characteristics of regular government servants and therefore were entitled to pensionary benefits under the CPC Rules. The S.C.I.A. held that they had been appointed on a regular pay scale indicating a formalised employee-employer relationship akin to permanent government employees. The S.C.I.A. also held that there was a level of governmental oversight and control consistent with regular government service. The S.C.I.A. dismissed the appeal filed by Pradeep Rajkaran Singh & Ors. v. Union of India & Ors.

Translated Summary (Bengal