In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyPDF2 transformers langdetect



In [None]:
import os
from PyPDF2 import PdfReader
from langdetect import detect, DetectorFactory, lang_detect_exception
from transformers import AutoTokenizer , AutoModelForSeq2SeqLM


In [None]:
# Hugging Face access token. It is read from the HUGGINGFACE_API_KEY
# environment variable when that is set (and non-empty), so a real key never
# has to be saved inside the notebook; otherwise the "your key" placeholder
# is kept as the value.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY") or "your key"

In [None]:
#function to extract the text from the pdf
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Args:
        pdf_path: Path handed to ``PdfReader``.

    Returns:
        The per-page results of ``page.extract_text()`` concatenated in page
        order, with no separator inserted between pages.
    """
    pdf_reader = PdfReader(pdf_path)
    # `or ""` keeps a page with a falsy result from breaking the concatenation
    # (some PyPDF2 versions may return None for pages without a text layer --
    # TODO confirm). Joining once also avoids growing a string with += per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
#function to translate text with the M2M100 model (the "nllb" name is kept for existing callers)
import re
from transformers import AutoTokenizer, M2M100ForConditionalGeneration
def translate_text_nllb(text, target_language, source_language=None):
  """Translate ``text`` into ``target_language`` with facebook/m2m100_418M.

  The text is split into paragraphs (runs of lines separated by blank or
  whitespace-only lines) and translated paragraph by paragraph. Feeding a
  whole document to the model as a single sequence produced a translation
  that stopped part-way through the input, presumably because generation
  ends at its default length limit.

  Args:
    text: The text to translate. ``None`` is treated like an empty string.
    target_language: Language code passed to ``tokenizer.get_lang_id``
      (for example "fr" or "hi").
    source_language: Optional language code of ``text``. When given it is
      assigned to ``tokenizer.src_lang``; when omitted the tokenizer's
      default source language is left untouched.

  Returns:
    A list holding a single string: the translated paragraphs joined with
    newlines. The string is empty when ``text`` has no non-whitespace
    content.
  """
  # Collapse whitespace inside each paragraph so that line wraps left over
  # from PDF extraction do not split sentences.
  paragraphs = [" ".join(chunk.split()) for chunk in re.split(r"\n\s*\n", text or "")]
  paragraphs = [paragraph for paragraph in paragraphs if paragraph]
  if not paragraphs:
    # Nothing to translate, so do not load the model at all.
    return [""]

  model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
  tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
  if source_language is not None:
    tokenizer.src_lang = source_language

  # Id of the language token that generation is forced to start with.
  target_lang_id = tokenizer.get_lang_id(target_language)

  batch_size = 8  # paragraphs per generate() call, to bound padding and memory use
  translated_paragraphs = []
  for start in range(0, len(paragraphs), batch_size):
    batch = paragraphs[start:start + batch_size]
    # padding makes the batch rectangular; truncation keeps an over-long
    # paragraph within the tokenizer's maximum input length.
    model_inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    gen_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=target_lang_id,
        max_length=model.config.max_position_embeddings,
    )
    translated_paragraphs.extend(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
  return ["\n".join(translated_paragraphs)]

In [None]:
#function to detect language
def detect_language(text):
    """Return the language code that ``langdetect.detect`` reports for ``text``.

    Args:
        text: The text whose language should be identified.

    Returns:
        The detected language code, or ``None`` when ``text`` is not a
        string, is empty or whitespace-only, or when langdetect raises
        ``LangDetectException``.
    """
    # Missing or blank input cannot be classified; answer directly so that a
    # non-string argument does not fail with an error that the except clause
    # below does not catch.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        # Fixed seed so that repeated runs on the same text agree.
        DetectorFactory.seed = 0
        return detect(text)
    except lang_detect_exception.LangDetectException:
        return None
# Map detected languages (langdetect codes) to NLLB-200 / FLORES-200 language codes.
# The keys are also the codes main() accepts as the target-language selection.
# Where FLORES-200 lists a specific variety instead of the macrolanguage, that
# variety's code is used (e.g. "az" -> azj_Latn, "lv" -> lvs_Latn).

lang_code_map = {
    "af": "afr_Latn",
    "am": "amh_Ethi",
    "ar": "arb_Arab",
    "as": "asm_Beng",
    "az": "azj_Latn",
    "be": "bel_Cyrl",
    "bg": "bul_Cyrl",
    "bn": "ben_Beng",
    "bs": "bos_Latn",
    "ca": "cat_Latn",
    "ceb": "ceb_Latn",
    "cs": "ces_Latn",
    "cy": "cym_Latn",
    "da": "dan_Latn",
    "de": "deu_Latn",
    "el": "ell_Grek",
    "en": "eng_Latn",
    "es": "spa_Latn",
    "et": "est_Latn",
    "fa": "pes_Arab",
    "fi": "fin_Latn",
    "fr": "fra_Latn",
    "gu": "guj_Gujr",
    "ha": "hau_Latn",
    "he": "heb_Hebr",
    "hi": "hin_Deva",
    "hr": "hrv_Latn",
    "hu": "hun_Latn",
    "hy": "hye_Armn",
    "id": "ind_Latn",
    "is": "isl_Latn",
    "it": "ita_Latn",
    "ja": "jpn_Jpan",
    "jv": "jav_Latn",
    "ka": "kat_Geor",
    "kk": "kaz_Cyrl",
    "km": "khm_Khmr",
    "kn": "kan_Knda",
    "ko": "kor_Hang",
    "lo": "lao_Laoo",
    "lt": "lit_Latn",
    "lv": "lvs_Latn",
    "mg": "plt_Latn",
    "mk": "mkd_Cyrl",
    "ml": "mal_Mlym",
    "mn": "khk_Cyrl",
    "mr": "mar_Deva",
    "ms": "zsm_Latn",
    "my": "mya_Mymr",
    "ne": "npi_Deva",
    "nl": "nld_Latn",
    "no": "nob_Latn",
    "or": "ory_Orya",
    "pa": "pan_Guru",
    "pl": "pol_Latn",
    "ps": "pbt_Arab",
    "pt": "por_Latn",
    "ro": "ron_Latn",
    "ru": "rus_Cyrl",
    "si": "sin_Sinh",
    "sk": "slk_Latn",
    "sl": "slv_Latn",
    "so": "som_Latn",
    "sq": "als_Latn",
    "sr": "srp_Cyrl",
    "su": "sun_Latn",
    "sv": "swe_Latn",
    "sw": "swh_Latn",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "tg": "tgk_Cyrl",
    "th": "tha_Thai",
    "ti": "tir_Ethi",
    "tl": "tgl_Latn",
    "tr": "tur_Latn",
    "uk": "ukr_Cyrl",
    "ur": "urd_Arab",
    "uz": "uzn_Latn",
    "vi": "vie_Latn",
    "xh": "xho_Latn",
    "yo": "yor_Latn",
    "zh-cn": "zho_Hans",
    "zh-tw": "zho_Hant",
    "zu": "zul_Latn",
}

In [None]:
#main function:
def main():
  """Interactively extract, language-detect and translate the text of a PDF.

  Prompts for a PDF path, prints the extracted text, reports the detected
  language, lists the selectable target languages, prompts for one of them
  and prints the translation. Every failure path prints a message and
  returns without raising.
  """
  # strip() so a stray space or newline around the typed path is not
  # treated as part of the file name.
  pdf_path = input("Enter the path to the pdf: ").strip()

  # isfile rather than exists: a directory path is rejected here instead of
  # being handed to the PDF reader.
  if not os.path.isfile(pdf_path):
    print(f"Error: the file at {pdf_path} does not exist")
    return

  #text extraction
  print("Extracting text from the pdf")
  text = extract_text_from_pdf(pdf_path)
  print("Text extraction completed!")
  print(text)

  # Without any extracted text there is nothing to detect or translate.
  if not text or not text.strip():
    print("Error: no text could be extracted from the pdf")
    return

  #detecting source language
  print("\nDetecting source language")

  detected_lang = detect_language(text)
  if detected_lang:
    source_lang_code = lang_code_map.get(detected_lang)
    if not source_lang_code:
      print(f"Detected language: {detected_lang}, but no corresponding NLLB language code found")
      return
    print(f"Detected language: {detected_lang} ({source_lang_code})")
  else:
    # A failed detection is only reported; the run continues because the
    # source code is not used by the translation call below.
    # NOTE(review): the detected language is never passed to the translator --
    # confirm whether it should be.
    print("Language detection failed")


  #select the target language
  print("\nAvailable target languages:")
  for lang, code in lang_code_map.items():
    print(f"- {lang} ({code})")

  # Normalise the answer so that " FR " is accepted like "fr".
  target_lang = input("\nSelect the target language (language code): ").strip().lower()

  if target_lang not in lang_code_map:
    print("Error: Invalid language selection")
    return

  #translate text
  print("\nTranslating text....")
  translated_text = translate_text_nllb(text , target_lang)
  print("Translation completed!")
  # The translator returns a list of strings; print the text itself rather
  # than the list's repr.
  if isinstance(translated_text, (list, tuple)):
    translated_text = "\n".join(translated_text)
  print("Translated_text:  " ,translated_text)

if __name__ == "__main__":
  main()

Enter the path to the pdf/content/sodapdf-converted.pdf
Extracting text from the pdf
Text extraction completed!
Sure! Here's a brief conversation between two people:  
  
---  
  
**Alex:** Hey Taylor, how was your weekend?  
  
**Taylor:** Hi Alex! It was great, thanks for asking. I went hiking on Saturday and just relaxed at home on 
Sunday. How about you?  
  
**Alex:** That sounds awesome! I spent most of my weekend catching up on some work and watched a 
few movies.   
  
**Taylor:** Nice, any recommendations for the movies?  
  
**Alex:** Definitely! If you haven't seen *Inception*, it's a must-watch. And I really enjoyed *The Grand 
Budapest Hotel* too.  
  
**Taylor:** I'll check those out. Thanks for the suggestions!   
  
**Alex:** No problem. Let me know what you think once you've watched them!  
  
**Taylor:** Will do. See you around!  
  
**Alex:** See you!  
  
---  
  
Feel free to adjust the context or add any specific details if needed!

Detecting source language
Detec

In [None]:
from transformers import AutoTokenizer, M2M100ForConditionalGeneration

# Stand-alone check of the M2M100 checkpoint on a single sentence.
checkpoint = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Encode the sample sentence as PyTorch tensors.
text_to_translate = "Life is like a box of chocolates"
model_inputs = tokenizer(text_to_translate, return_tensors="pt")

# Generate with the French language id forced as the first token, then decode and print.
french_id = tokenizer.get_lang_id("fr")
gen_tokens = model.generate(forced_bos_token_id=french_id, **model_inputs)
french_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
print(french_text)

['La vie est comme une boîte de chocolat']


In [None]:
 ['निश्चित रूप से! यहाँ दो लोगों के बीच एक संक्षिप्त बातचीत है: ** एलेक्स:** हे टेलर, आपका सप्ताहांत कैसा था? ** टेलर:** हाय एलेक्स! यह बहुत अच्छा था, पूछने के लिए धन्यवाद. मैं शनिवार को घूम रहा था और रविवार को घर पर बस आराम कर रहा था. आप के बारे में क्या? ** एलेक्स:** यह अद्भुत लगता है. मैंने अपने सप्ताहांत का अधिकांश हिस्सा कुछ काम पर पकड़ लिया और कुछ फिल्मों को देखा. ** टेलर:** अच्छा, फिल्मों के लिए कोई सिफारिशें? ** एलेक्स:** निश्चित रूप से!']

['निश्चित रूप से! यहाँ दो लोगों के बीच एक संक्षिप्त बातचीत है: ** एलेक्स:** हे टेलर, आपका सप्ताहांत कैसा था? ** टेलर:** हाय एलेक्स! यह बहुत अच्छा था, पूछने के लिए धन्यवाद. मैं शनिवार को घूम रहा था और रविवार को घर पर बस आराम कर रहा था. आप के बारे में क्या? ** एलेक्स:** यह अद्भुत लगता है. मैंने अपने सप्ताहांत का अधिकांश हिस्सा कुछ काम पर पकड़ लिया और कुछ फिल्मों को देखा. ** टेलर:** अच्छा, फिल्मों के लिए कोई सिफारिशें? ** एलेक्स:** निश्चित रूप से!']