In [18]:
!pip install PyPDF2

import re
import random
from collections import defaultdict
import PyPDF2



In [19]:
def cleaning(file):
  corpus = ""
  try:
    with open(file, "rb") as f:
      pdf_reader = PyPDF2.PdfReader(f)
      for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]

        page_text = page.extract_text()
        if page_text:
          corpus += " " + page_text
  except Exception as e:
    print(f"Error reading PDF: {e}")
    return ""


  text = corpus
  start = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK", text)
  end = re.search(r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK", text)

  if start and end:
    text = text[start.end():end.start()]


  text = text.lower()
  text = re.sub(r"[^a-z\s]", " ", text)
  text = re.sub(r"\s+", " ", text)

  return text.strip()


In [20]:
def model(tokens,n=5):
  m = defaultdict(list)

  for i in range(len(tokens)-n):
    context = tuple(tokens[i:i+n-1])
    target = tokens[i+n-1]
    m[context].append(target)

  return m

In [21]:
def generation(model,seedtext,n=5,length=30):
  seedtokens = seedtext.lower().split()
  output = seedtokens[:]

  for i in range(length):
    context = tuple(output[-(n-1):])
    if context not in model:
      break

    next = random.choice(model[context])
    output.append(next)

  return " ".join(output)

In [22]:
if __name__ == "__main__":
  file = "/content/sherlock_holmes.pdf"
  n=5
  text = cleaning(file)

  if not text:
    print("No text extracted")
  else:
    tokens = text.split()
    print(f"Total tokens: {len(tokens)}")
    model = model(tokens,n)
    print(f"Model: {model}")

    inputdata = [ "the day was very","it was evident that","i could not help"]

    for i,seed in enumerate(inputdata,1):
      print(f"\nSample {i}")
      print("Input:",seed)
      op = generation(model,seed,n=n,length=30)
      print("Output:",op)

Total tokens: 107439

Sample 1
Input: the day was very
Output: the day was very

Sample 2
Input: it was evident that
Output: it was evident that he saw clearly not only what had happened but what was about to happen while to me the whole business was still confused and grotesque as i drove home to

Sample 3
Input: i could not help
Output: i could not help remarking its beauty during our short interview but i am afraid that my colleague has been a little quick in forming his conclusions he said but he is right oh
