<a href="https://colab.research.google.com/github/Tanaya2012/-Denoising-Autoencoder/blob/master/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymupdf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymupdf
  Downloading PyMuPDF-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.22.3


In [3]:
import fitz
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

import string
import spacy
import re

In [4]:
# Input PDF documents (all located under /content/drive/MyDrive).
pdf_paths = [
    '/content/drive/MyDrive/mm6411.pdf',
    '/content/drive/MyDrive/mm6549a5.pdf',
    '/content/drive/MyDrive/mm6832a3-H.pdf',
    '/content/drive/MyDrive/rr6305.pdf',
]

In [5]:
def fonts(paths):
    """Find the most frequently used (size, font) pair across PDF documents.

    Every text span of every page of every document in ``paths`` is counted
    once under the key ``(span size, span font name)``.

    :param paths: paths of the PDF documents to scan
    :type paths: list
    :rtype: tuple
    :return: the ``(size, font)`` pair with the highest span count; when
        several pairs share the highest count, the one encountered first wins
    :raises ValueError: if no text span is found in any of the documents
    """
    font_counts = {}

    # Iterate over the argument, not the notebook-level ``pdf_paths`` global,
    # so the function analyses exactly the documents the caller passed in.
    for path in paths:
        with fitz.open(path) as doc:
            for page in doc:
                for b in page.get_text("dict")["blocks"]:
                    if b['type'] != 0:  # only type-0 blocks contain text
                        continue
                    for l in b["lines"]:
                        for s in l["spans"]:
                            identifier = (s['size'], s['font'])
                            font_counts[identifier] = font_counts.get(identifier, 0) + 1

    if not font_counts:
        raise ValueError("no text spans found in the given PDF documents")

    return max(font_counts, key=font_counts.get)

In [6]:
def get_para(paths, p_style):
    """Collect the text of every block whose dominant style is ``p_style``.

    For each text block, the non-blank spans are counted per ``(font, size)``
    pair. If the most frequent pair equals the given body style, the text of
    all spans in the block (including spans in other styles) is concatenated.

    :param paths: paths of the PDF documents to read
    :type paths: list
    :param p_style: body-text style as ``(size, font)``, i.e. the value
        returned by ``fonts``
    :type p_style: tuple
    :rtype: list
    :return: one string per text block, in the order the blocks are yielded;
        blocks whose dominant style differs from ``p_style``, or that contain
        only blank spans, contribute an empty string
    """
    # Spans are keyed as (font, size) below, whereas p_style is (size, font).
    body_style = (p_style[1], p_style[0])
    header_para = []

    # Iterate over the argument, not the notebook-level ``pdf_paths`` global.
    for path in paths:
        with fitz.open(path) as doc:
            for page in doc:
                for b in page.get_text("dict")["blocks"]:
                    if b['type'] != 0:  # only type-0 blocks contain text
                        continue

                    # Count non-blank spans per (font, size) within this block.
                    style_dict = {}
                    for l in b["lines"]:
                        for s in l["spans"]:
                            if s['text'].strip():
                                key = (s['font'], s['size'])
                                style_dict[key] = style_dict.get(key, 0) + 1

                    block_string = ""
                    # A block made only of blank spans has no dominant style;
                    # calling max() on the empty mapping would raise.
                    if style_dict:
                        most_used_style = max(zip(style_dict.values(), style_dict.keys()))[1]
                        if most_used_style == body_style:
                            block_string = "".join(
                                s['text'] for l in b["lines"] for s in l["spans"]
                            )
                    header_para.append(block_string)

    return header_para

In [7]:
# Detect the dominant (size, font) style, then keep only blocks in that style.
p_style = fonts(pdf_paths)

# Drop empty block strings, join the rest with single spaces and strip URLs.
pdf_text = re.sub(
    r'http\S+',
    '',
    ' '.join(filter(None, get_para(pdf_paths, p_style))),
    flags=re.MULTILINE,
)

In [8]:
# Split the text into sentences at whitespace that follows '.' or '?', unless
# the preceding characters look like an abbreviation (e.g. "e.g." or "Dr.").
# The pattern is a raw string so that \w, \. and \s reach the regex engine
# without relying on Python's deprecated handling of unknown string escapes.
tokens = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', pdf_text)

# Strip every ASCII punctuation character; the translation table is built once
# instead of once per sentence.
punctuations = string.punctuation
_punctuation_table = str.maketrans('', '', punctuations)
tokens = [test_str.translate(_punctuation_table) for test_str in tokens]

In [9]:
# Remove the last space-separated word of a sentence when it consists only of
# digits (presumably footnote/reference numbers - verify against the PDFs).
# `tokens` is updated in place.
for idx, token in enumerate(tokens):
  *leading_words, final_word = token.split(" ")
  if final_word.isdigit():
    tokens[idx] = ' '.join(leading_words)

In [10]:
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
  """Normalise one sentence to lowercase letters, digits and whitespace.

  The sentence is lowercased and stripped of surrounding whitespace, then
  every character other than a-z, 0-9 or whitespace is deleted.

  :param sentence: text to clean
  :type sentence: str
  :param stopwords: when true, the result is additionally passed through
      gensim's ``remove_stopwords``
  :type stopwords: bool
  :rtype: str
  :return: the cleaned sentence
  """
  normalized = re.sub(r'[^a-z0-9\s]', '', sentence.lower().strip())
  return remove_stopwords(normalized) if stopwords else normalized

def get_cleaned_sentences(tokens, stopwords=False):
  """Apply ``clean_sentence`` to every sentence and return the results.

  :param tokens: iterable of sentences to clean
  :param stopwords: forwarded unchanged to ``clean_sentence``
  :type stopwords: bool
  :rtype: list
  :return: the cleaned sentences, in input order
  """
  return [clean_sentence(row, stopwords) for row in tokens]

In [11]:
# Two cleaned variants of the same sentences: one keeping stopwords, one
# with them removed.
cleaned_sentences_with_stopwords = get_cleaned_sentences(tokens, stopwords=False)
cleaned_sentences = get_cleaned_sentences(tokens, stopwords=True)

# The single-string corpus is built from the variant that keeps stopwords.
pdf_text = " ".join(cleaned_sentences_with_stopwords)

In [13]:
# Save the cleaned corpus. The context manager closes the file even if the
# write raises.
# NOTE(review): mode "a" appends, so re-running this cell duplicates the text
# in pdf_text.txt, whereas cleaned_sentences.txt is opened with 'w'. Confirm
# that appending is intended.
with open("/content/drive/MyDrive/pdf_text.txt", "a") as file:
    file.write(pdf_text)

In [14]:
with open(r"/content/drive/MyDrive/cleaned_sentences.txt", 'w') as fp:
    # One cleaned sentence (stopwords kept) per line.
    fp.writelines("%s\n" % item for item in cleaned_sentences_with_stopwords)
    print('Done')

Done
