# Data Processing and Chunking

In this notebook we are going to:


1.   Read the document using pypdf
2.   Use pypdf to remove the headers and footers containing unnecessary information
3.   Remove repetitive and non-informative sentences in the page texts using str.replace()
4.   Chunk by sentences using NLTK
5.   Remove non-paragraph parts using a rule - every sentence must contain at least 2 stop words - done using NLTK
6.   Create two CSV files - one containing the text of each whole page and one chunking the text by context; both feature unique keys that can be used to retrieve the original source.



In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.3.1


In [None]:
# Mount Google Drive so that files stored there are reachable from the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')
# Drive folder that this notebook joins with file names to read the input PDF
# and to write the generated CSV files.
file_location = '/content/drive/My Drive/Colab Notebooks/Interview_tasks/JSNOW/Data'

In [None]:
import nltk
# Sentence-tokenizer data: older NLTK releases load 'punkt', newer releases
# load 'punkt_tab' instead, so both are fetched to keep sent_tokenize working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words held as a set for cheap intersection checks.
stopWords = set(stopwords.words('english'))

In [None]:
import os
from nltk.tokenize import sent_tokenize
import pandas as pd


def remove_non_ascii(string):
    """Return ``string`` with every character whose code point is 128 or above dropped."""
    ascii_chars = []
    for ch in string:
        if ord(ch) >= 128:
            # Outside the 7-bit ASCII range: skip it.
            continue
        ascii_chars.append(ch)
    return "".join(ascii_chars)


def get_splits(text, n_sent=3, max_tokens=250):
    """Group sentences into overlapping chunks bounded by a token budget.

    Sentences are accumulated in order until adding the next one would push
    the whitespace-token count of the chunk above ``max_tokens``. The chunk is
    then closed and the next chunk is seeded with the last ``n_sent``
    sentences of the closed chunk, so neighbouring chunks share context.

    Args:
        text: Iterable of sentence strings, in document order.
        n_sent: Number of trailing sentences carried over into the next
            chunk; ``0`` disables the overlap.
        max_tokens: Maximum number of whitespace-separated tokens per chunk.

    Returns:
        A list of chunks, each a non-empty list of sentences. Empty input
        yields an empty list. A chunk exceeds ``max_tokens`` only when a
        single sentence is longer than the budget on its own.
    """
    total = []
    subset = []
    tokens = 0
    for sent in text:
        sent_tokens = len(sent.split())
        if tokens + sent_tokens <= max_tokens:
            subset.append(sent)
            tokens += sent_tokens
            continue

        # Budget exceeded: close the current chunk. It is empty only when the
        # very first sentence is already over budget, and an empty chunk must
        # never be emitted.
        if subset:
            total.append(subset)

        # Seed the next chunk with the overlap. Slicing builds a new list, so
        # the chunk that was just stored is not mutated below.
        subset = [] if n_sent == 0 else subset[-n_sent:]
        tokens = sum(len(sub.split()) for sub in subset)

        # Drop the oldest overlap sentences until the new sentence fits.
        # Without this, a chunk made of a few long sentences is carried over
        # in full and the next chunk both repeats it and breaks the budget.
        while subset and tokens + sent_tokens > max_tokens:
            tokens -= len(subset.pop(0).split())

        subset.append(sent)
        tokens += sent_tokens

    # Flush the trailing chunk, if any sentence was collected.
    if subset:
        total.append(subset)
    return total

In [None]:
from pypdf import PdfReader
import os

# Open the guideline PDF stored in the Drive data folder.
reader = PdfReader(os.path.join(file_location, 'test_guideline.pdf'))
# NOTE(review): single sample page (index 70). The page-processing loop in this
# cell rebinds `page` before it is read, so this looks like leftover
# exploration - confirm before relying on it.
page = reader.pages[70]


def read_a_page(page, y_min=50, y_max=720, boilerplate=(
    "Downloaded from http://ahajournals.org by on March 16, 2024",
    "CLINICAL STATEMENTS AND GUIDELINES",
)):
  """Extract the body text of one PDF page without headers, footers and boilerplate.

  The page is read through the ``visitor_text`` hook of ``page.extract_text``.
  Only text fragments whose vertical position lies strictly between ``y_min``
  and ``y_max`` are kept, which drops the running header and footer bands.
  Every string in ``boilerplate`` is then removed from the joined text.

  Args:
    page: Object exposing ``extract_text(visitor_text=...)`` (a pypdf page).
    y_min: Lower bound (exclusive) of the vertical band that is kept.
    y_max: Upper bound (exclusive) of the vertical band that is kept.
    boilerplate: Strings deleted verbatim from the extracted text. The
      defaults are specific to the guideline PDF used in this notebook.

  Returns:
    The cleaned page text as a single string.
  """
  parts = []

  def visitor_body(text, cm, tm, font_dict, font_size):
    # The sixth entry of the text matrix is used as the fragment's vertical
    # position on the page.
    y = tm[5]
    if y_min < y < y_max:
      parts.append(text)

  page.extract_text(visitor_text=visitor_body)
  text_body = "".join(parts)
  for phrase in boilerplate:
    text_body = text_body.replace(phrase, "")
  return text_body

def get_page_splits(page_text):
  """Split cleaned page text into sentence chunks, dropping non-prose fragments.

  The text is tokenised into sentences with ``sent_tokenize``. A sentence is
  kept only when it contains at least two distinct stop words from
  ``stopWords``; this filters out table cells, headings and similar
  fragments. Line breaks inside kept sentences are replaced by spaces, and
  the sentences are then grouped with ``get_splits``.

  Args:
    page_text: Text of one page as a single string.

  Returns:
    The list of sentence chunks produced by ``get_splits``.
  """
  text_sent = []
  for sent in sent_tokenize(page_text):
    # Compare case-insensitively: capitalised words such as a sentence-initial
    # "The" would otherwise never match the lower-case stop-word list.
    words = {word.lower() for word in sent.split()}
    if len(words.intersection(stopWords)) > 1:
      # Replace both Windows-style and bare line breaks so that a chunk reads
      # as one continuous line of text.
      text_sent.append(str(sent).replace("\r\n", " ").replace("\n", " "))
  text_sentence_splits = get_splits(text_sent)
  return text_sentence_splits

def create_page_index(text_sentence_splits, page_num):
  """Build one record per sentence chunk of a page.

  Args:
    text_sentence_splits: Sequence of chunks, each a sequence of sentences.
    page_num: Page identifier stored in every record.

  Returns:
    A list of dicts with the keys ``context_uid`` ("<page_num>_<chunk index>"),
    ``page``, ``context_index`` and ``context`` (sentences joined by a space).
  """
  return [
      {
          "context_uid": f"{page_num!s}_{position!s}",
          "page": page_num,
          "context_index": position,
          "context": " ".join(sentences),
      }
      for position, sentences in enumerate(text_sentence_splits)
  ]

# Row accumulators: one dict per sentence chunk and one dict per cleaned page.
df_list = []
df_page_clean = []
for index, page in enumerate(reader.pages):
  # Only zero-based page indices 6 through 108 (pages 7-109) are processed.
  if not 6 <= index <= 108:
    continue
  page_text = read_a_page(page)
  page_splits = get_page_splits(page_text)
  # Records carry the 1-based page number.
  page_data = create_page_index(page_splits, index+1)
  df_list += page_data
  df_page_clean.append(dict(page_text=page_text, page=index+1))

# Page-level table and chunk-level table; preview the latter.
df_page_info = pd.DataFrame(df_page_clean)
df_final = pd.DataFrame(df_list)
df_final.head()

In [None]:
# Write the page-level table (columns: page_text, page) to the Drive data folder.
df_page_info.to_csv(os.path.join(file_location, "full_page_info.csv"), index=False)

In [None]:
# Write the chunk-level table (columns: context_uid, page, context_index, context) to the Drive data folder.
df_final.to_csv(os.path.join(file_location, "data_cleaned.csv"), index=False)