In [2]:
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables through python-dotenv before reading any of them.
load_dotenv()

# Value of the ENV_NAME environment variable (None when it is not set).
ENV_NAM = os.environ.get("ENV_NAME")

In [3]:
# method to get the number of tokens of a text string

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in ``string`` under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up,
            e.g. ``"cl100k_base"``.

    Returns:
        The length of the token sequence produced for ``string``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # disallowed_special=() matches tiktoken_len: document text that happens to
    # contain a special-token marker is tokenized as ordinary text instead of
    # raising, so counting never fails on text the splitter already accepted.
    return len(encoding.encode(string, disallowed_special=()))

In [4]:
# method to get the token length with the encoding

# Look up the cl100k_base encoding. NOTE: despite its name, `tokenizer_name` is bound
# to the object returned by get_encoding, not to a string.
tokenizer_name = tiktoken.get_encoding("cl100k_base")
# Fetch the encoding again through the first object's `.name` attribute; this is the
# tokenizer that tiktoken_len uses.
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# Token-count helper; it is handed to the text splitter as its length function.
def tiktoken_len(text):
    """Count the tokens that the module-level ``tokenizer`` yields for ``text``.

    The encode call passes ``disallowed_special=()``, i.e. the special-token
    check is switched off for the text being measured.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each document into chunks.
# Lengths are measured with tiktoken_len, so chunk_size and chunk_overlap are
# token counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter(
    # Candidate split points, listed from paragraph breaks down to single characters.
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_overlap=20,
    chunk_size=8000,
)

In [None]:
# reads files from the pdf folder and creates a txt file from each pdf and puts it into the txt directory

import os
from os import listdir
from os.path import isfile, join

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(path, txt_dir):
    """Extract the text of one PDF and save it as a UTF-8 ``.txt`` file.

    The output file takes the PDF's base name with the extension replaced by
    ``.txt`` and is written into ``txt_dir``, which is created when missing.

    Args:
        path: Path of the PDF file to read.
        txt_dir: Directory that receives the generated text file.

    Returns:
        None. The extracted text is written to disk.
    """
    # splitext handles any extension length, unlike slicing off four characters.
    fname = os.path.splitext(os.path.basename(path))[0]
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()

    # Context managers / finally guarantee the buffer, the converter and the
    # PDF handle are released even when a page fails to parse.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0: no page subset or page cap is requested.
                for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        text = retstr.getvalue()

    # Create the destination folder on demand; an empty txt_dir means the
    # current directory, which needs no creation.
    if txt_dir:
        os.makedirs(txt_dir, exist_ok=True)
    txt_path = os.path.join(txt_dir, fname + ".txt")
    with open(txt_path, "w", encoding=codec) as txt_file:
        txt_file.write(text)

# Directory holding the source PDF files.
srcDir = "./pdf/"
# Directory that receives the generated .txt files.
txtDir = "./txt/"
# Scan the source directory and convert every PDF found there.
for file in listdir(srcDir):
    file_path = os.path.join(srcDir, file)
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped,
    # and ignore entries that are not regular files (e.g. a folder named "x.pdf").
    if file.lower().endswith(".pdf") and isfile(file_path):
        convert_pdf_to_txt(file_path, txtDir)


In [None]:
#cleanup the txt files and remove special characters and write file as utf-8 encoded

import os
import codecs

# Path to the directory containing the text files
directory = "./txt/"

# Substrings deleted from every file, applied in this order (mojibake fragments,
# bullets, typographic quotes and similar symbols).
# NOTE(review): the original chain also contained replace('', ''), which is a no-op
# as written; if a character was lost from that literal it belongs in this tuple -
# TODO confirm against the raw PDF extractions.
UNWANTED_SUBSTRINGS = ('â€', 'Â', '©', '*', '•', '“', '”', "♦")

for filename in os.listdir(directory):
    # Only plain-text extractions are cleaned.
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)

    # Read the whole file first and close it before it is rewritten.
    with open(file_path, 'r', encoding="utf-8") as f:
        contents = f.read()

    # Flatten line breaks into spaces, then strip the unwanted substrings.
    contents = contents.replace('\n', ' ')
    for unwanted in UNWANTED_SUBSTRINGS:
        contents = contents.replace(unwanted, '')

    # Overwrite the file in place with the cleaned, UTF-8 encoded text.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(contents)

In [7]:
# Read every .txt file, split its contents with text_splitter and collect one record
# per chunk in a dataframe. The text needs to be split because of the embeddings.
# Columns created here: title, tokens, content, source (a summary column is not
# filled in by this cell).

path = "./txt/"
# Fixing the column list keeps the dataframe schema stable even when no .txt file exists.
CHUNK_COLUMNS = ["title", "tokens", "content", "source"]
txt = []

for txt_file in os.listdir(path):
    if not txt_file.endswith(".txt"):
        continue

    # Read the file and release the handle before the splitting work starts.
    with open(os.path.join(path, txt_file), "r", encoding="UTF-8") as f:
        text = f.read()

    for doc in text_splitter.create_documents([text]):
        chunk = {
            "title": os.path.splitext(txt_file)[0],  # file name without the .txt extension
            "tokens": num_tokens_from_string(doc.page_content, "cl100k_base"),
            "content": doc.page_content,
            "source": txt_file,
        }
        txt.append(chunk)

df = pd.DataFrame(txt, columns=CHUNK_COLUMNS)

In [8]:
# Display the chunk table (title, tokens, content and source for each chunk).
df

Unnamed: 0,title,tokens,content,source
0,Merck,7649,"9-398-033 R E V : O C T O B E R 1 7 , 2 ...",Merck.txt
1,Merck,7510,"went through a similar 360-degree process, ...",Merck.txt
2,Merck,263,Company document. 16 This document is auth...,Merck.txt
