#### Chunking with an on-device model to decide, for each processed document, which version of it is more coherent and human-readable

In [1]:
## 1. Use the local 3B model to determine which version is more human-readable, and produce a CSV for result gathering and score calculation
## 2. Use both GPT3 and the 3B model to start chunking the selected version
# If there is only a normal-read version, use that


In [36]:
# Single root for every data folder used below; edit this one line to
# relocate the dataset instead of touching each path.
DATA_ROOT = "/home/yxuhuang/adapter/Data"

# "Extracted" folders: searched recursively for *.txt files when scoring.
pdf_extracted_folder = f"{DATA_ROOT}/Extracted/Pdf"
receipts_extracted_folder = f"{DATA_ROOT}/Extracted/Receipts"
reports_extracted_folder = f"{DATA_ROOT}/Extracted/Reports"
with_format_extracted_folder = f"{DATA_ROOT}/Extracted/WithFormat"

# "Input" folders: presumably the original documents each extraction came
# from -- TODO confirm; here they are only paired with the folders above.
pdf_input_folder = f"{DATA_ROOT}/Input/Pdf"
receipts_input_folder = f"{DATA_ROOT}/Input/Receipts"
reports_input_folder = f"{DATA_ROOT}/Input/Reports"
with_format_input_folder = f"{DATA_ROOT}/Input/WithFormat"

# (extracted, input) pairs. The list index is the category number, so the
# order must stay: 0 = Receipts, 1 = Pdf, 2 = WithFormat, 3 = Reports.
folder_matches = [
    (receipts_extracted_folder, receipts_input_folder),
    (pdf_extracted_folder, pdf_input_folder),
    (with_format_extracted_folder, with_format_input_folder),
    (reports_extracted_folder, reports_input_folder),
]



In [30]:
import ollama
from pathlib import Path
import requests
import os

class Ollama_asker:
    """Helper that chunks text files and sends the chunks to a model via ``ollama.chat``.

    Attributes:
        url: Stored as given by the caller; no method of this class reads it.
        model: Model name passed to ``ollama.chat`` on every request.
    """

    def __init__(self, url, model):
        self.url = url
        self.model = model

    def chunk_text_with_overlap(self, txt_file, chunk_size=500, overlap=100):
        """Read a UTF-8 text file and split it into overlapping character chunks.

        Consecutive chunks start ``chunk_size - overlap`` characters apart, so
        each chunk repeats the last ``overlap`` characters of the previous one.
        Chunking stops as soon as a chunk reaches the end of the text, so no
        trailing chunk is produced that lies entirely inside the previous one.

        Args:
            txt_file: Path of the text file to read.
            chunk_size: Maximum number of characters per chunk; must be > 0.
            overlap: Characters shared by neighbouring chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Returns:
            List of chunk strings in file order; empty for an empty file.

        Raises:
            FileNotFoundError: If ``txt_file`` does not exist.
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        if not os.path.exists(txt_file):
            raise FileNotFoundError(f"Error: The file '{txt_file}' does not exist.")
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # overlap == chunk_size would never advance (endless loop); a negative
        # overlap would silently skip text between chunks.
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
            )

        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()

        step = chunk_size - overlap
        chunks = []
        start = 0
        while start < len(text):
            end_ptr = min(start + chunk_size, len(text))
            chunks.append(text[start:end_ptr])
            if end_ptr == len(text):
                # The text is fully covered; another step would only yield a
                # fragment already contained in this chunk.
                break
            start += step  # Move forward while keeping overlap
        return chunks

    def rate(self, chunk):
        """Ask the model to rate ``chunk`` from 1 to 10 and return its raw reply text."""
        result = self.ask_ollama_from_txts(chunk, prompt ="Without Comments, reply only a number: rate the readability and usefullness of the text from 1(Bad) to 10(Good):")
        return result

    def chunk_summarize(self, chunk):
        """Ask the model for a short summary of ``chunk`` and return its raw reply text."""
        result = self.ask_ollama_from_txts(chunk, prompt ="Without Comments, summarize in original languange with useful information, below 100 or no words :")
        return result

    def ask_ollama_from_txts(self, chunk, prompt=""):
        """Send ``prompt`` followed by ``chunk`` to ``self.model`` and return the reply text.

        The user message is ``"{prompt}:\\n{chunk}"``. The prompts used by
        ``rate`` and ``chunk_summarize`` already end with a colon, so the
        message contains a doubled colon; the strings are left as they are so
        the model input stays unchanged.
        """
        response = ollama.chat(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful factual text-edit assitant."},
                {"role": "user", "content": f"{prompt}:\n{chunk}"}
            ]
        )
        return response["message"]["content"]
           




#### For each extracted file, get its average chunk scores, original chunks, and chunk summaries into one row

In [31]:
import numpy as np
import pandas as pd
import csv 
import json
import os
# Local endpoint address; defined here but not handed to the asker below.
url = "http://127.0.0.1:7897"

# Marker inserted between per-chunk entries when they are joined into one string.
separator = "\n}<->{\n"

# Folder that receives the per-category scores_<cat>.json files.
score_base = "/home/yxuhuang/adapter/Data/Input/"

# Accumulator kept from the earlier DataFrame/CSV export (now commented out).
data = []

# Shared client used for every rating and summary request.
asker = Ollama_asker(url="", model="llama3.2:3b")

def score_category(cat: int):
    """Rate and summarize every extracted ``.txt`` file of one category.

    ``cat`` indexes ``folder_matches``. Each ``*.txt`` file found recursively
    under the category's extracted folder is split into chunks, and every
    chunk is rated and summarized by ``asker``. The per-chunk results are
    joined with ``separator`` and stored in ``<score_base>scores_<cat>.json``
    as one JSON object::

        {"<path of txt file>": {"Scores": ..., "Chunks": ..., "Summaries": ...}, ...}

    The file is rewritten after each processed document, so an interrupted
    run (for example a KeyboardInterrupt) leaves valid JSON containing
    everything finished so far. Files already present in an existing valid
    result file are skipped; delete the result file to score them again.

    Args:
        cat: Index into ``folder_matches`` selecting the category.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    json_path = Path(score_base + f"scores_{cat}.json")
    extracted_folder, input_folder = folder_matches[cat]
    print(f"Processing {extracted_folder} to {input_folder}")
    extracted_folder = Path(extracted_folder)
    input_folder = Path(input_folder)

    results = _load_scores(json_path)
    # Write once up front so a result file exists even if nothing is processed.
    _save_scores(json_path, results)

    try:
        for txt_file in extracted_folder.rglob("*.txt"):  # Get all .txt files in the folder
            key = str(txt_file)
            if key in results:
                print(f"Skipping already scored file: {txt_file}")
                continue

            print(f"checking file: {txt_file}")
            chunks = asker.chunk_text_with_overlap(txt_file=txt_file)

            # One model request per chunk for the rating, one for the summary.
            ratings = [asker.rate(chunk=c) for c in chunks]
            summaries = [asker.chunk_summarize(chunk=c) for c in chunks]

            results[key] = {
                "Scores": separator.join(ratings),
                "Chunks": separator.join(chunks),
                "Summaries": separator.join(summaries),
            }
            # Persist immediately: model calls are slow and runs get interrupted.
            _save_scores(json_path, results)
            print(f"Saved: {key}")
        print(f"Finished processing {extracted_folder} to {input_folder}")
    except Exception as e:
        # As before, an error stops this category without raising, but it is
        # now reported. Everything saved so far is already on disk.
        print(f"Stopped processing {extracted_folder}: {type(e).__name__}: {e}")


def _load_scores(json_path):
    """Return the dict stored at ``json_path``, or ``{}`` if there is none.

    A file that is not a valid JSON object is moved aside to
    ``<name>.bak`` (replacing any earlier backup) instead of being
    overwritten, and an empty dict is returned.
    """
    if not json_path.exists():
        return {}
    try:
        with json_path.open(mode="r", encoding="utf-8") as existing:
            loaded = json.load(existing)
    except json.JSONDecodeError:
        loaded = None
    if isinstance(loaded, dict):
        return loaded
    backup_path = json_path.with_name(json_path.name + ".bak")
    json_path.replace(backup_path)
    print(f"Existing {json_path} is not a valid JSON object; moved to {backup_path}")
    return {}


def _save_scores(json_path, results):
    """Write ``results`` to ``json_path`` as one JSON object via a temp file and rename."""
    tmp_path = json_path.with_name(json_path.name + ".tmp")
    with tmp_path.open(mode="w", encoding="utf-8") as tmp_file:
        json.dump(results, tmp_file, indent=4)
    # Renaming over the target means readers never see a half-written file.
    tmp_path.replace(json_path)



# # Convert the list of dictionaries into a DataFrame
# df = pd.DataFrame(data)
# df.to_csv(csv_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [25]:
# Score category 0 of folder_matches (the Receipts folders).
score_category(0)


Processing /home/yxuhuang/adapter/Data/Extracted/Receipts to /home/yxuhuang/adapter/Data/Input/Receipts
checking file: /home/yxuhuang/adapter/Data/Extracted/Receipts/2022/us/hyattbrewhouse_20221016_002_normal.txt


In [38]:
# Score category 1 of folder_matches (the Pdf folders).
score_category(1)

Processing /home/yxuhuang/adapter/Data/Extracted/Pdf to /home/yxuhuang/adapter/Data/Input/Pdf
checking file: /home/yxuhuang/adapter/Data/Extracted/Pdf/KMELIR3DJDUFB52NSPC42LSIU6OOD77U_ocr.txt


KeyboardInterrupt: 

In [39]:
# Score category 2 of folder_matches (the WithFormat folders).
score_category(2)

Processing /home/yxuhuang/adapter/Data/Extracted/WithFormat to /home/yxuhuang/adapter/Data/Input/WithFormat
checking file: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10380_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10380_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10523_ocr.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10523_ocr.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10469_ocr.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents Dataset/CompanyDocuments/invoices/invoice_10469_ocr.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/WithFormat/Company Documents 

KeyboardInterrupt: 

In [37]:
# Score category 3 of folder_matches (the Reports folders).
score_category(3)

Processing /home/yxuhuang/adapter/Data/Extracted/Reports to /home/yxuhuang/adapter/Data/Input/Reports
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2012.12877v2_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/Reports/2012.12877v2_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.12424v3_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.12424v3_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.03206v2_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.03206v2_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2104.01136v2_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/Reports/2104.01136v2_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.14899v2_normal.txt
Saved: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.14899v2_normal.txt
checking file: /home/yxuhuang/adapter/Data/Extracted/Reports/2103.17239v2_normal.txt
Sav

KeyboardInterrupt: 