In [90]:
from typing import List, Optional
def read_all_pages_into_lines(
    file_prefix: str,
    file_suffix: str,
    doc_dir: Optional[str] = "data",
    num_pages: int = 5,
    encoding: Optional[str] = None,
) -> List[List[str]]:
    """Read consecutively numbered page files into lists of raw lines.

    Files are expected at ``{doc_dir}/{file_prefix}{n}.{file_suffix}`` for
    ``n`` running from 1 to ``num_pages`` inclusive.

    Args:
        file_prefix: Part of the file name that precedes the page number.
        file_suffix: File extension, without the leading dot.
        doc_dir: Directory that holds the page files.
        num_pages: How many pages to read, starting at page 1. Defaults to 5,
            the count that used to be hard-coded.
        encoding: Text encoding handed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        One list per page, in page order, each holding that file's lines
        exactly as ``readlines`` returns them (line endings included).

    Raises:
        OSError: If any of the expected page files cannot be opened.
    """
    pages: List[List[str]] = []

    for page_no in range(1, num_pages + 1):
        path = f"{doc_dir}/{file_prefix}{page_no}.{file_suffix}"
        with open(path, "r", encoding=encoding) as f:
            pages.append(f.readlines())

    return pages

In [92]:
import torch
from torch.types import Tensor
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.ollama import OllamaEmbedding
from typing import Callable, Optional

def get_embedder(model_name: str, **kwargs) -> Callable[[str], Tensor]:
    """Build a text-embedding callable for a ``"<provider>/<model>"`` name.

    The text before the first ``/`` selects the backend (``"ollama"`` or
    ``"hf"``); everything after it is passed to that backend as ``model_name``.

    Args:
        model_name: Provider-prefixed model identifier, e.g. ``"hf/org/model"``.
        **kwargs: Forwarded to the backend constructor. Two keys are consumed
            here and never forwarded: ``ollama_config`` and
            ``huggingface_config``, dicts of constructor arguments that replace
            the built-in defaults for the respective backend.

    Returns:
        A function that takes a string and returns the backend's
        ``get_text_embedding`` result converted with ``torch.asarray``.

    Raises:
        KeyError: If the provider prefix is neither ``"ollama"`` nor ``"hf"``.
    """
    # Both overrides are popped unconditionally so that neither leaks into the
    # constructor call, whichever backend ends up being selected.
    ollama_settings = kwargs.pop("ollama_config", {
        "base_url": "http://192.168.0.124:11434",
    })
    hf_settings = kwargs.pop("huggingface_config", {
        "trust_remote_code": True,
        "cache_folder": "models/",
    })

    # provider prefix -> (embedding class, constructor settings)
    backends = {
        "ollama": (OllamaEmbedding, ollama_settings),
        "hf": (HuggingFaceEmbedding, hf_settings),
    }

    # Split on the first "/" only: model ids may contain further slashes.
    provider, _, model = model_name.partition("/")
    backend_cls, backend_settings = backends[provider]

    embedder: BaseEmbedding = backend_cls(model_name=model, **backend_settings, **kwargs)

    def embed(x1: str) -> Tensor:
        vector = embedder.get_text_embedding(x1)
        return torch.asarray(vector)

    return embed

def compute_similiarity(x1: Tensor, x2: Tensor, dim: Optional[int] = 0, eps: Optional[float] = 1e-4) -> Tensor:
    return torch.nn.CosineSimilarity(dim=dim, eps=eps).forward(x1, x2)

In [120]:
# Load the numbered page files ("page<N>.txt") from the reader's default directory.
lines = read_all_pages_into_lines(file_prefix="page", file_suffix="txt")


In [109]:
# The "hf/" prefix selects the HuggingFace backend; the rest is the model id.
embed = get_embedder(model_name="hf/ibm-granite/granite-embedding-278m-multilingual")

In [137]:
from collections import defaultdict
from time import sleep

def compute_embedding_matrix(
    matrix: Tensor,
    embeddings: List[Tensor],
    similarity_fn: Optional[Callable[[Tensor, Tensor], Tensor]] = None,
) -> Tensor:
    """Fill ``matrix`` in place with the pairwise similarities of ``embeddings``.

    Entry ``[i, j]`` receives the similarity between ``embeddings[i]`` and
    ``embeddings[j]``. The diagonal is set to 1.0 without calling the
    similarity function, and each off-diagonal pair is scored once and
    mirrored, so the result is symmetric.

    Args:
        matrix: Tensor to write into; it must be indexable as ``[i, j]`` for
            every pair of indices into ``embeddings``. Any existing contents
            in that region are overwritten.
        embeddings: Vectors to compare with one another.
        similarity_fn: Function scoring two embeddings. When ``None`` the
            module-level ``compute_similiarity`` is used.

    Returns:
        The same ``matrix`` object, after it has been filled.
    """
    if similarity_fn is None:
        # Looked up at call time so the shared helper remains the default.
        similarity_fn = compute_similiarity

    for i, first in enumerate(embeddings):
        matrix[i, i] = 1.0
        # Only the upper triangle is scored; the lower one is its mirror.
        for j in range(i + 1, len(embeddings)):
            score = similarity_fn(first, embeddings[j])
            matrix[i, j] = score
            matrix[j, i] = score

    return matrix


# Drop blank / whitespace-only lines from every page; page order is kept.
new_lines = [
    [line for line in page if line.strip()] for page in lines
]

# Quick sanity preview: the line at index 10 of the first and of the
# second-to-last page. Skipped when there are fewer than two pages or either
# page is too short, instead of failing with an IndexError.
if len(new_lines) >= 2 and len(new_lines[0]) > 10 and len(new_lines[-2]) > 10:
    print(new_lines[0][10], new_lines[-2][10])

# Dump the blank-line-filtered pages to disk, each page followed by a rule of
# 100 "=" characters on its own line.
page_separator = "\n" + ("=" * 100) + "\n"

with open("old_pages", "w") as f:
    for page_lines in new_lines:
        f.write("".join(page_lines))
        f.write(page_separator)

# Cosine-similarity cut-off above which a line counts as a repeat of the line
# at the same position on an earlier page.
SIMILARITY_THRESHOLD = 0.86

no_of_pages = len(new_lines)
new_pages: List[List[str]] = [[] for _ in range(no_of_pages)]
lens = list(map(len, new_lines))
max_len = max(lens, default=0)  # no pages at all -> nothing to iterate over

# Walk all pages in lock-step and compare the lines that sit at the same
# position on each page.
for counter in range(max_len):
    # Pages that have already run out of lines are left out entirely: padding
    # them with '' would embed an empty string on every step and could make a
    # real line on a later page look like a "duplicate" of that padding.
    active_pages = [i for i in range(no_of_pages) if counter < lens[i]]
    lines_ = [new_lines[i][counter] for i in active_pages]
    embeddings: List[Tensor] = [embed(line) for line in lines_]

    embedding_matrix = torch.zeros(len(active_pages), len(active_pages))
    embedding_matrix = compute_embedding_matrix(embedding_matrix, embeddings)

    # Remove similar lines and store the results page by page: a line is kept
    # unless it scores above the threshold against the line of any earlier
    # page at this position.
    for pos, page_idx in enumerate(active_pages):
        is_repeat = bool((embedding_matrix[:pos, pos] > SIMILARITY_THRESHOLD).any())
        if not is_repeat:
            new_pages[page_idx].append(lines_[pos])

Revision Page Numbers of
                                                                                                                                                   Dawalkar, Yogesh Patil, Shailesh Bairagi, Sandeep Khalkar, Pankaj Phalak, Kartik                                Customer Engineering Approval Date (if Req'd): Nil                                                        Symbol                      Meaning



In [138]:
# Write the de-duplicated pages, each followed by a rule of 100 "=" characters
# on its own line.
page_separator = "\n" + ("=" * 100) + "\n"

with open("new_pages.txt", "w") as f:
    for page_lines in new_pages:
        f.writelines(page_lines)
        f.write(page_separator)

In [144]:
import re
# Work on the second page (index 1) only.
page = new_pages[1]

# Collect every run of two or more consecutive spaces in the page text.
multi_space_run = re.compile("  +")
resp = multi_space_run.findall("".join(page))

In [145]:
# Length of the shortest multi-space run found on the page. When the page has
# no such run, fall back to 2, the shortest run the "  +" pattern can match,
# instead of letting min() raise ValueError on an empty sequence.
min_spaces = min(map(len, resp), default=2)

In [151]:
# Flatten the page to one string, then replace each non-overlapping run of
# exactly `min_spaces` spaces with two spaces.
page_text = "".join(page)
exact_run = rf" {{{min_spaces}}}"
page = re.sub(exact_run, "  ", page_text)

with open("page.txt", "w") as f:
    f.write(page)