In [226]:
from typing import List, Optional

def read_all_pages_into_lines(
    file_prefix: str,
    file_suffix: str,
    doc_dir: Optional[str] = "data",
    num_pages: int = 5,
) -> List[List[str]]:
    """Read a numbered series of page files and return their lines, one list per page.

    Page ``i`` is read from ``{doc_dir}/{file_prefix}{i}.{file_suffix}`` for
    ``i`` running from 1 to ``num_pages`` inclusive.

    Args:
        file_prefix: Part of the file name that precedes the page number.
        file_suffix: File extension, without the leading dot.
        doc_dir: Directory that holds the page files. ``None`` means the
            current working directory.
        num_pages: How many pages to read, starting at page 1. Defaults to 5.

    Returns:
        A list with one entry per page, in page order. Each entry is the
        result of ``readlines()`` on that file, so line endings are kept.

    Raises:
        ValueError: If ``num_pages`` is negative.
        OSError: If a page file cannot be opened (e.g. it does not exist).
    """
    if num_pages < 0:
        raise ValueError(f"num_pages must be non-negative, got {num_pages}")

    # Without this guard a None directory would be formatted into the path as
    # the literal text "None".
    base_dir = "." if doc_dir is None else doc_dir

    pages: List[List[str]] = []
    for page_no in range(1, num_pages + 1):
        with open(f"{base_dir}/{file_prefix}{page_no}.{file_suffix}", "r") as f:
            pages.append(f.readlines())

    return pages

In [213]:
import torch
from torch.types import Tensor
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.ollama import OllamaEmbedding
from typing import Callable, Optional

def get_embedder(model_name: str, **kwargs) -> Callable[[str], Tensor]:
    """Build a text-embedding function for a ``"<provider>/<model>"`` identifier.

    The text before the first ``/`` selects the provider (``"ollama"`` or
    ``"hf"``); everything after it is passed to the provider class as
    ``model_name``.

    Args:
        model_name: Provider-prefixed model identifier, e.g. ``"hf/org/model"``.
        **kwargs: Extra keyword arguments forwarded to the provider class.
            Two keys are consumed here and not forwarded:
            ``ollama_config`` and ``huggingface_config``. Each is a dict of
            constructor arguments that replaces (does not merge with) the
            built-in default for that provider.

    Returns:
        A function taking one string and returning the provider's text
        embedding converted with ``torch.asarray``.

    Raises:
        KeyError: If the provider prefix is neither ``"ollama"`` nor ``"hf"``.
    """
    # Pop both provider configs up front so neither leaks into the
    # constructor call of the other provider via **kwargs.
    ollama_settings = kwargs.pop(
        "ollama_config",
        {"base_url": "http://192.168.0.124:11434"},
    )
    hf_settings = kwargs.pop(
        "huggingface_config",
        {"trust_remote_code": True, "cache_folder": "models/"},
    )

    # provider prefix -> (embedding class, constructor settings)
    backends = {
        "ollama": (OllamaEmbedding, ollama_settings),
        "hf": (HuggingFaceEmbedding, hf_settings),
    }

    # Only the first "/" separates provider from model; later slashes belong
    # to the model id itself.
    provider, _, model = model_name.partition("/")
    backend_cls, backend_settings = backends[provider]

    embedder: BaseEmbedding = backend_cls(model_name=model, **backend_settings, **kwargs)

    def embed(x1: str) -> Tensor:
        vector = embedder.get_text_embedding(x1)
        return torch.asarray(vector)

    return embed

def compute_similiarity(x1: Tensor, x2: Tensor, dim: Optional[int] = 0, eps: Optional[float] = 1e-4) -> Tensor:
    """Return the cosine similarity of ``x1`` and ``x2`` along ``dim``.

    Args:
        x1: First tensor.
        x2: Second tensor.
        dim: Dimension along which the similarity is computed (default 0).
        eps: Small value passed to the cosine-similarity routine to avoid
            division by zero (default 1e-4).

    Returns:
        The tensor produced by torch's cosine similarity for the two inputs.
    """
    return torch.nn.functional.cosine_similarity(x1, x2, dim=dim, eps=eps)

In [227]:
# Load every page<N>.txt from the default document directory;
# lines[k] holds the lines of page k + 1.
lines = read_all_pages_into_lines(file_prefix="page", file_suffix="txt")

In [214]:
# The "hf/" prefix selects the HuggingFace provider; the rest is the model id.
embed = get_embedder(model_name="hf/ibm-granite/granite-embedding-278m-multilingual")

In [None]:
# Smoke test: embed two nearly identical strings and compare them.
embeddings1, embeddings2 = (embed(text) for text in ("Hello world", "hell world"))
sim = compute_similiarity(x1=embeddings1, x2=embeddings2)

In [249]:
# Compare the line at one fixed position across all pages: embed that line
# from every page, then fill a pages x pages matrix of cosine similarities.
no_of_pages = len(lines)
counter = 24  # index of the line compared across pages

# A page with no line at `counter` contributes '' so that every page still
# gets a row and a column in the matrix.
lines_ = [
    page[counter] if -len(page) <= counter < len(page) else ''
    for page in lines
]
embeddings = [embed(line) for line in lines_]

embedding_matrix = torch.zeros(no_of_pages, no_of_pages)
for i, row_embedding in enumerate(embeddings):
    for j, col_embedding in enumerate(embeddings):
        embedding_matrix[i, j] = compute_similiarity(row_embedding, col_embedding)

# Leave `counter` at the next line index, in case a later cell reads it.
counter += 1

In [250]:
# Show the pairwise similarity matrix: entry [i, j] compares the selected
# line of page i with the selected line of page j.
embedding_matrix

tensor([[1.0000, 0.5712, 1.0000, 0.5173, 0.5173],
        [0.5712, 1.0000, 0.5712, 0.4912, 0.4912],
        [1.0000, 0.5712, 1.0000, 0.5173, 0.5173],
        [0.5173, 0.4912, 0.5173, 1.0000, 1.0000],
        [0.5173, 0.4912, 0.5173, 1.0000, 1.0000]])

In [251]:
# Show the raw line taken from each page, in page order, to read alongside
# the rows and columns of the similarity matrix.
lines_

['Number revised contents\n',
 '15       All pages                                                                        Cleaning of rollers to avoid scratches on film                 June 30, 2014\n',
 'Number revised contents\n',
 'h Dharkar, Naveen\n',
 'h Dharkar, Naveen\n']