# Document preprosesing and embedding creation

### steps:
* import pdf
* convert chunks
* embed checks 
* save embeding

In [35]:
import os
import requests
import ollama
pdf_path = 'OSHA -Module 1.pdf'

!pip install pymupdf

In [36]:
import fitz
from tqdm import tqdm
def text_formatter(text : str) -> str:
    """# Document preprosesing and embedding creation"""
    cleaned_text = text.replace("\n"," ").strip() 
    return cleaned_text
    
def open_and_read(pdf_path):
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for pagenumber, page in tqdm(enumerate(doc),desc = "Loading Document .."):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_text.append({
            "page_number":pagenumber+1,
            "page_char_count":len(text),
            "page_word_count":len(text.split(" ")),
            "page_sentense_count":len(text.split('.')),
            "page_token_count":len(text) / 4,
            "text":text
            })
    return pages_and_text

pages_and_text = open_and_read(pdf_path=pdf_path)
pages_and_text[:2]

Loading Document ..: 123it [00:00, 472.05it/s]


[{'page_number': 1,
  'page_char_count': 43,
  'page_word_count': 6,
  'page_sentense_count': 1,
  'page_token_count': 10.75,
  'text': 'Occupational Hazard and Control  Principles'},
 {'page_number': 2,
  'page_char_count': 496,
  'page_word_count': 86,
  'page_sentense_count': 2,
  'page_token_count': 124.0,
  'text': 'World Health Organization • Occupational health deals with all aspects of health  and safety in the workplace and has a strong focus  on primary prevention of hazards • Health has been defined as -a state of complete  physical, mental and social well-being and not  merely the absence of disease or infirmity • Occupational health is a multidisciplinary field  of healthcare concerned with enabling an individual  to undertake their occupation, in the way that causes  least harm to their health.'}]

In [37]:
import pandas as pd
dataframe = pd.DataFrame(pages_and_text)
dataframe.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentense_count,page_token_count,text
0,1,43,6,1,10.75,Occupational Hazard and Control Principles
1,2,496,86,2,124.0,World Health Organization • Occupational healt...
2,3,361,56,5,90.25,• The safety movement in the United States has...
3,4,819,117,2,204.75,Change over the past three decades Some of the...
4,5,604,110,5,151.0,Definitions • Ergonomics: is the process of de...


In [38]:
from numpy.linalg import norm
import numpy as np
import json
def get_embedding(model, chunks):
    embeddings = []
    for chunk in tqdm(chunks, desc="Generating embeddings", unit=" chunks"):
        try:
            response = ollama.embeddings(model=model, prompt=chunk)
            embedding = response['embedding']
            if len(embedding) > 0:  # Ensure the embedding is not empty
                embeddings.append(embedding)
            else:
                print(f"Empty embedding generated for chunk: {chunk[:30]}...")  # Print the first 30 characters of the chunk for context
        except Exception as e:
            print(f"Error generating embedding for chunk: {e}")
            continue
    return embeddings

def make_embedding_and_save(file_path,model,chunks):
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            embedding = get_embedding(model=model, chunks=chunks)
            json.dump(embedding, f)
            return embedding
    else:
        with open(file_path, 'r') as f:
            return json.load(f)


In [39]:
def find_most_similar(needle, haystack):
    needle_norm = norm(needle)
    similarity_scores = []
    for idx, iteam in enumerate(haystack):
        if len(iteam) == len(needle):  # Ensure dimensions match before calculation
            score = np.dot(needle, iteam) / (needle_norm * norm(iteam))
            similarity_scores.append((score, idx))
        else:
            print(f"Skipping embedding at index {idx} due to shape mismatch.")
    return sorted(similarity_scores, reverse=True)

In [40]:
import numpy as np
from numpy.linalg import norm

def euclidean_distance(v1, v2):
    """Calculate the Euclidean distance between two vectors."""
    v1 = np.array(v1)  # Convert to NumPy array if not already
    v2 = np.array(v2)  # Convert to NumPy array if not already
    return np.sqrt(np.sum((v1 - v2) ** 2))

def find_most_similar_euclidean(needle, haystack):
    """
    Find the most similar items in haystack to the needle using Euclidean distance.
    
    Parameters:
    - needle: The query vector (embedding) to compare.
    - haystack: List of vectors (embeddings) to compare against.
    
    Returns:
    - A sorted list of tuples containing the distance and the index of the most similar items.
    """
    similarity_scores = []
    for idx, iteam in enumerate(haystack):
        if len(iteam) == len(needle):
            score = euclidean_distance(needle, iteam)
            similarity_scores.append((score, idx))
        else:
            print(f"Skipping embedding at index {idx} due to shape mismatch.")
    return sorted(similarity_scores)

In [41]:
embedding = make_embedding_and_save('embedding.json',"nomic-embed-text", dataframe.text)

In [42]:
prompt="""Government of India 
Ministry of Labour and Employment"""
prompt_embeddings = ollama.embeddings(model="nomic-embed-text",prompt=prompt)['embedding']
most_similar_chunks = find_most_similar(prompt_embeddings,embedding)[:3]
for iteam in most_similar_chunks:
    print(iteam[0], dataframe.text[iteam[1]])

0.7923275532661243 • The state legislated safety requirements only in specific industries, had inadequate safety and health standards, and had inadequate budgets for enforcement. . . . The injury and death toll due to industrial mishaps was still . . . too high. • In the late 1960s, more than 14,000 employees were killed annually in connection with their jobs. . . . Work injury rates were taking an upward swing. These were the primary reasons behind passage of the Occupational  Safety and Health Act (OSH Act) of 1970 and the Federal Mine Safety  Act of 1977
0.6264882726838171 GOALS The Government firmly believes that building and maintaining national preventive safety and health culture is the need of the hour. With a view to develop such a culture and to improve the safety, health and environment at work place, it is essential to meet the following requirements:- • providing a statutory framework on Occupational Safety and Health in respect of all sectors of industrial activities incl

In [43]:
prompt="""Government of India 
Ministry of Labour and Employment"""
prompt_embeddings = ollama.embeddings(model="nomic-embed-text",prompt=prompt)['embedding']
most_similar_euclidean = find_most_similar_euclidean(prompt_embeddings, embedding)[:10]
for score, idx in most_similar_euclidean:
    content = dataframe.text.iloc[idx]  # Retrieve the content from the DataFrame
    print(f"Index: {idx}, Distance: {score}, Content: {content}")

Index: 25, Distance: 13.201990703054639, Content: • The state legislated safety requirements only in specific industries, had inadequate safety and health standards, and had inadequate budgets for enforcement. . . . The injury and death toll due to industrial mishaps was still . . . too high. • In the late 1960s, more than 14,000 employees were killed annually in connection with their jobs. . . . Work injury rates were taking an upward swing. These were the primary reasons behind passage of the Occupational  Safety and Health Act (OSH Act) of 1970 and the Federal Mine Safety  Act of 1977
Index: 29, Distance: 17.43305908358347, Content: GOALS The Government firmly believes that building and maintaining national preventive safety and health culture is the need of the hour. With a view to develop such a culture and to improve the safety, health and environment at work place, it is essential to meet the following requirements:- • providing a statutory framework on Occupational Safety and H