# Text einlesen, encoden und als Embeddings in Datenbank speichern

In [1]:
import fitz  # PyMuPDF
import tensorflow.compat.v1 as tf
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os




### PDF einlesen und in Textkette speichern

In [2]:
def extract_text_from_pdf_directory(directory_path):
    """Concatenate the text of every PDF located directly in a directory.

    Files are matched by a case-insensitive ``.pdf`` suffix and processed in
    sorted file-name order, so the combined text is reproducible across runs
    and platforms. Sub-directories are not searched.

    Args:
        directory_path: Path of the directory that contains the PDF files.

    Returns:
        A single string holding the text of all pages of all matching PDFs,
        joined without any separator. Empty if no PDF is found.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    page_texts = []
    # os.listdir() yields entries in arbitrary order; sort so the segment
    # order (and therefore any IDs derived from it) is deterministic.
    for file_name in sorted(os.listdir(directory_path)):
        # Lower-case the name so "REPORT.PDF" is picked up as well.
        if not file_name.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory_path, file_name)
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            page_texts.extend(page.get_text() for page in doc)
    # Join once instead of growing a string with += for every page.
    return "".join(page_texts)

# Directory containing the PDFs to index.
# NOTE(review): hard-coded absolute path on one user's machine — adjust before running elsewhere.
directory_path = "C:/Users/rapha/Documents/Datascience/5. Semester/Artificial Intelligence/Einzelarbeit/Data"

# Read every PDF in that directory into one combined string; later cells split and encode it.
extracted_text = extract_text_from_pdf_directory(directory_path)

In [3]:
# Select the sentence-embedding model used for encoding
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Split the text into line-level segments. Empty and whitespace-only lines
# are dropped: they carry no content, and embedding them would fill the index
# with meaningless vectors that can surface as search hits.
text_segments = [line for line in extracted_text.split("\n") if line.strip()]

# Fail early with a clear message instead of an opaque shape error later on
if not text_segments:
    raise ValueError("No text segments found: the extracted PDF text is empty.")

# Encode each segment into a fixed-size embedding vector
embeddings = model.encode(text_segments)

In [4]:
# Write the embeddings into a flat FAISS index and persist it to disk.
# FAISS expects a C-contiguous float32 matrix; ascontiguousarray enforces that
# and, unlike np.array, does not copy when the data already has that layout.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(embedding_matrix)
faiss.write_index(index, "faiss_index.db")

# Map each segment ID (its row position in the index) to the segment text
segment_mapping = dict(enumerate(text_segments))

# Store the mapping as JSON. The integer keys are written as strings, so a
# reader must convert them back (or look up by str(id)). ensure_ascii=False
# keeps non-ASCII characters readable in the UTF-8 file instead of \uXXXX.
with open("segment_mapping.json", "w", encoding="utf-8") as f:
    json.dump(segment_mapping, f, ensure_ascii=False)