In [1]:
# similar to https://codeawake.com/blog/postgresql-vector-database

import sys
import os 
sys.path.append('../')

from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Text
from sqlalchemy.dialects.postgresql import JSONB
from pgvector.sqlalchemy import Vector
import asyncpg

import src.setting as s


In [6]:
import os
os.getcwd()



'/mnt/c/Users/Anna/Documents/TUB_SWN/_PROJECTS/CI-impacts-information-retrieval/notebooks'

In [None]:
## create vector DB with postgresql
class Base(DeclarativeBase):
    pass

class Vector(Base):
    __tablename__ = 'vectors'

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    text: Mapped[str] = mapped_column(Text)
    vector = mapped_column(Vector(1024))  # set embedding dimensions, match with chosen embedding model
    metadata_: Mapped[dict | None] = mapped_column('metadata', JSONB)

    def __repr__(self):
        return f'Vector(id={self.id}, text={self.text[:50]}..., metadata={self.metadata_})'

In [7]:
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

DB_URL = 'postgresql+asyncpg://admin:postgres@localhost:5432/vectordb'

engine = create_async_engine(DB_URL)

async def db_create():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    print(engine.url, "connected and tables created.")
    

engine = create_async_engine(DB_URL)
Session = async_sessionmaker(engine, expire_on_commit=False)


In [8]:
Session

async_sessionmaker(class_='AsyncSession', bind=<sqlalchemy.ext.asyncio.engine.AsyncEngine object at 0x7f6ede6e6790>, autoflush=True, expire_on_commit=False)

In [None]:
# ## extract text from pdf via pypdf
# import pypdf
# import json

# def extract_text_from_pdf(file_path: str) -> str:
#     text_list = []
#     with open(file_path, "rb") as file:
#         reader = pypdf.PdfReader(file)
#         for page in reader.pages:
#             text_list.append(page.extract_text())
#             #text_list.append(page.extract_text() + "\n")
#     return "\n\n".join(text_list)


# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"), 
#         f,
#     )


In [5]:
## extracting text from pdfs using pdfminer

from pdfminer.high_level import extract_text

docs = []
DOCS_DIR = "../" + s.PATH_DATA + "text_sources/"

for filename in os.listdir(DOCS_DIR):
    if filename.endswith('.pdf'):
        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        print(text)
        docs.append(text)


Nat. Hazards Earth Syst. Sci., 22, 3831–3838, 2022
https://doi.org/10.5194/nhess-22-3831-2022
© Author(s) 2022. This work is distributed under
the Creative Commons Attribution 4.0 License.

Brief communication: Critical infrastructure impacts of the 2021
mid-July western European ﬂood event

Elco E. Koks1,2, Kees C. H. van Ginkel3,1, Margreet J. E. van Marle3, and Anne Lemnitzer4
1Institute for Environmental Studies, Vrije Universiteit Amsterdam, the Netherlands
2Oxford Programme for Sustainable Infrastructure Systems, Environmental Change Institute,
University of Oxford, Oxford, United Kingdom
3Deltares, Delft, the Netherlands
4University of California, Irvine, Irvine, California, United States of America

Correspondence: Kees C. H. van Ginkel (kees.vanginkel@deltares.nl)

Received: 17 December 2021 – Discussion started: 23 December 2021
Revised: 10 August 2022 – Accepted: 18 October 2022 – Published: 29 November 2022

Abstract. Germany, Belgium and the Netherlands were hit
by extreme

['Nat. Hazards Earth Syst. Sci., 22, 3831–3838, 2022\nhttps://doi.org/10.5194/nhess-22-3831-2022\n© Author(s) 2022. This work is distributed under\nthe Creative Commons Attribution 4.0 License.\n\nBrief communication: Critical infrastructure impacts of the 2021\nmid-July western European ﬂood event\n\nElco E. Koks1,2, Kees C. H. van Ginkel3,1, Margreet J. E. van Marle3, and Anne Lemnitzer4\n1Institute for Environmental Studies, Vrije Universiteit Amsterdam, the Netherlands\n2Oxford Programme for Sustainable Infrastructure Systems, Environmental Change Institute,\nUniversity of Oxford, Oxford, United Kingdom\n3Deltares, Delft, the Netherlands\n4University of California, Irvine, Irvine, California, United States of America\n\nCorrespondence: Kees C. H. van Ginkel (kees.vanginkel@deltares.nl)\n\nReceived: 17 December 2021 – Discussion started: 23 December 2021\nRevised: 10 August 2022 – Accepted: 18 October 2022 – Published: 29 November 2022\n\nAbstract. Germany, Belgium and the Netherlan

In [None]:
# define recursive chunking
# for sentence splitting use NLTK unsupervised sentence tokenizer (https://www.nltk.org/api/nltk.tokenize.punkt.html) 
from nltk.tokenize import PunktTokenizer

def split_sentences(self):
    sentence_detector = PunktTokenizer()
    return '\n---\n'.join(sentence_detector.tokenize(text.strip()))


class TextSplitter:
    def __init__(self, chunk_size):
        self.chunk_size = chunk_size
        self.splitters = [
            partial(split_by_separator, sep='\n\n'),
            partial(split_by_separator, sep='\n'),
            split_sentences,
            partial(split_by_separator, sep=' ')
        ]
    
    def split(self, text):
        splits = self._split_recursive(text)
        chunks = self._merge_splits(splits)
        return chunks 

In [None]:
## split text into chunk and embedding of text into vector database

async def add_document_to_vector_db(text_source_name: str):
    text = extract_text_from_pdf(text_source_name)
    doc_name = os.path.splitext(os.path.basename(text_source_name))[0]

    chunks = []
    text_splitter = TextSplitter(chunk_size=512)
    text_chunks = text_splitter.split(text)
    for idx, text_chunk in enumerate(text_chunks):
        chunks.append({
            'text': text_chunk,
            'metadata_': {'doc': doc_name, 'index': idx}
        })

    vectors = await create_embeddings([chunk['text'] for chunk in chunks])

    for chunk, vector in zip(chunks, vectors):
        chunk['vector'] = vector
    
    async with Session() as db:
        for chunk in chunks:
            db.add(Vector(**chunk))
        await db.commit()