In [1]:
import os
from getpass import getpass
from urllib.parse import quote

import pandas as pd
import psycopg2

# Connection settings. Values are read from the standard libpq environment
# variables so that no secret is stored in the notebook; the non-secret
# settings fall back to the local development defaults.
db_host = os.environ.get("PGHOST", "localhost")
db_name = os.environ.get("PGDATABASE", "PersianPoetry")
db_user = os.environ.get("PGUSER", "simon")
# The password has no default: set PGPASSWORD, or type it at the prompt.
db_password = os.environ.get("PGPASSWORD") or getpass(f"PostgreSQL password for {db_user}: ")

# Load the three tables into DataFrames. The connection is closed in `finally`
# so it is released even when connecting succeeds but a query fails.
conn = None
try:
    conn = psycopg2.connect(host=db_host, database=db_name, user=db_user, password=db_password)

    masnavi = pd.read_sql("SELECT * FROM masnavi;", conn)
    ghazal = pd.read_sql("SELECT * FROM ghazal;", conn)
    beyt_vector_df = pd.read_sql("SELECT * FROM beyt_vector;", conn)

except psycopg2.Error as e:
    print(f"Error connecting to PostgreSQL: {e}")
    # Re-raise: the cells below need the DataFrames, so continuing would only
    # surface later as a confusing NameError.
    raise

finally:
    if conn is not None:
        conn.close()


In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model handed to the vector store below. The encode options pass
# normalize_embeddings=True through to the underlying encoder, and
# multi_process=True is forwarded to the LangChain wrapper unchanged.
encode_options = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    encode_kwargs=encode_options,
    multi_process=True,
)

In [3]:
from langchain_postgres import PGEngine

# Build the SQLAlchemy URL from the connection settings defined in the first
# cell rather than repeating the credentials in a literal. User name and
# password are percent-encoded so that characters such as spaces, "@", ":" or
# "/" cannot break URL parsing.
encoded_user = quote(db_user, safe="")
encoded_password = quote(db_password, safe="")
engine = PGEngine.from_connection_string(
    url=f"postgresql+psycopg://{encoded_user}:{encoded_password}@{db_host}/{db_name}"
)

In [4]:
from langchain_postgres import PGVectorStore

# Initialize PGVectorStore
beyt_store = await PGVectorStore.create(
    engine=engine,
    table_name="beyt_vector",
    embedding_service=embeddings,
    metadata_columns=["type", "book_or_ghazal", "line", "translation"],
)

In [30]:
import numpy as np
import plotly.express as px
import json
import umap
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# UMAP settings, named so they can be tuned in one place.
UMAP_N_NEIGHBORS = 16
# UMAP is stochastic; a fixed seed makes the projection (and therefore the
# figure) reproducible across kernel restarts.
UMAP_RANDOM_STATE = 42

# Each stored embedding is parsed with json.loads, so the column is expected to
# hold JSON-array text. The parsed vectors are stacked into one float matrix
# with one row per record.
embeddings_2d = np.stack(
    [json.loads(embedding) for embedding in beyt_vector_df["embedding"]]
).astype(float)

# Project the embedding matrix with UMAP for plotting; rows stay in the same
# order as beyt_vector_df.
documents_projected = umap.UMAP(
    n_neighbors=UMAP_N_NEIGHBORS,
    random_state=UMAP_RANDOM_STATE,
).fit_transform(embeddings_2d)

In [38]:
# Plot frame: one row per record, pairing each projected point with its source
# type and text. Columns are built in one vectorised step instead of looping
# with iterrows(). `.to_numpy()` drops the source index, so rows are matched to
# `documents_projected` purely by position; a length mismatch raises ValueError.
df = pd.DataFrame(
    {
        "x": documents_projected[:, 0],
        "y": documents_projected[:, 1],
        "source": beyt_vector_df["type"].to_numpy(),
        "extract": beyt_vector_df["content"].to_numpy(),
        # Constant marker symbol and size, broadcast to every row.
        "symbol": "circle",
        "size_col": 1,
    }
)

# Interactive scatter plot of the projected points: colour encodes the source,
# and the text extract is shown on hover.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="source",
    symbol="symbol",
    size="size_col",
    hover_data="extract",
    # Only affects points whose source is exactly "User query".
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)

# Semi-transparent markers without an outline.
marker_style = {"opacity": 0.6, "line": {"width": 0, "color": "DarkSlateGrey"}}
fig.update_traces(marker=marker_style, selector={"mode": "markers"})

fig.update_layout(
    title="<b>2D Projection of Chunk Embeddings via UMAP</b>",
    legend_title_text="<b>Chunk source</b>",
)
fig.show()