In [8]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer


In [9]:
# Load your data into a DataFrame

file_name = "labeled_transcript_01.csv"
data = pd.read_csv(f"./data/{file_name}")

# model_name = "paraphrase-multilingual-mpnet-base-v2"
model_name = r"C:\Users\ARM\.cache\torch\sentence_transformers\sentence-transformers_paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Generate embeddings for your text data
embeddings = model.encode(data["text"].tolist(), convert_to_numpy=True)

In [10]:
# Elasticsearch credentials
username = "elastic"
password = "changeme"

# Establish connection to Elasticsearch with credentials and timeout
es = Elasticsearch(
    ["http://localhost:9200"],
    basic_auth=(username, password),
    request_timeout=30  # Timeout set to 30 seconds
)

In [11]:
# Define index mapping
index_mapping = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "embedding": {"type": "dense_vector", "dims": embeddings.shape[1]},
        }
    }
}

# Create index with the defined mapping
index_name = f"{file_name[:-4]}"
print(index_name)
es.indices.create(index=index_name, body=index_mapping)


# Function to prepare documents for bulk indexing
def prepare_documents():
    for i, row in data.iterrows():
        doc = {
            "_index": index_name,
            "_id": i,
            "text": row["text"],
            "embedding": embeddings[i].tolist(),  # Convert embedding to list
        }
        yield doc


# Bulk index documents
bulk(es, prepare_documents())

# Refresh index
es.indices.refresh(index=index_name)

print("Indexing complete.")

# Refresh index
es.indices.refresh(index=index_name)

print("Indexing complete.")

labeled_transcript_01
Indexing complete.
Indexing complete.


In [15]:
# Search for documents in the index
index = index_name
resp = es.search(index=index, body={"query": {"match_all": {}}})

print("Got {} hits:".format(resp["hits"]["total"]["value"]))
for hit in resp["hits"]["hits"]:
    print(f"{hit['_source']}")

Got 158 hits:
{'text': 'จุดจุดจุด อ่ะก่อนอื่น คนเรือง เลือดสำคัญต่อก็คือ สัปดาห์หน้า เธอ นะครับก็จะน่ามีเทอม ก็คือส่วนที่ไม่เกี่ยวข้องกับภาพนี้ถ้านี้ไม่ออกภาพนี้ให้ออก นี้เป็นตรงนี้ตรงนี้ ไปออก เดี๋ยวเราก็จะตัดนะครับ น้องเองไม่ใช่ ตรงนี้ไปออกนะ ตรงนี้แล้วก็ clip นี้ไปออก ภายนอก', 'embedding': [0.03208367899060249, 0.0882582888007164, -0.016885163262486458, -0.006726228166371584, 0.008562161587178707, 0.040753304958343506, -0.0073640793561935425, 0.06486300379037857, -0.03426193818449974, 0.06902079284191132, 0.10955604165792465, 0.11848893016576767, 0.013003014028072357, -0.00414063036441803, 0.042820606380701065, -0.17998255789279938, -0.05705777928233147, 0.13210858404636383, 0.0791945606470108, 0.0638098195195198, 0.016996972262859344, -0.06342583894729614, 0.03900826722383499, 0.01834099180996418, -0.08574061095714569, -0.058897465467453, 0.05111747980117798, 0.1065465584397316, 0.09439042210578918, 0.06697770953178406, -0.005735447630286217, 0.024857038632035255, -0.03339862078428