In [12]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from tqdm.auto import tqdm
import os
from sentence_transformers import SentenceTransformer


In [8]:
# Read a single labeled transcript CSV into a DataFrame.
file_name = "labeled_transcript_01.csv"
data = pd.read_csv(f"./data/labeled/{file_name}")

# Sentence-transformers model, referenced by its published name.
model_name = "paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Embed every entry of the "text" column; convert_to_numpy=True requests a
# NumPy result instead of a tensor.
embeddings = model.encode(
    data["text"].tolist(),
    convert_to_numpy=True,
)

In [9]:
# Elasticsearch credentials.
# Read from the environment so real secrets never have to be written into the
# notebook; the fallbacks are the values this notebook used before, so an
# unconfigured local setup behaves exactly as it did.
username = os.environ.get("ELASTIC_USERNAME", "elastic")
password = os.environ.get("ELASTIC_PASSWORD", "changeme")

# Client for the local Elasticsearch node. Every request made through this
# client gives up after 30 seconds.
es = Elasticsearch(
    ["http://localhost:9200"],
    basic_auth=(username, password),
    request_timeout=30,  # seconds
)

In [10]:
# Connectivity check: ask the cluster for its basic metadata (node name,
# cluster name, version). A failure here means the URL or credentials above
# need fixing before any indexing or searching is attempted.
es.info()

ObjectApiResponse({'name': 'elasticsearch', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zOK1mryoTaWg-9ifIohEsg', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [15]:
import re
def get_id_from_file(file):
    """Extract the numeric id from a labeled-transcript file name or path.

    Searches ``file`` for the pattern ``labeled_transcript_<digits>`` anywhere
    in the string, so both bare file names and full paths are accepted.

    Args:
        file: File name or path to inspect.

    Returns:
        The run of digits as a string (leading zeros are preserved, e.g.
        ``"01"``), or ``None`` if the pattern does not occur.
    """
    found = re.search(r'labeled_transcript_(\d+)', file)
    return found.group(1) if found else None

In [19]:
# Index every labeled transcript into its own Elasticsearch index
# (`transcript_<id>`), one document per row of the CSV.
for file in sorted(os.listdir("./data/labeled")):
    print(file)
    if "transcript" not in file:
        continue
    filename = f"./data/labeled/{file}"

    file_id = get_id_from_file(filename)
    if file_id is None:
        # The name mentions "transcript" but carries no numeric id; skip it
        # rather than silently writing into an index called "transcript_None".
        continue

    df = pd.read_csv(filename, index_col=0)

    # Embed the whole text column in one call; a plain list avoids depending
    # on how the encoder indexes into a pandas Series.
    corpus_embeddings = model.encode(
        df["text"].tolist(), convert_to_tensor=False, show_progress_bar=True
    )

    # Build the documents positionally (zip) instead of with df[col][i], which
    # is a label lookup and only works while the CSV index is exactly 0..n-1.
    actions = (
        {
            "_index": f"transcript_{file_id}",
            # Deterministic id (row position) so re-running this cell
            # overwrites documents instead of duplicating them.
            "_id": str(position),
            "_source": {
                "text": text,
                "start_time": start,
                "stop_time": stop,
                "embedding": vector.tolist(),  # numpy array -> JSON-friendly list
            },
        }
        for position, (text, start, stop, vector) in enumerate(
            zip(df["text"], df["start_time"], df["stop_time"], corpus_embeddings)
        )
    )

    # One bulk request per file instead of one HTTP round-trip per row.
    # bulk() raises if any document fails to index.
    bulk(es, actions)

file_query_map.csv
labeled_transcript_01.csv


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

labeled_transcript_02.csv


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

labeled_transcript_03.csv


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

labeled_transcript_04.csv


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

labeled_transcript_05.csv


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

labeled_transcript_06.csv


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

labeled_transcript_07.csv


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

labeled_transcript_08.csv


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

labeled_transcript_09.csv


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

query_map.csv


In [22]:
# Mapping table with (at least) a "file" column and a "query" column.
file_query_map_df = pd.read_csv("./data/labeled/file_query_map.csv")

# Build {transcript id -> list of queries recorded for that transcript file}.
query_dict = {}
for file in os.listdir("./data/labeled"):
    if "transcript" in file:
        filename = f"./data/labeled/{file}"
        file_id = get_id_from_file(filename)
        rows_for_file = file_query_map_df["file"] == file
        query_dict[file_id] = file_query_map_df.loc[rows_for_file, "query"].tolist()

query_dict

{'01': ['linear program',
  'Assumption of linear program',
  'standard form of linear program',
  'basic feasible solution',
  'direction of unboundedness',
  'optimization เอาไปใช้ทำอะไร',
  'ประเภทของปัญหา optimization แบบ dynamic กับ static ต่างกันยังไง',
  'ทำไม feasible region ของ linear program ถึงเป็น convex set ทั้งหมด',
  'จะเปลี่ยน constraints เป็น standard form ยังไง',
  'direction of unboundedness หายังไง'],
 '02': ['homework',
  'MLE properties',
  'covariance',
  'scipy function',
  'Confidence interval',
  'ทำไมสามารถหา argmax โดยใช้ log ได้',
  'สุ่มตัวแปรจาก distribution ที่ไม่เป็นมาตรฐานได้อย่างไร',
  'MLE เป็นมุมมองของ Frequentist หรือ Bayesian',
  'สิ่งใดทำให้ผลของ bayesian estimate เปลี่ยน',
  'MLE จะเท่ากับ MAP เมื่อใด'],
 '03': ['Reject Region',
  'power',
  'Central limit theorem',
  'Type of error',
  'Exercise',
  'เราควรตั้ง Null Hypothesis อย่างไร',
  "student's t distribution กับ standard gaussian ต่างกันอย่างไร",
  'เราควรกำหนด sample size อย่างไร',
  'P-

In [None]:
import time

# Measure kNN search latency for every labeled query, searching the index of
# the transcript the query belongs to. `search_time` collects one duration in
# seconds per query; only the search call is timed, not the embedding step.
search_time = []
for file_id, queries in query_dict.items():
    if not queries:
        continue

    # query_dict maps each transcript id to a *list* of queries, so encode the
    # list in one batch and then search once per query vector.
    query_embeddings = model.encode(queries, convert_to_tensor=False)

    for query_embedding in query_embeddings:
        knn_query = {
            "knn": {
                # Must match the field name used when the documents were
                # indexed ("embedding", singular).
                "field": "embedding",
                "query_vector": query_embedding.tolist(),  # numpy -> plain list
                "num_candidates": 20,
            }
        }
        # Each transcript lives in its own index (transcript_<id>), so the
        # index name already restricts the search to that transcript; the
        # indexed documents carry no clip_id field to filter on.
        started = time.perf_counter()
        resp = es.search(index=f"transcript_{file_id}", size=10, query=knn_query)
        search_time.append(time.perf_counter() - started)
        # for hit in resp['hits']['hits']:
        #     print(hit["_source"]["text"])