In [None]:
%pip install pymilvus==2.4.13

In [4]:
from pymilvus import connections

# Open the "default" Milvus connection against the local server and report the outcome.
# A failure is printed rather than raised, so later cells still run (and will fail there).
try:
    connections.connect(alias="default", host="127.0.0.1", port="19530")
except Exception as err:
    print(f"Connection failed: {err}")
else:
    print("Connection successful!")


Connection successful!


In [31]:
from pymilvus import utility
# Fetch the names of the collections via pymilvus `utility` and print them.
collections = utility.list_collections()
print("Available collections:", collections)

# WARNING: the commented-out code below is destructive -- it would drop EVERY collection
# returned above and then re-list them. Left disabled on purpose; do not uncomment casually.
# for collection_name in collections:
#   utility.drop_collection(collection_name)
# print("All collections have been dropped.")

# collections = utility.list_collections()
# print("Available collections:", collections)

Available collections: ['Yinon_Azulai_25', 'Karine_Elharrar_21', 'Michal_Veldigar_25', 'Israel_Katz_18', 'Miki_Levy_21', 'Aida_Sliman_25', 'Yuli_Edelstein_16', 'Sheshon_Guetta_24', 'Danny_Danon_19', 'Moshe_Abutbul_24', 'Ram_Ben_Barak_21', 'Avigdor_Lieberman_20', 'Pnina_Tamno_25', 'Walid_Al_Huashla_25', 'Karine_Elharrar_22', 'Michael_Biton_23', 'Yosef_Tayeb_25', 'Hili_Trooper_21', 'Miri_Regev_25', 'Eli_Dallal_21', 'Nissim_Wattoori_25', 'Ron_Katz_23', 'Yariv_Levin_25', 'Avi_Dichter_23', 'Aryeh_Deri_23', 'Yevgeny_Sova_25', 'Gadi_Eisenkot_25', 'Zeev_Elkin_21', 'Yair_Lapid_24', 'Zeev_Elkin_22', 'Merav_Cohen_23', 'Shlomo_Karai_25', 'Moshe_Gafni_17', 'Sharon_Haskel_20', 'Merav_Michaeli_24', 'Oded_Forer_23', 'Aryeh_Deri_22', 'Avigdor_Lieberman_17', 'Moshe_Saada_25', 'Yuli_Edelstein_22', 'Michael_Biton_24', 'Yitzhak_Vserloff_25', 'Smicha_Rothman_25', 'Elazar_Stern_22', 'Sheshon_Guetta_25', 'Moshe_Gafni_19', 'Tsega_Maleko_25', 'Zeev_Elkin_18', 'Matan_Kahane_25', 'Miki_Levy_18', 'Orna_Barbivai_22

In [32]:
# Number of collection names held in `collections` (displayed as the cell's output).
len(collections)

446

In [None]:
import os

# Elasticsearch / Kibana connection settings.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be edited into (and committed with) the notebook.
# The literals are only fallbacks that keep existing runs working unchanged.
# NOTE(review): the fallback username/password are still stored in the notebook;
# consider removing them once the environment variables are set everywhere.
elastic_ip = os.environ.get('KNESSET_ES_HOST', '34.0.64.248:9200')
kibana_ip = os.environ.get('KNESSET_KIBANA_HOST', '34.0.64.248:5601')
es_username = os.environ.get('KNESSET_ES_USERNAME', 'user')
es_password = os.environ.get('KNESSET_ES_PASSWORD', 'knesset')

from elasticsearch import Elasticsearch

# es = Elasticsearch(f'http://{elastic_ip}',http_auth=(es_username, es_password), timeout=1000000)
# resp = es.search(index="all_features_sentences", body={"query":{"match_all": {}}})
# print("Got %d Hits:" % resp['hits']['total']['value'])
# for hit in resp['hits']['hits']:
#    print("id: %(sentence_id)s: speaker_name: %(speaker_name)s: sentence_text: %(sentence_text)s" % hit["_source"])


In [None]:
from elasticsearch import Elasticsearch

es = Elasticsearch(f'http://{elastic_ip}', basic_auth=(es_username, es_password), request_timeout=500)
data_q = []
max_hits = 4000  # stop paging once at least this many sentences have been collected

# Query definition: sentences whose speaker_name matches the given speaker, restricted by a
# script filter to documents whose `sentence_text.keyword` exists and has length() > 10.
# NOTE(review): the speaker matched here is not the one named by the "Miri_Regev"
# collection created further down -- confirm which speaker is intended.
query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"speaker_name": "איתמר בן גביר"}}
            ],
            "filter": {
                "script": {
                    "script": {
                        "source": "doc['sentence_text.keyword'].size() > 0 && doc['sentence_text.keyword'].value.length() > 10"
                    }
                }
            }
        }
    }
}

# Open a scroll context (kept alive for 2 minutes per request) and fetch the first page
resp = es.search(index="all_features_sentences", body=query, scroll="2m", size=1000)
scroll_id = resp['_scroll_id']
hits = resp['hits']['hits']

total_hits = 0
try:
    # `hits` must be checked as well as the counter: an exhausted scroll returns empty
    # pages, which never advance `total_hits`, so looping on the counter alone would
    # spin forever whenever fewer than `max_hits` sentences match.
    while hits and total_hits < max_hits:
        data_q.extend("%(sentence_text)s" % hit["_source"] for hit in hits)
        total_hits += len(hits)

        # Do not request another page once the limit has been reached
        if total_hits >= max_hits:
            break

        # Fetch the next batch
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp['_scroll_id']
        hits = resp['hits']['hits']
finally:
    # Clear the scroll to free server-side resources, even if paging failed part-way
    es.clear_scroll(scroll_id=scroll_id)

print(f"Total results retrieved: {total_hits}")


In [None]:
import re

# for i in range(len(data_q)):
#     data_q[i] = re.sub(r'[^א-ת ]', '', data_q[i]).strip()


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same object is reused later to embed search queries
embedding_fn = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Work on a snapshot of the retrieved sentences and turn each one into a vector
docs = list(data_q)
vectors = embedding_fn.encode(docs)

# Show the embedding size (this MiniLM model is documented as 384-dimensional --
# confirm against the printed value)
first_vector = vectors[0]
print("Dim:", len(first_vector), first_vector.shape)


In [None]:
# One record per sentence: its position as the id, its embedding, and its original text
data = []
for idx, embedding in enumerate(vectors):
    data.append({"id": idx, "vector": embedding, "text": docs[idx]})

# Quick sanity check on the first record
first_entity = data[0]
print("Data has", len(data), "entities, each with fields:", first_entity.keys())
print("Vector dim:", len(first_entity["vector"]))


In [None]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

# Schema: integer primary key, embedding vector sized from the encoded data, and the
# sentence text stored as a VARCHAR with max_length=512
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=len(vectors[0]))
text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512)

fields = [id_field, vector_field, text_field]
schema = CollectionSchema(fields, description="Demo collection for quotes of Miri Regev")

# Bind `collection` to the named collection, passing the schema defined above
collection_name = "Miri_Regev"
collection = Collection(name=collection_name, schema=schema)


In [None]:
# The insert call takes column-wise data: one list per field, in the order
# the fields were declared in the schema (id, vector, text)
column_order = ("id", "vector", "text")
field_data = [[entity[column] for entity in data] for column in column_order]

# Send all columns in a single insert request
res = collection.insert(field_data)
print(f"Inserted {len(field_data[0])} entities into {collection_name}.")


In [None]:
# Build an IVF_FLAT index (L2 metric, nlist=128) on the "vector" field
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
collection.create_index(field_name="vector", index_params=index_params)
print("Index created.")


In [None]:
from pymilvus import list_collections

# Re-list the collection names (this rebinds `collections`) and print them.
collections = list_collections()
print("Collections in Milvus:", collections)


In [None]:
# Re-open the collection that was created, filled and indexed in the cells above.
# Using `collection_name` (instead of a second hard-coded name) keeps this cell in
# sync with the creation cell, so a fresh top-to-bottom run searches the data it
# has just inserted.
collection = Collection(name=collection_name)


In [None]:
# Load the collection before searching it
collection.load()

# Embed the question with the same model that produced the stored vectors
query_text = "מה דעתה תחבורה ציבורית?"
query_vector = embedding_fn.encode([query_text])[0]

# L2 search over the "vector" field, returning the 15 closest entries with their text
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param=search_params,
    limit=15,
    output_fields=["text"],
)

# `results` holds one list of matches per query vector; print the text of every match
for matches in results:
    for match in matches:
        print(f"Text: {match.entity.get('text')}")




In [None]:
def retrive_quotes(KNS_name, knesset_number):
    """Fetch all matching sentences for a speaker in one Knesset and clean them.

    Runs a scrolled search on the "all_features_sentences" index for documents
    whose `speaker_name` matches `KNS_name` and whose `knesset_number` matches
    `knesset_number`, keeping only documents whose `sentence_text.keyword`
    exists and has length() > 30.

    Args:
        KNS_name: value matched against the `speaker_name` field.
        knesset_number: value matched against the `knesset_number` field.

    Returns:
        A list of strings, one per hit, with every character other than
        the letters in the range א-ת and the space removed, then stripped.
    """
    es = Elasticsearch(f'http://{elastic_ip}', basic_auth=(es_username, es_password), request_timeout=500)

    query = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"speaker_name": KNS_name}},
                    {"match": {"knesset_number": knesset_number}}
                ],
                "filter": {
                    "script": {
                        "script": {
                            "source": "doc['sentence_text.keyword'].size() > 0 && doc['sentence_text.keyword'].value.length() > 30"
                        }
                    }
                }
            }
        }
    }

    # Open a scroll context (2-minute keep-alive) and fetch the first page
    resp = es.search(index="all_features_sentences", body=query, scroll="2m", size=4000)
    scroll_id = resp['_scroll_id']
    hits = resp['hits']['hits']

    data_q = []
    try:
        # Keep paging until the scroll returns an empty page
        while hits:
            data_q.extend("%(sentence_text)s" % hit["_source"] for hit in hits)
            resp = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = resp['_scroll_id']
            hits = resp['hits']['hits']
    finally:
        # Release the scroll context even when paging fails part-way through
        es.clear_scroll(scroll_id=scroll_id)

    print(f"Total results retrieved: {len(data_q)}")

    # Keep only Hebrew letters and spaces, then trim surrounding whitespace
    return [re.sub(r'[^א-ת ]', '', sentence).strip() for sentence in data_q]


In [None]:
# Try the function on one speaker, passing "23" as the knesset_number match value.
# As the last expression of the cell, the whole returned list is displayed.
retrive_quotes("מירי רגב", "23")

In [None]:
from elasticsearch import Elasticsearch
import re

def retrive_quotes(KNS_name, knesset_number):
    """Return cleaned sentences for a speaker, optionally limited to one Knesset.

    Issues a single search (no scrolling) against the "all_features_sentences"
    index, requesting up to 8000 hits. Documents must match `KNS_name` on
    `speaker_name`, and also `knesset_number` when it is not None. A script
    filter keeps only documents whose `sentence_text.keyword` exists and has
    length() > 30.

    Note: this definition reuses the name of the scroll-based `retrive_quotes`
    defined earlier in the notebook and therefore replaces it once this cell runs.

    Args:
        KNS_name: value matched against the `speaker_name` field.
        knesset_number: value matched against the `knesset_number` field,
            or None to skip that condition.

    Returns:
        A list of strings, one per hit, with every character other than
        the letters in the range א-ת and the space removed, then stripped.
    """
    # The speaker clause is always present; the Knesset clause only when requested
    must_clauses = [{"match": {"speaker_name": KNS_name}}]
    if knesset_number is not None:
        must_clauses.append({"match": {"knesset_number": knesset_number}})

    length_filter = {
        "script": {
            "script": {
                "source": "doc['sentence_text.keyword'].size() > 0 && doc['sentence_text.keyword'].value.length() > 30"
            }
        }
    }
    query = {"query": {"bool": {"must": must_clauses, "filter": length_filter}}}

    es = Elasticsearch(f'http://{elastic_ip}', basic_auth=(es_username, es_password), request_timeout=500)

    # One request only: anything beyond the first 8000 hits is not retrieved
    resp = es.search(index="all_features_sentences", body=query, size=8000)
    raw_sentences = [
        hit["_source"].get("sentence_text", "") for hit in resp['hits']['hits']
    ]

    print(f"Total results retrieved: {len(raw_sentences)}")

    # Keep only Hebrew letters and spaces, then trim surrounding whitespace
    return [re.sub(r'[^א-ת ]', '', sentence).strip() for sentence in raw_sentences]

In [None]:
# Fetch the cleaned sentences for this speaker, passing 1 as the knesset_number match value.
# NOTE(review): the earlier call passed the Knesset number as a string ("23") -- confirm
# which type and value are intended here.
x = retrive_quotes("משה אבוטבול", 1)

In [None]:
# Display the retrieved list (the entire list is rendered as the cell output).
x

In [None]:
import csv

# Write the retrieved quotes to a one-column CSV file.
# The quotes are read from `x` (the list returned by retrive_quotes above); the
# previously referenced name `d` is not defined anywhere in this notebook and
# would raise NameError on a fresh kernel.
with open('quotes.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Quote'])  # Add a header
    writer.writerows([quote] for quote in x)  # one row per quote

print("List has been written to quotes.csv")