In [5]:
from elasticsearch import Elasticsearch
import pandas as pd
import json
from tqdm import tqdm

# Connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")

# Function to create the index and insert documents
def create_index_and_insert_docs(corpus_file):
    # Read corpus
    corpus_df = pd.read_csv(corpus_file)
    
    # Create index with appropriate mappings
    index_name = "document_search"
    mapping = {
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "analyzer": "standard"
                },
                "cid": {
                    "type": "keyword"
                }
            }
        }
    }
    
    # Create index if it doesn't exist
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=mapping)
    
    # Insert documents
    for _, row in tqdm(corpus_df.iterrows(), total=len(corpus_df)):
        doc = {
            "text": row['text'],
            "cid": row['cid']
        }
        es.index(index=index_name, document=doc)
    
    # Refresh index
    es.indices.refresh(index=index_name)

# Function to search and create results
def search_and_create_results(queries_file, topk=50):
    # Read queries
    queries_df = pd.read_csv(queries_file)
    
    json_results = []
    
    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
        query = row['question']
        qid = row['qid']
        
        # Search query
        search_body = {
            "size": topk,
            "query": {
                "match": {
                    "text": query
                }
            }
        }
        
        response = es.search(index="document_search", body=search_body)
        
        # Extract results
        top_cids = []
        similarity_scores = []
        
        for hit in response['hits']['hits']:
            top_cids.append(hit['_source']['cid'])
            similarity_scores.append(hit['_score'])
        
        # Create JSON entry
        json_entry = {
            "query_id": qid,
            "candidates": {
                "doc_ids": top_cids,
                "scores": similarity_scores
            }
        }
        
        json_results.append(json_entry)
    
    return json_results

def main():
    # File paths
    corpus_file = "/home/LegalDocumentRetrieval-20241027T111633Z-001/BERT/preprocessed_corpus.csv"
    queries_file = "/home/LegalDocumentRetrieval-20241027T111633Z-001/LegalDocumentRetrieval/public_test.csv"
    output_file = "search_results.json"
    
    # Create index and insert documents
    print("Creating index and inserting documents...")
    create_index_and_insert_docs(corpus_file)
    
    # Perform search and create results
    print("Performing search...")
    results = search_and_create_results(queries_file)
    
    # Save results
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    main()

Creating index and inserting documents...


  if not es.indices.exists(index=index_name):
  es.index(index=index_name, document=doc)
100%|██████████| 261597/261597 [49:06<00:00, 88.78it/s] 
  es.indices.refresh(index=index_name)


Performing search...


  response = es.search(index="document_search", body=search_body)
100%|██████████| 10000/10000 [09:11<00:00, 18.12it/s]


Results saved to search_results.json
