### 1. importing dataset

In [3]:
from datasets import load_dataset
# Candidate datasets for this notebook:
#   dataset-1: open-phi/programming_books_llama
#   dataset-2: squad (question answering)  <- the one loaded below
data = load_dataset('squad')

In [4]:
# Peek at a single training record to see which fields are available.
data['train'][5]

{'id': '5733bf84d058e614000b61be',
 'title': 'University_of_Notre_Dame',
 'context': "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, 

### 2. setting up the Elasticsearch index

In [7]:
from elasticsearch import Elasticsearch
import os

# Connection details are read from the environment so real credentials never
# have to be written into the notebook. The fallbacks are the original values,
# so the default behaviour is unchanged.
es = Elasticsearch(
  [os.environ.get("ES_URL", "http://localhost:9200")],
  basic_auth=(
    os.environ.get("ES_USER", "elastic"),
    os.environ.get("ES_PASSWORD", "elastic"),
  ),
)

# Name of the index that the rest of the notebook creates, fills and queries.
index_name = "qa_dataset"

# Index definition: custom analyzers live under "settings", and "mappings"
# assigns an analyzer to each text field.
index_config = {
  "settings": {"analysis": {
    "analyzer": {
      # Used for `title` at index time: strip HTML, tokenize with the
      # "autocomplete" tokenizer below, then lowercase / drop stop words / stem.
      "autocomplete": {
        "tokenizer": "autocomplete",
        "filter": ["lowercase", "stop", "porter_stem"],
        "char_filter": ["html_strip"],
      },
      # Standard analyzer configured with the "_english_" stop-word list.
      "standard_analyzer": {"type": "standard", "stopwords": "_english_"},
    },
    "tokenizer": {
      # Edge n-grams of length 3 to 10; only letters count as token characters.
      "autocomplete": {
        "type": "edge_ngram",
        "min_gram": 3,
        "max_gram": 10,
        "token_chars": ["letter"],
      },
    },
  }},
  "mappings": {"properties": {
    # `title` is n-grammed when indexed but searched with the plain
    # "standard" analyzer.
    "title": {"type": "text", "analyzer": "autocomplete", "search_analyzer": "standard"},
    "question": {"type": "text", "analyzer": "standard_analyzer"},
    "context": {"type": "text", "analyzer": "standard_analyzer"},
  }},
}

# Start from a clean slate: if an index with this name already exists it is
# deleted first (which discards its documents), then the index is created
# again from index_config. The create response is the cell's displayed output.
if es.indices.exists(index=index_name):
  es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=index_config)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'qa_dataset'})

### 3. indexing the dataset

In [8]:
from elasticsearch import helpers
# A bulk request is sent whenever the running document position is a non-zero
# multiple of bulk_size.
bulk_size = 1000
# Highest dataset position to index. The cap is only applied when this is > 0,
# so -1 means "index everything".
index_limit = -1


def index_documents(dataset, index_name=index_name):
  """Bulk-index every record of `dataset` into Elasticsearch.

  Each record's "title", "question" and "context" values are copied into the
  document source, and the record's position in `dataset` is used as the
  document `_id`.

  Args:
    dataset: iterable of mappings that provide "title", "question" and
      "context" keys.
    index_name: name of the target index. Defaults to the value the
      module-level `index_name` had when this function was defined.

  Reads the module-level `es` client, `bulk_size` and `index_limit`. When
  `index_limit` is positive, iteration stops once the position exceeds it
  (positions 0..index_limit inclusive are indexed); otherwise the whole
  dataset is processed.

  Errors are reported with print() and do not abort the run. A batch whose
  bulk request fails is kept and sent again with the next flush.
  """
  actions = []
  for idx, doc in enumerate(dataset):
    if idx > index_limit and index_limit > 0:
      break
    try:
      actions.append({
        "_index": index_name,
        "_id": idx,
        "_source": {
          "title": doc["title"],
          "question": doc["question"],
          "context": doc["context"]
        }
      })
      if idx % bulk_size == 0 and idx > 0:
        print(f"Processing bulk: {idx} records")
        helpers.bulk(es, actions)
        actions = []
    except Exception as e:
      print(f"Error processing at key: {idx}")
      print(e)

  # Send whatever is left after the last full batch. Without this, the
  # trailing documents (dataset size not a multiple of bulk_size) were
  # silently never indexed.
  if actions:
    try:
      print(f"Processing final bulk: {len(actions)} records")
      helpers.bulk(es, actions)
    except Exception as e:
      print("Error processing final bulk")
      print(e)

# Index the training split of the dataset loaded in section 1.
index_documents(data['train'])

Processing bulk: 1000 records
Processing bulk: 2000 records
Processing bulk: 3000 records
Processing bulk: 4000 records
Processing bulk: 5000 records
Processing bulk: 6000 records
Processing bulk: 7000 records
Processing bulk: 8000 records
Processing bulk: 9000 records
Processing bulk: 10000 records
Processing bulk: 11000 records
Processing bulk: 12000 records
Processing bulk: 13000 records
Processing bulk: 14000 records
Processing bulk: 15000 records
Processing bulk: 16000 records
Processing bulk: 17000 records
Processing bulk: 18000 records
Processing bulk: 19000 records
Processing bulk: 20000 records
Processing bulk: 21000 records
Processing bulk: 22000 records
Processing bulk: 23000 records
Processing bulk: 24000 records
Processing bulk: 25000 records
Processing bulk: 26000 records
Processing bulk: 27000 records
Processing bulk: 28000 records
Processing bulk: 29000 records
Processing bulk: 30000 records
Processing bulk: 31000 records
Processing bulk: 32000 records
Processing bulk: 

In [9]:
# Free-text query to run against the index.
question = "optimize code performance in python"

# multi_match over the three text fields. The ^N suffix boosts a field, so
# title matches weigh most, then question, then context.
# "fuzziness": "AUTO" enables fuzzy matching of the query terms.
body = {"query": {"multi_match": {
  "query": question,
  "fields": ["title^3", "question^2", "context"],
  "fuzziness": "AUTO",
}}}

# Run the query using the `es` client and `index_name` defined in section 2.
response = es.search(index=index_name, body=body)
top_hits = response["hits"]["hits"]

# For each hit print its title, its context passage and its relevance score,
# followed by a blank separator line.
for hit in top_hits:
  print(
    f"topic: {hit['_source']['title']}",
    f"markdown: {hit['_source']['context']}",
    f"Score: {hit['_score']}\n",
    sep="\n",
  )

topic: Apollo
markdown: Four days after his birth, Apollo killed the chthonic dragon Python, which lived in Delphi beside the Castalian Spring. This was the spring which emitted vapors that caused the oracle at Delphi to give her prophecies. Hera sent the serpent to hunt Leto to her death across the world. To protect his mother, Apollo begged Hephaestus for a bow and arrows. After receiving them, Apollo cornered Python in the sacred cave at Delphi. Apollo killed Python but had to be punished for it, since Python was a child of Gaia.
Score: 23.287342

topic: Apollo
markdown: Four days after his birth, Apollo killed the chthonic dragon Python, which lived in Delphi beside the Castalian Spring. This was the spring which emitted vapors that caused the oracle at Delphi to give her prophecies. Hera sent the serpent to hunt Leto to her death across the world. To protect his mother, Apollo begged Hephaestus for a bow and arrows. After receiving them, Apollo cornered Python in the sacred cave a