# Elasticsearch config

### Test connection

In [1]:
import requests
from requests.auth import HTTPBasicAuth

import os

# Elasticsearch connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be saved inside the notebook; the fallbacks are
# the local development values this cell has always used.
es_url = os.environ.get("ES_URL", "http://localhost:9201")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "apppw")

# Smoke test: GET the cluster root endpoint with HTTP basic auth and print the
# outcome rather than raising, so the cell always runs to completion.
try:
    response = requests.get(f"{es_url}/", auth=HTTPBasicAuth(es_user, es_password), timeout=5)
except requests.exceptions.RequestException as e:
    # Transport-level failures only (refused connection, timeout, malformed
    # URL, ...). Unrelated programming errors are no longer hidden here.
    print(f"Error connecting to Elasticsearch: {e}")
else:
    if response.status_code == 200:
        print("Successfully connected to Elasticsearch!")
    else:
        # Reachable, but the request was rejected (e.g. wrong credentials).
        print(f"Failed to connect to Elasticsearch: {response.status_code} {response.text}")

Successfully connected to Elasticsearch!




### Create empty Index

In [None]:
from elasticsearch import Elasticsearch

import os

# Connection
# Environment variables take precedence so credentials do not need to be
# stored in the notebook; the fallbacks are the local development defaults.
es_url = os.environ.get("ES_URL", "http://localhost:9201")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "apppw")
index_name = "my_index"

# Client authenticated with HTTP basic auth; used by the index operations below.
es = Elasticsearch(
    es_url,
    basic_auth=(es_user, es_password),
)

# 1. Delete index if it already exists, so re-running this cell always starts
#    from an empty index (all previously indexed documents are discarded).
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Deleted old index: {index_name}")

# 2. Define settings + mappings
settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                # Custom analyzer built from the n-gram tokenizer below.
                # NOTE(review): no token filters are configured, so the
                # n-grams keep their original case - add a "lowercase" filter
                # if case-insensitive matching on title.ngram is wanted.
                "ngram_analyzer": {
                    "tokenizer": "ngram_tokenizer"
                }
            },
            "tokenizer": {
                # Emits every 3- and 4-character gram; only letters and digits
                # are kept inside a token.
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 3,
                    "max_gram": 4,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # "title" is indexed as text plus two sub-fields:
            # title.standard (standard analyzer) and title.ngram (n-grams).
            "title": {
                "type": "text",
                "fields": {
                    "standard": {"type": "text", "analyzer": "standard"},
                    "ngram": {"type": "text", "analyzer": "ngram_analyzer"}
                }
            },
            "year": {"type": "integer"},
            "genre": {"type": "keyword"}
        }
    }
}


# 3. Create index
# The settings and mappings are passed as explicit keyword arguments, which is
# the documented call form of the 8.x client, instead of the legacy `body=`
# wrapper (which emits a DeprecationWarning on some 8.x releases).
es.indices.create(
    index=index_name,
    settings=settings["settings"],
    mappings=settings["mappings"],
)
print(f"Created index: {index_name}")


Deleted old index: imdb_unindexed
Created index: imdb_unindexed


### Fill index

In [None]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd
import numpy as np
import time

import os

# Elasticsearch connection
# Environment variables take precedence so credentials do not need to be
# stored in the notebook; the fallbacks are the local development defaults.
es_url = os.environ.get("ES_URL", "http://localhost:9201")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "apppw")
index_name = "my_index"

# Client authenticated with HTTP basic auth; used for the bulk load below.
es = Elasticsearch(es_url, basic_auth=(es_user, es_password))

# Load CSV
csv_file = "products-2000000.csv"  # or larger CSV
df = pd.read_csv(csv_file)

# Replace NaN with None so missing values are sent as JSON null.
# The frame is cast to object dtype first: on current pandas versions a plain
# `df.where(..., None)` leaves NaN in numeric columns (None is coerced back to
# NaN), and NaN is not valid JSON for Elasticsearch.
df = df.astype(object).where(df.notna(), None)

# Rename columns if necessary (match ES mapping)
# For example: 'Title', 'Year', 'Genre'
# Make sure the column names match the mappings
# df.rename(columns={"OldName": "NewName"}, inplace=True)

# Convert DataFrame to dicts for ES (one dict per row, keyed by column name)
records = df.to_dict(orient="records")

batch_size = 1000  # documents per bulk request; tweak based on memory / ES capacity
start_time = time.time()

print("Starting to insert rows")

# Lazily wrap each record in a bulk "index" action. No "_id" is supplied, so
# Elasticsearch assigns document ids itself.
actions = (
    {
        "_index": index_name,
        "_source": rec
    }
    for rec in records
)

# helpers.bulk already splits the action stream into requests of `chunk_size`
# documents, so no manual slicing is needed. Passing chunk_size explicitly
# makes batch_size take effect: without it the helper re-chunks every call at
# its default of 500 documents. With the default raise_on_error=True a failed
# document raises BulkIndexError, so reaching the next line means every chunk
# was accepted.
inserted, _ = helpers.bulk(es, actions, chunk_size=batch_size)

elapsed = time.time() - start_time
# Report the count acknowledged by Elasticsearch rather than len(records).
print(f"Finished inserting {inserted} rows into '{index_name}' in {elapsed:.2f} seconds")


Inserted rows 0–1000
Inserted rows 1000–2000
Inserted rows 2000–3000
Inserted rows 3000–4000
Inserted rows 4000–5000
Inserted rows 5000–6000
Inserted rows 6000–7000
Inserted rows 7000–8000
Inserted rows 8000–9000
Inserted rows 9000–10000
Inserted rows 10000–11000
Inserted rows 11000–12000
Inserted rows 12000–13000
Inserted rows 13000–14000
Inserted rows 14000–15000
Inserted rows 15000–16000
Inserted rows 16000–17000
Inserted rows 17000–18000
Inserted rows 18000–19000
Inserted rows 19000–20000
Inserted rows 20000–21000
Inserted rows 21000–22000
Inserted rows 22000–23000
Inserted rows 23000–24000
Inserted rows 24000–25000
Inserted rows 25000–26000
Inserted rows 26000–27000
Inserted rows 27000–28000
Inserted rows 28000–29000
Inserted rows 29000–30000
Inserted rows 30000–31000
Inserted rows 31000–32000
Inserted rows 32000–33000
Inserted rows 33000–34000
Inserted rows 34000–35000
Inserted rows 35000–36000
Inserted rows 36000–37000
Inserted rows 37000–38000
Inserted rows 38000–39000
Inserted