In [1]:
from helpers.db.db_setup_methods import *
from helpers.db.db_query_methods import *

from helpers.db.db_helper_methods import *

from helpers.es.es_setup_methods import *
from helpers.es.es_helper_methods import *

# Benchmark configuration shared by every cell below.
# NOTE(review): the recorded output of this cell reports 2000000 rows/docs,
# not 200_000 -- confirm whether the helpers scale this value or the output is stale.
records_size = 200_000

db_name_no_index = "db_no_index"
db_name_with_index = "db_with_index"
es_standard_index = "standard_index"

# Populate both databases with the same arguments; the index on `name`
# is only created later, and only in `db_name_with_index`.
for _db_name in (db_name_no_index, db_name_with_index):
    init_db(records_max=records_size, db_name=_db_name)

# Kept as the last expression so its return value is shown as the cell output.
init_index(index_name=es_standard_index, documents_records=records_size)

Created empty database
Created empty tables
Starting to insert data

Starting insert into 'products' (2000000 rows)...
Finished inserting 2000000 rows into 'products' in 35.64s.
Created empty database
Created empty tables
Starting to insert data

Starting insert into 'products' (2000000 rows)...
Finished inserting 2000000 rows into 'products' in 35.26s.
Deleted existing index: standard_index
Created index: standard_index
Excluded columns: currency, ean, internal id
Inserting 2000000 docs into 'standard_index'
Inserted 2000000 docs in 52.13 seconds


(True, 'Index standard_index initialized successfully')

First, the DB benchmark: an exact-match lookup on `name` in the database without an index.

In [10]:
# Baseline: exact-match lookup on `name` in the database that has no index on it.
db_query = """
SELECT * FROM products WHERE name = 'Pro Charger Tablet Brush Go 360'
"""

execute_query(
    query=db_query,
    database=db_name_no_index,
    print_as_df=False,
    show_metrics=True,
)

[QUERY METRICS] 1 rows fetched, 664.99 ms


Now create an index on `name` to see the best speed the database can reach.

In [3]:
# Add an index on products.name, only in the "with index" database.
# NOTE(review): re-running this cell without re-initialising the database will
# presumably fail because the index already exists -- confirm.
db_query = """
CREATE INDEX idx_products_name ON products (name)
"""

# Metrics are switched off here: this statement is setup, not part of the benchmark.
execute_query(
    query=db_query,
    database=db_name_with_index,
    print_as_df=False,
    show_metrics=False,
)

In [125]:
# Same exact-match lookup as the baseline, now against the indexed database.
db_query = """
SELECT * FROM products WHERE name = 'Pro Charger Tablet Brush Go 360'
"""

execute_query(
    query=db_query,
    database=db_name_with_index,
    print_as_df=False,
    show_metrics=True,
)

[QUERY METRICS] 1 rows fetched, 0.72 ms


Now test the same product name in Elasticsearch (ES).

In [145]:
# Full-text `match` search on the product name.
# NOTE(review): `match` is a relevance search over the analyzed terms, not an
# equality lookup -- the recorded response reports >= 10000 hits, whereas the
# SQL query returns a single row. Keep that in mind when comparing timings.
user_input = "Pro Charger Tablet Brush Go 360"

query = {"query": {"match": {"name": user_input}}}

search_text(query, es_standard_index)

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 14.86787,
  'hits': [{'_index': 'standard_index',
    '_id': 'a2gtD5kBS4vSXqN9dxFu',
    '_score': 14.86787,
    '_source': {'index': 1,
     'name': 'Pro Charger Tablet Brush Go 360',
     'description': 'Agent worry research that accept together western.',
     'brand': 'Hawkins PLC',
     'category': 'Home & Kitchen',
     'price': 11,
     'stock': 959,
     'color': 'LightSlateGray',
     'size': '15x20 cm',
     'availability': 'limited_stock'}},
   {'_index': 'standard_index',
    '_id': '4IQuD5kBS4vSXqN9NtnE',
    '_score': 13.694907,
    '_source': {'index': 1886326,
     'name': 'Pro Charger Tablet Brush 360 Go Prime',
     'description': 'Who event like fast thought position.',
     'brand': 'Cline, Middleton and Abbott',
     'category': 'Skincare',
     'price': 739,
     'stock': 556,
     'color': 'B

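It actually seems that the DB is faster than ES here. I would have expected them to be about the same speed, or ES to be quicker. Note that the comparison is not quite like-for-like: the `match` query is a full-text search that matched at least 10,000 documents, whereas the SQL query returned exactly one row.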

Now let's search with only the first half of the name to see whether ES performs faster. I would expect so, since the DB can no longer use the index for an exact match.

In [154]:
# Prefix search: only the trailing side of the pattern is a wildcard.
# NOTE(review): `%%` presumably escapes a literal `%` for the DB driver's
# parameter formatting -- confirm against execute_query.
db_query = """
SELECT * FROM products WHERE name like 'Pro Charger Tablet%%';
"""

execute_query(
    query=db_query,
    database=db_name_with_index,
    print_as_df=False,
    show_metrics=True,
)

[QUERY METRICS] 188 rows fetched, 3.24 ms


In [162]:
# Phrase search for the first three words of the product name.
user_input = "Pro Charger Tablet"

query = {"query": {"match_phrase": {"name": user_input}}}

search_text(query, es_standard_index)

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 188, 'relation': 'eq'},
  'max_score': 8.600696,
  'hits': [{'_index': 'standard_index',
    '_id': 'UWwtD5kBS4vSXqN9kQFO',
    '_score': 8.600696,
    '_source': {'index': 258023,
     'name': 'Pro Charger Tablet Brush',
     'description': 'Future role tree may later until one.',
     'brand': 'Grant PLC',
     'category': 'Headphones & Earbuds',
     'price': 626,
     'stock': 170,
     'color': 'LawnGreen',
     'size': 'XL',
     'availability': 'in_stock'}},
   {'_index': 'standard_index',
    '_id': 'I2wtD5kBS4vSXqN9kR3s',
    '_score': 8.600696,
    '_source': {'index': 265145,
     'name': 'Pro Charger Tablet Brush',
     'description': 'Thought year half series appear.',
     'brand': 'Flowers Group',
     'category': 'Haircare',
     'price': 726,
     'stock': 347,
     'color': 'GhostWhite',
     'size': 'XS',
     'availability': 'backorder'

So this yielded about the same result (3.24 ms for the DB vs. 4 ms for ES). I did not expect that.

Next, let's prevent the DB from using the index by putting a wildcard on both sides of the search term.

In [163]:
# "Contains" search: wildcards on both sides of the search term.
# NOTE(review): `%%` presumably escapes a literal `%` for the DB driver's
# parameter formatting -- confirm against execute_query.
db_query = """
SELECT * FROM products WHERE name like '%%Charger Tablet%%';
"""

execute_query(
    query=db_query,
    database=db_name_with_index,
    print_as_df=False,
    show_metrics=True,
)

[QUERY METRICS] 3910 rows fetched, 820.53 ms


In [164]:
# Phrase search for two words taken from the middle of the product name.
user_input = "Charger Tablet"

query = {"query": {"match_phrase": {"name": user_input}}}

search_text(query, es_standard_index)

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3910, 'relation': 'eq'},
  'max_score': 6.863493,
  'hits': [{'_index': 'standard_index',
    '_id': '7GgtD5kBS4vSXqN9dxOa',
    '_score': 6.863493,
    '_source': {'index': 642,
     'name': 'Charger Tablet Brush',
     'description': 'Set put much garden program.',
     'brand': 'Montoya Ltd',
     'category': 'Camping & Hiking',
     'price': 922,
     'stock': 837,
     'color': 'LightSeaGreen',
     'size': '50x70 cm',
     'availability': 'in_stock'}},
   {'_index': 'standard_index',
    '_id': 'R2gtD5kBS4vSXqN9dxnS',
    '_score': 6.863493,
    '_source': {'index': 2013,
     'name': 'Charger Tablet Brush',
     'description': 'Manager partner concern establish that.',
     'brand': 'Burns-Holder',
     'category': 'Fragrances',
     'price': 56,
     'stock': 720,
     'color': 'Crimson',
     'size': '10x10 cm',
     'availability': 'in_stock'}},


Now the difference is clear. ES has maintained roughly the same speed throughout, whereas the DB gets slower depending on the selectivity of the query or on whether it has to do a full table scan.