In [1]:
from helpers.db.db_setup_methods import *
from helpers.db.db_query_methods import *

from helpers.db.db_helper_methods import *

init_db(2_000_000)

Created empty database
Created empty tables
Starting to insert data

Starting insert into 'products' (2000000 rows)...
Finished inserting 2000000 rows into 'products' in 38.02s.


I just quickly want to show the difference a single column index can make in a search and when it does not affect anything

First lets compare how it looks if you search for a spesfic name and compare that to how it is different if we have the column indexd or not

In [2]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
SELECT * FROM products WHERE size = 'Large';
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN " + query, database=get_database_name(), print_as_df=True, show_metrics=False)


[QUERY METRICS] 111336 rows fetched, 747.14 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ALL,,,,,1979406,10.0,Using where


In [3]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
CREATE INDEX idx_size ON products(size);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE size = 'Large';
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

[QUERY METRICS] 111336 rows fetched, 524.16 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ref,idx_size,idx_size,403,const,223618,100.0,


This is acutally not that huge or a difference i expected so lets try on something that have lets hits.
If there is around 200.000 or under records then it seems that the performance is almost the same where there would be a slight increase in performance if there were a total of 2m records.

In [4]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
SELECT * FROM products WHERE name = 'Pro Charger Tablet Brush Go 360';
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

[QUERY METRICS] 1 rows fetched, 797.29 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ALL,,,,,1979406,10.0,Using where


In [5]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
CREATE INDEX idx_name ON products(name);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE name = 'Pro Charger Tablet Brush Go 360';
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

[QUERY METRICS] 1 rows fetched, 1.02 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ref,idx_name,idx_name,1023,const,1,100.0,


Now this is a huge difference. Even when the data size increases you hit the record in no time since there is only one row affected. 
So i need to keep in mind that a index does not help a lot if you query for something that will hit a lot.
I also think that MySQL will in its execution planner evaluate if a full table scan is more effecient or not.

But this would mean to me atleast that you can always create a index on a column and it will either be faster or as fast as a full table scan never slower unless there is something funky with the execution planner.

Lets try to drive this point home, so lets create a index where the query would select almost everything of the DB and then see if it uses the index or if it uses a full scan

In [6]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
CREATE INDEX idx_price ON products(price);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE price > 50;
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

[QUERY METRICS] 1899942 rows fetched, 8640.81 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ALL,idx_price,,,,1979406,50.0,Using where


Yeah it seems like the MySQL execution planner thinks that 50% of the rows will be affected so it wont use the index. Now lets check when it will hit it

In [7]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

query = """
CREATE INDEX idx_price ON products(price);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE price > 850;
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

[QUERY METRICS] 298055 rows fetched, 1387.48 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,ALL,idx_price,,,,1979406,30.35,Using where


In [14]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

clear_mysql_cache()

query = """
CREATE INDEX idx_price ON products(price);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE price > 950;
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

MySQL cache cleared
[QUERY METRICS] 97807 rows fetched, 796.76 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,range,idx_price,idx_price,5,,198934,100.0,Using index condition; Using MRR


In [20]:
drop_non_clustered_indexes("products", get_database_name(), output=False)

clear_mysql_cache()

query = """
CREATE INDEX idx_price ON products(price);
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=False)

query = """
SELECT * FROM products WHERE price > 998;
"""

execute_query(query, database=get_database_name(), print_as_df=False, show_metrics=True)
execute_query("EXPLAIN "+ query, database=get_database_name(), print_as_df=True, show_metrics=False)

MySQL cache cleared
[QUERY METRICS] 2005 rows fetched, 55.86 ms


Unnamed: 0,id,select_type,table,partitions,type,possible_keys,key,key_len,ref,rows,filtered,Extra
0,1,SIMPLE,products,,range,idx_price,idx_price,5,,2005,100.0,Using index condition


It seems like it first happened when the execution planer first thought that 100 percent of the rows hit would be affected by the where clause. That seems a bit odd to me, the difference between is between when the price is over 851 then it use a full scan and over 852 then it uses a range scan

In [9]:
query = """
SELECT avg(price) from products;
"""

execute_query(query, database=get_database_name(), print_as_df=True, show_metrics=True)

Unnamed: 0,avg(price)
0,499.927198


[QUERY METRICS] 1 rows fetched, 208.85 ms
