# Setup database

The CSV file can be downloaded from https://drive.google.com/uc?id=18BLAZDeH74Ll3b4GsMNY3s-YVnNmWblC&export=download

#### Config

In [1]:
from db_helper_methods import test_connection, execute_query, execute_many_query

import pymysql
import pandas as pd
import numpy as np
import time

# Name of the MySQL database that the cells below create and run their queries against.
db_name = "playground"

### Test connection

In [2]:
# Connectivity check before any setup work. The bare call is the cell's last
# expression, so whatever the helper returns is displayed as the cell output
# (judging by the recorded output, a (success, message) tuple).
test_connection()

(True, 'Successfully connected to MySQLserver!')

### Create DB

In [3]:

# Ensure the target database is present; IF NOT EXISTS keeps this cell safe to re-run.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}`;"
execute_query(create_db_sql)
print(f"Database '{db_name}' ensured to exist.")

Database 'playground' ensured to exist.


### Create Tables

In [4]:
# Column definitions shared by every benchmark table (no indexes are declared here).
base_schema = """
    `Index` INT,
    `Name` VARCHAR(255),
    `Description` TEXT,
    `Brand` VARCHAR(255),
    `Category` VARCHAR(255),
    `Price` FLOAT,
    `Currency` VARCHAR(10),
    `Stock` INT,
    `EAN` VARCHAR(50),
    `Color` VARCHAR(100),
    `Size` VARCHAR(100),
    `Availability` VARCHAR(50),
    `Internal_ID` VARCHAR(255)
"""

# Rebuild each table from scratch: drop any previous version, then create it
# again with the shared column list.
table_names = ("unindexed", "partially_indexed", "fully_indexed")
for table in table_names:
    drop_sql = f"DROP TABLE IF EXISTS {table};"
    create_sql = f"CREATE TABLE {table} ({base_schema});"
    execute_query(query=drop_sql, database=db_name)
    execute_query(query=create_sql, database=db_name)

print("Created tables: unindexed, partially_indexed, fully_indexed")

Created tables: unindexed, partially_indexed, fully_indexed


### Create Indexes

In [5]:
# TODO: create the indexes. Until this is done, all three tables share the same index-free schema.

### Insert data into tables

In [6]:
# Load CSV
csv_file = "products-2000000.csv"

# The CSV header "Internal ID" is renamed so it matches the `Internal_ID`
# column of the table schema. A chained rename (instead of inplace=True)
# keeps the cell idempotent on re-run.
df = pd.read_csv(csv_file).rename(columns={"Internal ID": "Internal_ID"})

# Replace NaN with None so missing values are sent to MySQL as NULL.
# The cast to object is required: on float64 columns newer pandas releases
# treat `None` as NaN inside `where`, so without the cast NaN would survive
# and be handed to the database driver.
df = df.astype(object).where(df.notna(), None)

# Convert DataFrame → list of plain tuples (one per row, in column order).
data = list(df.itertuples(index=False, name=None))

# Build insert template with placeholder for table name
columns = ", ".join(f"`{c}`" for c in df.columns)
placeholders = ", ".join(["%s"] * len(df.columns))
insert_sql_template = f"INSERT INTO {{table}} ({columns}) VALUES ({placeholders})"

# Tables to test
tables = ["unindexed", "partially_indexed", "fully_indexed"]

# Number of rows passed to each execute_many_query call.
batch_size = 10000

for table in tables:
    print(f"\nStarting insert into '{table}'...")

    # The statement is identical for every batch of a table, so build it once.
    insert_sql = insert_sql_template.format(table=table)

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()

    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        execute_many_query(insert_sql, batch, database=db_name)

    elapsed = time.perf_counter() - start_time
    print(f"Finished inserting {len(df)} rows into '{table}' in {elapsed:.2f} seconds.")

print("\nAll tables finished.")



Starting insert into 'unindexed'...
Finished inserting 2000000 rows into 'unindexed' in 26.01 seconds.

Starting insert into 'partially_indexed'...
Finished inserting 2000000 rows into 'partially_indexed' in 26.20 seconds.

Starting insert into 'fully_indexed'...
Finished inserting 2000000 rows into 'fully_indexed' in 26.65 seconds.

All tables finished.
