# Setup database

The product CSV can be downloaded from https://drive.google.com/uc?id=18BLAZDeH74Ll3b4GsMNY3s-YVnNmWblC&export=download. The insert step reads it as `products-2000000.csv` from the working directory.

### Test connection

In [None]:
import pymysql
from pymysql.err import OperationalError

# Connection parameters for the local MySQL server
host, user, database, port = "localhost", "root", "app", 3307
connect_kwargs = dict(host=host, user=user, database=database, port=port)

# Smoke test: open a connection, report success, then close it again.
# An OperationalError is reported on stdout instead of being raised.
try:
    connection = pymysql.connect(**connect_kwargs)
    print("Successfully connected to MySQL!")
    connection.close()
except OperationalError as err:
    print(f"Failed to connect to MySQL: {err}")

Successfully connected to MySQL!


### Create DB

In [30]:
import pymysql

# MySQL connection settings (connect to the server, not a specific DB)
host = "localhost"
user = "root"
port = 3307

db_name = "playground"

# Connect without selecting a database, because the target database may not
# exist yet.
connection = pymysql.connect(host=host, user=user, port=port)
try:
    # The cursor context manager closes the cursor on exit.
    with connection.cursor() as cursor:
        # IF NOT EXISTS makes this cell safe to re-run.
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{db_name}`;")
    print(f"Database '{db_name}' ensured to exist.")
finally:
    # Release the connection even if the statement above fails, so a failed
    # run does not leave an open connection behind in the kernel.
    connection.close()

Database 'playground' ensured to exist.


### Create Tables

In [31]:
import pymysql

# MySQL connection settings
host = "localhost"
user = "root"
database = "playground"
port = 3307

# Column definitions shared by all three tables (no indexes are defined here)
base_schema = """
    `Index` INT,
    `Name` VARCHAR(255),
    `Description` TEXT,
    `Brand` VARCHAR(255),
    `Category` VARCHAR(255),
    `Price` FLOAT,
    `Currency` VARCHAR(10),
    `Stock` INT,
    `EAN` VARCHAR(50),
    `Color` VARCHAR(100),
    `Size` VARCHAR(100),
    `Availability` VARCHAR(50),
    `Internal_ID` VARCHAR(255)
"""

# Tables to (re)create; every one gets the same column layout.
tables = ["unindexed", "partially_indexed", "fully_indexed"]

conn = pymysql.connect(host=host, user=user, database=database, port=port)
try:
    # The cursor context manager closes the cursor on exit.
    with conn.cursor() as cur:
        for table in tables:
            # Drop first so that re-running the cell starts from empty tables.
            # Table names are backtick-quoted like the column names above.
            cur.execute(f"DROP TABLE IF EXISTS `{table}`;")
            cur.execute(f"CREATE TABLE `{table}` ({base_schema});")
    conn.commit()
finally:
    # Release the connection even if one of the statements fails.
    conn.close()

print(f"Created tables: {', '.join(tables)}")


Created tables: unindexed, partially_indexed, fully_indexed


### Create Indexes

In [32]:
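# TODO: add indexes to `partially_indexed` and `fully_indexed`; all three tables currently share the same index-free schema.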

### Insert data into tables

In [33]:
import pymysql
import pandas as pd
import numpy as np
import time

# MySQL connection settings
host = "localhost"
user = "root"
database = "playground"
port = 3307

# Load CSV
csv_file = "products-2000000.csv"
df = pd.read_csv(csv_file)

# Rename the "Internal ID" column so it matches the `Internal_ID` table column.
# Assignment instead of inplace=True keeps the step explicit and chainable.
df = df.rename(columns={"Internal ID": "Internal_ID"})

# Replace missing values with None so they are sent to MySQL as NULL.
# The cast to object must come first: on float columns pandas can coerce the
# None replacement straight back to NaN, which is not a valid SQL value.
df = df.astype(object).where(df.notna(), None)

# Convert DataFrame → list of row tuples, the shape cursor.executemany expects
data = [tuple(row) for row in df.values.tolist()]

# Build the INSERT template; `{table}` is filled in later, once per target table
columns = ", ".join(f"`{c}`" for c in df.columns)
placeholders = ", ".join(["%s"] * len(df.columns))
insert_sql_template = f"INSERT INTO {{table}} ({columns}) VALUES ({placeholders})"

# Tables to test
tables = ["unindexed", "partially_indexed", "fully_indexed"]

# Number of rows sent per executemany call / commit
batch_size = 10000

# Connect
conn = pymysql.connect(host=host, user=user, database=database, port=port)
try:
    # The cursor context manager closes the cursor on exit.
    with conn.cursor() as cur:
        for table in tables:
            print(f"\nStarting insert into '{table}'...")
            # The statement text is the same for every batch of this table,
            # so build it once outside the batch loop.
            insert_sql = insert_sql_template.format(table=table)
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            start_time = time.perf_counter()

            for start in range(0, len(data), batch_size):
                batch = data[start:start + batch_size]
                cur.executemany(insert_sql, batch)
                # Commit per batch to keep each transaction bounded in size.
                conn.commit()

            elapsed = time.perf_counter() - start_time
            print(f"Finished inserting {len(df)} rows into '{table}' in {elapsed:.2f} seconds.")
finally:
    # Release the connection even if a batch fails part-way through.
    conn.close()

print("\nAll tables finished.")



Starting insert into 'unindexed'...
Finished inserting 2000000 rows into 'unindexed' in 25.15 seconds.

Starting insert into 'partially_indexed'...
Finished inserting 2000000 rows into 'partially_indexed' in 25.25 seconds.

Starting insert into 'fully_indexed'...
Finished inserting 2000000 rows into 'fully_indexed' in 26.71 seconds.

All tables finished.
