# 8. Generate Realistic Slow Queries and Metrics

## 8.1 Import Libraries and Setup

In [1]:
import mysql.connector
import pandas as pd
import random
import time
import psutil
import os
# Handle to the current Python (kernel) process, looked up by its own PID.
process = psutil.Process(os.getpid())


In [2]:
# Open the MySQL connection used by the rest of the notebook.
# Connection settings are read from environment variables so credentials do
# not have to live in the notebook; each fallback reproduces the original
# hardcoded value, so the cell behaves the same when nothing is set.
conn = mysql.connector.connect(
    host=os.environ.get("SADOP_DB_HOST", "127.0.0.1"),
    port=int(os.environ.get("SADOP_DB_PORT", "3307")),
    user=os.environ.get("SADOP_DB_USER", "sadop_user"),
    password=os.environ.get("SADOP_DB_PASSWORD", "1234"),
    database=os.environ.get("SADOP_DB_NAME", "SADOP_BDD"),
)
cursor = conn.cursor()
print("Connected to SADOP database")

Connected to SADOP database


In [3]:
# Catalogue of SQL generators. Every entry is called as template(uid, amt) and
# returns the SQL text to run; entries that do not interpolate a parameter
# simply ignore it and are written as plain (non-f) strings.
QUERY_TEMPLATES = [

# 1. Three-table join restricted to a single user (interpolates uid).
lambda uid, amt: f"""
SELECT u.user_id, u.full_name, t.amount
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
WHERE u.user_id = {uid}
""",

# 2. Unfiltered three-table join sorted by transaction date, newest first.
lambda uid, amt: """
SELECT u.user_id, t.transaction_date, t.amount
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
ORDER BY t.transaction_date DESC
""",

# 3. Per-user SUM with a HAVING threshold (interpolates amt).
lambda uid, amt: f"""
SELECT u.user_id, SUM(t.amount) AS total_amount
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
GROUP BY u.user_id
HAVING total_amount > {amt}
""",

# 4. Per-account transaction count, keeping accounts with more than 5.
lambda uid, amt: """
SELECT a.account_id, COUNT(*) AS tx_count
FROM accounts a
JOIN transactions t ON a.account_id = t.account_id
GROUP BY a.account_id
HAVING tx_count > 5
""",

# 5. Two levels of nested IN subqueries (interpolates amt).
lambda uid, amt: f"""
SELECT *
FROM user
WHERE user_id IN (
    SELECT user_id
    FROM accounts
    WHERE account_id IN (
        SELECT account_id
        FROM transactions
        WHERE amount > {amt}
    )
)
""",

# 6. EXISTS with a subquery correlated on user_id (interpolates amt).
lambda uid, amt: f"""
SELECT *
FROM user u
WHERE EXISTS (
    SELECT 1
    FROM accounts a
    JOIN transactions t ON a.account_id = t.account_id
    WHERE a.user_id = u.user_id
      AND t.amount > {amt}
)
""",

# 7. Correlated scalar subquery in the SELECT list, one SUM per user row.
lambda uid, amt: """
SELECT u.user_id,
       (
         SELECT SUM(t.amount)
         FROM accounts a
         JOIN transactions t ON a.account_id = t.account_id
         WHERE a.user_id = u.user_id
       ) AS total_amount
FROM user u
""",

# 8. Per-user COUNT of transactions above a threshold (interpolates amt).
lambda uid, amt: f"""
SELECT u.user_id, COUNT(t.transaction_id) AS tx_count
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
WHERE t.amount > {amt}
GROUP BY u.user_id
""",

# 9. Per-user SUM ordered by the aggregate, largest first.
lambda uid, amt: """
SELECT u.user_id, SUM(t.amount) AS total_amount
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
GROUP BY u.user_id
ORDER BY total_amount DESC
""",

# 10. Four-table join (user, logs, accounts, transactions) grouped by
#     user and log level.
lambda uid, amt: """
SELECT u.user_id, l.log_level, COUNT(*) AS log_count
FROM user u
JOIN logs l ON u.user_id = l.user_id
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
GROUP BY u.user_id, l.log_level
""",

# 11. DISTINCT user ids over a filtered three-table join (interpolates amt).
lambda uid, amt: f"""
SELECT DISTINCT u.user_id
FROM user u
JOIN accounts a ON u.user_id = a.user_id
JOIN transactions t ON a.account_id = t.account_id
WHERE t.amount > {amt}
""",

# 12. LIKE with a leading wildcard on the email column.
lambda uid, amt: """
SELECT *
FROM user
WHERE email LIKE '%gmail%'
""",

# 13. OR across two different columns of transactions (interpolates amt).
lambda uid, amt: f"""
SELECT *
FROM transactions
WHERE amount > {amt}
   OR transaction_date < '2025-05-02'
""",

# 14. UNION of the user_id columns of user and accounts.
lambda uid, amt: """
SELECT user_id FROM user
UNION
SELECT user_id FROM accounts
""",

# 15. Anti-join: users with no matching accounts row (LEFT JOIN + IS NULL).
lambda uid, amt: """
SELECT u.user_id
FROM user u
LEFT JOIN accounts a ON u.user_id = a.user_id
WHERE a.account_id IS NULL
""",

# 16. Aggregate over an aggregate: AVG of the per-user SUM from a derived table.
lambda uid, amt: """
SELECT AVG(total_amount)
FROM (
    SELECT SUM(t.amount) AS total_amount
    FROM accounts a
    JOIN transactions t ON a.account_id = t.account_id
    GROUP BY a.user_id
) sub
""",
]


In [4]:
# Sanity check: report how many rows the `user` table currently holds.
cursor.execute("SELECT COUNT(*) FROM user")
(count,) = cursor.fetchone()  # the result is a single one-column row
print("Users in DB:", count)

Users in DB: 20000


## 8.2 Simulate Heavy Queries with Joins and Aggregation


In [None]:
# Run a batch of randomly parameterised queries against MySQL and record, for
# each one: its wall-clock latency, the number of rows returned, a few
# structural features of the SQL text, and a CPU / memory snapshot.
# The collected records are written to a CSV file at the end of the cell.
metrics = []

NUM_QUERIES = 20000  # number of queries to run per batch
RANDOM_SEED = 42     # fixed seed so the sampled workload is reproducible

# The default keeps the original location; set SADOP_METRICS_CSV to override.
output_path = os.environ.get(
    "SADOP_METRICS_CSV",
    r"C:\Users\pc\data science\SADOP\data\slow_query_metrics.csv",
)
# Make sure the target directory exists *before* the long-running loop, so a
# bad path fails immediately instead of after the whole batch has executed.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

random.seed(RANDOM_SEED)

# Fetch all user_ids once
cursor.execute("SELECT user_id FROM user")
all_user_ids = [row[0] for row in cursor.fetchall()]
if not all_user_ids:
    raise ValueError("Table `user` is empty; cannot sample user ids")

# Set realistic amount threshold
cursor.execute("SELECT MIN(amount), MAX(amount) FROM transactions")
min_amt, max_amt = cursor.fetchone()
if min_amt is None or max_amt is None:
    # MIN/MAX return NULL on an empty table; fail with a clear message.
    raise ValueError("Table `transactions` is empty; cannot derive amount range")
amount_low, amount_high = int(min_amt), int(max_amt)

for _ in range(NUM_QUERIES):
    # pick a random user
    user_id = random.choice(all_user_ids)
    # pick a realistic amount threshold
    amount_threshold = random.randint(amount_low, amount_high)

    # pick a random query template
    query_sql = random.choice(QUERY_TEMPLATES)(user_id, amount_threshold)

    # ------------------------
    # Execute in MySQL
    # ------------------------
    # Reset psutil's CPU baseline so the reading taken after the query covers
    # the query's own execution window (interval=None is non-blocking and
    # measures system-wide utilisation since the previous call).
    psutil.cpu_percent(interval=None)
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    cursor.execute(query_sql)
    rows = cursor.fetchall()
    exec_time = time.perf_counter() - start_time
    cpu_usage = psutil.cpu_percent(interval=None)

    # ------------------------
    # Derive structural features from the SQL text
    # ------------------------
    sql_upper = query_sql.upper()
    sql_tokens = query_sql.lower().split()
    # Count table references: every FROM/JOIN keyword that is followed by a
    # table name rather than by an opening parenthesis (derived table). This
    # also covers tables reached through subqueries and UNION branches, which
    # a plain count of the word "join" misses.
    tables_count = sum(
        1
        for keyword, following in zip(sql_tokens, sql_tokens[1:])
        if keyword in ("from", "join") and not following.startswith("(")
    )

    # ------------------------
    # Save metrics
    # ------------------------
    metrics.append({
        "query": query_sql.strip(),
        "query_time": exec_time,
        "rows_returned": len(rows),
        "has_sum": int("SUM" in sql_upper),
        "has_group_by": int("GROUP BY" in sql_upper),
        "has_where": int("WHERE" in sql_upper),
        "tables_count": tables_count,
        "query_length": len(query_sql),
        "cpu_usage": cpu_usage,
        # NOTE(review): this is the memory share of the notebook process
        # itself, not of the MySQL server — confirm that is the intent.
        "memory_usage": process.memory_percent()
    })

# ------------------------
# Convert to DataFrame
# ------------------------
df_metrics = pd.DataFrame(metrics)
df_metrics.to_csv(output_path, index=False)
print(f"✅ Fully enhanced metrics saved at:\n{output_path}")

✅ Fully enhanced metrics saved at:
C:\Users\pc\data science\SADOP\data\slow_query_metrics.csv
