# 10. Generate Realistic Slow Queries and Metrics

## 10.1 Import Libraries and Setup

In [33]:

import pandas as pd
import time
import random
import psutil
import os
process = psutil.Process(os.getpid())
# Paths
users_csv = r"C:\Users\pc\data science\SADOP\data\Users.csv"
accounts_csv = r"C:\Users\pc\data science\SADOP\data\Accounts.csv"
transactions_csv = r"C:\Users\pc\data science\SADOP\data\Transactions.csv"
logs_csv = r"C:\Users\pc\data science\SADOP\data\Logs.csv"

# Load data
df_users = pd.read_csv(users_csv)
df_accounts = pd.read_csv(accounts_csv)
df_transactions = pd.read_csv(transactions_csv)
df_logs = pd.read_csv(logs_csv)

print("Tables loaded:")
print("Users:", len(df_users))
print("Accounts:", len(df_accounts))
print("Transactions:", len(df_transactions))
print("Logs:", len(df_logs))


Tables loaded:
Users: 10000
Accounts: 19968
Transactions: 250188
Logs: 79849


## 10.2 Simulate Heavy Queries with Joins and Aggregation


In [42]:
metrics = []

# Simulate 5000 heavy queries
for _ in range(5000):
    start_time = time.time()
    
    user_id = random.choice(df_users['user_id'].values)
    
    joined = df_users[df_users['user_id'] == user_id] \
             .merge(df_accounts, on='user_id', how='left') \
             .merge(df_transactions, on='account_id', how='left')
    
    agg = joined.groupby('account_id')['amount'].sum()
    
    end_time = time.time()
    exec_time = end_time - start_time
    rows_examined = len(joined)
    
    # CPU & memory usage
    cpu = psutil.cpu_percent(interval=None)
    memory = process.memory_info().rss / (1024*1024)
    
    metrics.append({
        "query": f"SUM transactions for user_id={user_id}",
        "query_time": exec_time,
        "rows_examined": rows_examined,
        "joins": 2,
        "has_sum": 1,
        "has_group_by": 1,
        "has_where": 1,
        "tables_count": 3,
        "query_length": 38,
        "cpu_usage": cpu,
        "memory_usage": memory
    })

df_metrics = pd.DataFrame(metrics)


In [36]:
df_metrics.describe()

Unnamed: 0,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,0.022056,25.335667,2.0,1.0,1.0,1.0,3.0,38.0,19.376233,169.29899
std,0.00337,12.078561,0.0,0.0,0.0,0.0,0.0,0.0,11.678005,0.690227
min,0.017999,5.0,2.0,1.0,1.0,1.0,3.0,38.0,6.2,166.429688
25%,0.020002,16.0,2.0,1.0,1.0,1.0,3.0,38.0,12.5,168.813477
50%,0.020999,25.0,2.0,1.0,1.0,1.0,3.0,38.0,15.4,169.373047
75%,0.02256,34.0,2.0,1.0,1.0,1.0,3.0,38.0,25.0,169.860352
max,0.057519,59.0,2.0,1.0,1.0,1.0,3.0,38.0,100.0,170.386719


## 10.3 Simulate Missing Index Effect (Shuffle Transactions)


In [43]:
metrics_missing_index = []

df_transactions_shuffled = df_transactions.sample(frac=1).reset_index(drop=True)

for _ in range(2000):
    start_time = time.time()
    
    joined = df_accounts.merge(df_transactions_shuffled, on='account_id', how='left')
    agg = joined.groupby('user_id')['amount'].sum()
    
    end_time = time.time()
    exec_time = end_time - start_time
    rows_examined = len(joined)
    
    cpu = psutil.cpu_percent(interval=None)
    memory = process.memory_info().rss / (1024*1024)
    
    metrics_missing_index.append({
        "query": "SUM transactions (simulated missing index)",
        "query_time": exec_time,
        "rows_examined": rows_examined,
        "joins": 1,
        "has_sum": 1,
        "has_group_by": 1,
        "has_where": 0,
        "tables_count": 2,
        "query_length": 42,
        "cpu_usage": cpu,
        "memory_usage": memory
    })

df_metrics_missing = pd.DataFrame(metrics_missing_index)


In [37]:
df_metrics_missing.describe()

Unnamed: 0,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.078665,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,20.9772,198.806047
std,0.012106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.071184,0.20074
min,0.067628,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,10.3,198.460938
25%,0.072513,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,14.7,198.631836
50%,0.074645,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,17.9,198.892578
75%,0.079533,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,25.0,198.930664
max,0.269064,250188.0,1.0,1.0,1.0,0.0,2.0,42.0,89.9,199.1875


## 10.4 Combine and Shuffle Datasets

In [44]:
combined_metrics = pd.concat([df_metrics, df_metrics_missing], ignore_index=True)

# Shuffle
combined_metrics = combined_metrics.sample(frac=1, random_state=42).reset_index(drop=True)


In [45]:
spike_metrics = []

for idx, row in combined_metrics.iterrows():
    for _ in range(3):  # simulate 3x load
        exec_time = row['query_time'] * random.uniform(1, 1.5)
        cpu = psutil.cpu_percent(interval=None)
        memory = process.memory_info().rss / (1024*1024)
        
        spike_metrics.append({
            "query": row['query'],
            "query_time": exec_time,
            "rows_examined": row['rows_examined'],
            "joins": row['joins'],
            "has_sum": row['has_sum'],
            "has_group_by": row['has_group_by'],
            "has_where": row['has_where'],
            "tables_count": row['tables_count'],
            "query_length": row['query_length'],
            "cpu_usage": cpu,
            "memory_usage": memory
        })

df_spike_metrics_final = pd.DataFrame(spike_metrics)


In [46]:
df_spike_metrics_final.describe()

Unnamed: 0,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0
mean,0.046609,71500.254714,1.714286,1.0,1.0,0.714286,2.714286,39.142857,0.69101,191.063795
std,0.031961,113014.744349,0.451765,0.0,0.0,0.451765,0.451765,1.807059,7.722702,1.015428
min,0.018242,5.0,1.0,1.0,1.0,0.0,2.0,38.0,0.0,190.453125
25%,0.024632,18.0,1.0,1.0,1.0,0.0,2.0,38.0,0.0,190.453125
50%,0.028639,32.0,2.0,1.0,1.0,1.0,3.0,38.0,0.0,190.527344
75%,0.078004,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,0.0,191.317383
max,0.22935,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,100.0,194.011719


## 10.6 Save Fully Enhanced Dataset

In [47]:
output_path = r"C:\Users\pc\data science\SADOP\data\slow_query_metrics_final.csv"
df_spike_metrics_final.to_csv(output_path, index=False)
print(f"✅ Fully enhanced metrics saved at:\n{output_path}") 


✅ Fully enhanced metrics saved at:
C:\Users\pc\data science\SADOP\data\slow_query_metrics_final.csv
