# 10. Generate Realistic Slow Queries and Metrics

In [1]:
import os

import pandas as pd

# Data directory. Defaults to the original local folder; set the
# SADOP_DATA_DIR environment variable to point the notebook at another copy
# of the data without editing this cell.
DATA_DIR = os.environ.get("SADOP_DATA_DIR", r"C:\Users\pc\data science\SADOP\data")

# Paths (plain strings built from DATA_DIR)
users_csv = os.path.join(DATA_DIR, "Users.csv")
accounts_csv = os.path.join(DATA_DIR, "Accounts.csv")
transactions_csv = os.path.join(DATA_DIR, "Transactions.csv")
logs_csv = os.path.join(DATA_DIR, "Logs.csv")

# Load data: one DataFrame per CSV file
df_users = pd.read_csv(users_csv)
df_accounts = pd.read_csv(accounts_csv)
df_transactions = pd.read_csv(transactions_csv)
df_logs = pd.read_csv(logs_csv)

# Row counts as a quick check that every table was read
print("Tables loaded:")
print("Users:", len(df_users))
print("Accounts:", len(df_accounts))
print("Transactions:", len(df_transactions))
print("Logs:", len(df_logs))


Tables loaded:
Users: 10000
Accounts: 19968
Transactions: 250188
Logs: 79849


## 10.1 Simulate Large Joins and Aggregates


In [2]:
import time
import random

# Inputs: df_users, df_accounts and df_transactions, loaded from CSV earlier
# in this notebook.

# Number of simulated heavy queries (adjust depending on performance).
N_USER_QUERIES = 1000
# Fixed seed so the same users are sampled on every Restart & Run All.
USER_QUERY_SEED = 42

# Dedicated generator: reproducible sampling without reseeding the global
# `random` state that other cells may rely on.
user_rng = random.Random(USER_QUERY_SEED)

# Loop-invariant, so it is built once. A plain list is used because
# random.choice is documented for sequences, not NumPy arrays.
candidate_user_ids = df_users['user_id'].tolist()

metrics = []

for _ in range(N_USER_QUERIES):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()

    # Randomly choose a user
    user_id = user_rng.choice(candidate_user_ids)

    # Large join: Users -> Accounts -> Transactions
    joined = (
        df_users[df_users['user_id'] == user_id]
        .merge(df_accounts, on='user_id', how='left')
        .merge(df_transactions, on='account_id', how='left')
    )

    # Aggregate example: sum of transactions per account. The result itself is
    # not used afterwards; it is computed so its cost is part of the timing.
    agg = joined.groupby('account_id')['amount'].sum()

    exec_time = time.perf_counter() - start_time
    rows_examined = len(joined)

    metrics.append({
        "query": f"SUM transactions for user_id={user_id}",
        "query_time": exec_time,
        "rows_examined": rows_examined,
        "joins": 2  # Users->Accounts->Transactions
    })

# One row per simulated query
df_metrics = pd.DataFrame(metrics)
df_metrics.head()


Unnamed: 0,query,query_time,rows_examined,joins
0,SUM transactions for user_id=1615,0.055942,23,2
1,SUM transactions for user_id=7965,0.026148,21,2
2,SUM transactions for user_id=5778,0.023165,24,2
3,SUM transactions for user_id=7861,0.029534,20,2
4,SUM transactions for user_id=7450,0.023048,13,2


In [3]:
# Rank the simulated queries from slowest to fastest.
df_metrics.sort_values("query_time", ascending=False)

Unnamed: 0,query,query_time,rows_examined,joins
83,SUM transactions for user_id=5133,0.063969,11,2
0,SUM transactions for user_id=1615,0.055942,23,2
84,SUM transactions for user_id=7806,0.049126,57,2
138,SUM transactions for user_id=8979,0.037707,47,2
135,SUM transactions for user_id=2753,0.037560,29,2
...,...,...,...,...
286,SUM transactions for user_id=8986,0.018635,11,2
292,SUM transactions for user_id=3108,0.018530,19,2
293,SUM transactions for user_id=6719,0.018524,32,2
264,SUM transactions for user_id=2174,0.018214,28,2


## 10.2 Simulate Missing Index Effect


In [None]:
# Number of full-table join runs. Every iteration joins all accounts with all
# transactions, so lower this if the cell takes too long.
N_FULL_SCAN_QUERIES = 2000
# Fixed seed so the shuffled row order is identical on every run.
SHUFFLE_SEED = 42

# Shuffle every transaction row (frac=1) and renumber the index.
df_transactions_shuffled = (
    df_transactions
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

metrics_missing_index = []

for _ in range(N_FULL_SCAN_QUERIES):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()

    # Join shuffled Transactions to simulate slow scan
    joined = df_accounts.merge(df_transactions_shuffled, on='account_id', how='left')

    # Total amount per user. The result itself is not used afterwards; it is
    # computed so its cost is part of the timing.
    agg = joined.groupby('user_id')['amount'].sum()

    exec_time = time.perf_counter() - start_time
    rows_examined = len(joined)

    metrics_missing_index.append({
        "query": "SUM transactions (simulated missing index)",
        "query_time": exec_time,
        "rows_examined": rows_examined,
        "joins": 1
    })

# One row per simulated query
df_metrics_missing = pd.DataFrame(metrics_missing_index)
df_metrics_missing.head()
