## Imports & Configuratio

In [1]:
import mysql.connector
import pandas as pd
import numpy as np
import json

## MySQL Connection

In [2]:
import mysql.connector
import pandas as pd

conn = mysql.connector.connect(
    host="127.0.0.1",
    port=3307,
    user="sadop_user",
    password="1234",
    database="sadop_DB"
)
cursor = conn.cursor(dictionary=True)
print("Connected to SADOP database")

Connected to SADOP database


## Load Queries to Analyze

In [4]:
df_queries = pd.read_csv("../data/slow_query_metrics_final.csv")

queries = df_queries["query"].dropna().unique().tolist()

print(f"Loaded {len(queries)} unique queries for EXPLAIN analysis")


Loaded 3909 unique queries for EXPLAIN analysis


## Function: Run EXPLAIN

In [5]:
def run_explain(query):
    try:
        cursor.execute(f"EXPLAIN {query}")
        result = cursor.fetchall()
        return result
    except Exception as e:
        return None


## Function: Extract Plan Features

In [6]:
def extract_explain_features(explain_rows):
    features = {
        "full_table_scan": 0,
        "using_filesort": 0,
        "using_temporary": 0,
        "estimated_rows": 0,
        "join_type_all": 0
    }

    if explain_rows is None:
        return features

    for row in explain_rows:
        if row.get("type") == "ALL":
            features["full_table_scan"] = 1
            features["join_type_all"] = 1

        extra = row.get("Extra", "")
        if extra:
            if "Using filesort" in extra:
                features["using_filesort"] = 1
            if "Using temporary" in extra:
                features["using_temporary"] = 1

        features["estimated_rows"] += row.get("rows", 0)

    return features


## Run EXPLAIN on All Queries

In [7]:
explain_features = []

for q in queries:
    explain_result = run_explain(q)
    features = extract_explain_features(explain_result)
    features["query"] = q
    explain_features.append(features)

df_explain = pd.DataFrame(explain_features)
df_explain.head()


Unnamed: 0,full_table_scan,using_filesort,using_temporary,estimated_rows,join_type_all,query
0,0,0,0,0,0,SUM transactions (simulated missing index)
1,0,0,0,0,0,SUM transactions for user_id=141
2,0,0,0,0,0,SUM transactions for user_id=3496
3,0,0,0,0,0,SUM transactions for user_id=4774
4,0,0,0,0,0,SUM transactions for user_id=7613


In [8]:
df_explain.describe()

Unnamed: 0,full_table_scan,using_filesort,using_temporary,estimated_rows,join_type_all
count,3909.0,3909.0,3909.0,3909.0,3909.0
mean,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0


## Define Execution Risk Level

In [None]:
def classify_risk(row):
    if row["full_table_scan"] == 1 or row["using_filesort"] == 1:
        return "HIGH"
    if row["using_temporary"] == 1:
        return "MEDIUM"
    return "LOW"

df_explain["execution_risk"] = df_explain.apply(classify_risk, axis=1)
df_explain["execution_risk"].value_counts()


In [None]:
df_merged = df_queries.merge(
    df_explain,
    on="query",
    how="left"
)

df_merged.head()


## Save Execution Plan Features

In [None]:
output_path = "../data/explain_features.csv"
df_merged.to_csv(output_path, index=False)

print(f"EXPLAIN features saved to {output_path}")


## Summary

In this notebook we:

- Executed MySQL EXPLAIN on realistic workload queries
- Parsed execution plans into structured signals
- Detected:
  - Full table scans
  - Temporary tables
  - Filesort usage
- Classified execution risk (LOW / MEDIUM / HIGH)
- Generated features usable by:
  - ML Diagnostic Engine
  - RL Index Optimization Agent

This notebook provides the **execution intelligence layer**
for automated database optimization.
