In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the slow-query metrics CSV (path is relative to the current working
# directory) and preview the first five rows.
df = pd.read_csv(
    "../data/slow_query_metrics_final.csv"
)
df.head(5)


Unnamed: 0,query,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
0,SUM transactions (simulated missing index),0.115103,250188,1,1,1,0,2,42,11.3,190.453125
1,SUM transactions (simulated missing index),0.082442,250188,1,1,1,0,2,42,0.0,190.453125
2,SUM transactions (simulated missing index),0.083628,250188,1,1,1,0,2,42,0.0,190.453125
3,SUM transactions for user_id=141,0.027882,15,2,1,1,1,3,38,0.0,190.453125
4,SUM transactions for user_id=141,0.0248,15,2,1,1,1,3,38,0.0,190.453125


In [4]:
# Seed NumPy's global RNG. Nothing in this cell draws random numbers; the
# np.random.uniform call inside simulate_index_impact (later in the notebook)
# is the consumer of this seed.
np.random.seed(42)

# Derive EXPLAIN-like columns from the measured metrics using fixed thresholds:
#   explain_type    -> "ALL" when more than 10000 rows were examined, else "ref"
#   using_filesort  -> 1 when query_length exceeds 40, else 0
#   using_temporary -> 1 when the query has more than one join, else 0
df["explain_type"] = np.where(df["rows_examined"].gt(10000), "ALL", "ref")
df["using_filesort"] = df["query_length"].gt(40).astype(int)
df["using_temporary"] = df["joins"].gt(1).astype(int)

df.loc[:, ["explain_type", "using_filesort", "using_temporary"]].head()


Unnamed: 0,explain_type,using_filesort,using_temporary
0,ALL,1,0
1,ALL,1,0
2,ALL,1,0
3,ref,0,1
4,ref,0,1


## Query Risk Classification

Rules based on DBA best practices:

HIGH RISK:
- Full table scan (type = ALL) combined with high rows examined (> 10,000)

MEDIUM RISK:
- Index used but inefficient: the query still needs a filesort or a temporary table

LOW RISK:
- Ref/range access with low rows and no filesort or temporary table

In [5]:
def classify_risk(row):
    """Assign a coarse risk label to one query record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``explain_type``, ``rows_examined``,
        ``using_filesort`` and ``using_temporary``.

    Returns
    -------
    str
        ``"HIGH"`` when ``explain_type`` is ``"ALL"`` and more than 10000
        rows were examined; otherwise ``"MEDIUM"`` when either
        ``using_filesort`` or ``using_temporary`` is truthy; otherwise
        ``"LOW"``.
    """
    is_full_scan = row["explain_type"] == "ALL"
    if is_full_scan and row["rows_examined"] > 10000:
        return "HIGH"

    # Not a large full scan: fall back to the filesort / temporary-table flags.
    needs_extra_work = row["using_filesort"] or row["using_temporary"]
    return "MEDIUM" if needs_extra_work else "LOW"

# Label every row with its risk level, then show how many rows fall in each level.
df["risk_level"] = df.apply(classify_risk, axis="columns")
df.loc[:, "risk_level"].value_counts()


risk_level
MEDIUM    15000
HIGH       6000
Name: count, dtype: int64

In [9]:
def recommend_index(row):
    """Build a human-readable index recommendation for one query record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``risk_level``, ``has_where``, ``joins`` and
        ``using_filesort``.

    Returns
    -------
    str
        The applicable suggestions joined with ``" | "``, or
        ``"No index change recommended"`` when none apply.
    """
    is_high_risk = row["risk_level"] == "HIGH"
    suggestions = []

    # WHERE / JOIN index suggestions are only made for HIGH risk rows.
    if is_high_risk and row["has_where"]:
        suggestions.append("Create index on WHERE column(s)")
    if is_high_risk and row["joins"] > 0:
        suggestions.append("Create index on JOIN column(s)")

    # The filesort suggestion applies regardless of risk level.
    if row["using_filesort"]:
        suggestions.append("Create index to support ORDER BY / GROUP BY")

    return " | ".join(suggestions) if suggestions else "No index change recommended"

# Compute the recommendation text for every row (row-wise apply).
df["index_recommendation"] = df.apply(recommend_index, axis="columns")



In [10]:

# Preview risk level, recommendation and measured query time side by side.
df.loc[:, ["risk_level", "index_recommendation", "query_time"]].head()

Unnamed: 0,risk_level,index_recommendation,query_time
0,HIGH,Create index on JOIN column(s) | Create index ...,0.115103
1,HIGH,Create index on JOIN column(s) | Create index ...,0.082442
2,HIGH,Create index on JOIN column(s) | Create index ...,0.083628
3,MEDIUM,No index change recommended,0.027882
4,MEDIUM,No index change recommended,0.0248


In [11]:
# NOTE(review): this cell re-defines recommend_index with the same logic as the
# definition in the earlier In [9] cell; the later definition shadows the earlier one.
def recommend_index(row):
    """Return index-tuning advice for a single query record.

    Reads ``risk_level``, ``has_where``, ``joins`` and ``using_filesort``
    from ``row`` and returns the matching suggestions joined with ``" | "``.
    When no suggestion applies, returns ``"No index change recommended"``.
    """
    advice = []

    if row["risk_level"] == "HIGH":
        # Only HIGH risk rows get WHERE / JOIN index advice.
        advice += ["Create index on WHERE column(s)"] if row["has_where"] else []
        advice += ["Create index on JOIN column(s)"] if row["joins"] > 0 else []

    if row["using_filesort"]:
        advice += ["Create index to support ORDER BY / GROUP BY"]

    if advice:
        return " | ".join(advice)
    return "No index change recommended"

# Recompute the recommendation column and preview it next to the risk level.
df["index_recommendation"] = df.apply(recommend_index, axis="columns")
df.loc[:, ["risk_level", "index_recommendation"]].head()


Unnamed: 0,risk_level,index_recommendation
0,HIGH,Create index on JOIN column(s) | Create index ...
1,HIGH,Create index on JOIN column(s) | Create index ...
2,HIGH,Create index on JOIN column(s) | Create index ...
3,MEDIUM,No index change recommended
4,MEDIUM,No index change recommended


## Recommendation Distribution

This shows how often the system suggests index creation.


In [12]:
# Frequency of each distinct recommendation string.
df.loc[:, "index_recommendation"].value_counts()

index_recommendation
No index change recommended                                                     15000
Create index on JOIN column(s) | Create index to support ORDER BY / GROUP BY     6000
Name: count, dtype: int64

## Simulated Impact of Index Recommendation

We estimate improvement assuming:
- Index reduces execution time by 20–40% on HIGH risk queries


In [13]:
def simulate_index_impact(row):
    """Simulate a query's execution time after applying the recommended index.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``risk_level`` and ``query_time``.

    Returns
    -------
    The original ``query_time`` for rows whose ``risk_level`` is not
    ``"HIGH"``. For ``"HIGH"`` rows, ``query_time`` multiplied by a factor
    drawn from ``np.random.uniform(0.6, 0.8)`` (NumPy's global RNG), i.e. a
    simulated 20-40% reduction.
    """
    if row["risk_level"] != "HIGH":
        # Only HIGH risk queries are assumed to benefit from the index.
        return row["query_time"]

    baseline = row["query_time"]
    return baseline * np.random.uniform(0.6, 0.8)

# Re-seed NumPy's global RNG immediately before the stochastic step so that
# re-running this cell on its own reproduces the same simulated timings.
# The seed value matches the np.random.seed(42) call made earlier in the
# notebook, so a fresh top-to-bottom run is unaffected.
np.random.seed(42)
df["optimized_query_time"] = df.apply(simulate_index_impact, axis=1)


## Performance Gain Evaluation

In [14]:
# Absolute time saved per query: measured time minus simulated optimized time.
df["time_gain"] = df["query_time"].sub(df["optimized_query_time"])

# Mean measured time, simulated time and time saved for each risk level.
df.groupby("risk_level")[["query_time", "optimized_query_time", "time_gain"]].mean()


Unnamed: 0_level_0,query_time,optimized_query_time,time_gain
risk_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HIGH,0.094758,0.066261,0.028498
MEDIUM,0.027349,0.027349,0.0
