# 1. Data Pipeline for AI 

## 1.1 Load Performance Metrics Dataset 

In [10]:
import pandas as pd

metrics_path = r"C:\Users\pc\data science\SADOP\data\slow_query_metrics.csv"
df = pd.read_csv(metrics_path)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (20000, 10)


Unnamed: 0,query,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
0,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.641623,1,1,1,0,2,178,0.0,0.746455
1,"SELECT u.user_id, t.transaction_date, t.amount...",1.644557,250000,0,0,0,3,185,12.5,1.108117
2,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.650696,1,1,1,0,2,178,0.0,0.763426
3,"SELECT u.user_id, COUNT(t.transaction_id) AS t...",0.620023,16520,0,1,1,3,200,0.0,0.764052
4,SELECT DISTINCT u.user_id\nFROM user u\nJOIN a...,0.347722,16329,0,0,1,3,152,0.0,0.768193


## 1.2 Inspect Feature Distributions

In [31]:
df.describe()

Unnamed: 0,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2.460381,42186.36685,0.24755,0.37065,0.5622,2.2546,157.01475,7.91892,1.385129
std,128.639992,75246.922883,0.4316,0.482991,0.496128,0.908251,54.007287,16.800883,0.104171
min,0.0,0.0,0.0,0.0,0.0,1.0,45.0,0.0,0.746455
25%,0.099602,3052.0,0.0,0.0,0.0,1.0,148.0,0.0,1.345806
50%,0.574728,16520.0,0.0,0.0,1.0,2.0,181.0,0.0,1.401846
75%,0.846677,25143.0,0.0,1.0,1.0,3.0,196.0,12.5,1.462074
max,18149.735816,250000.0,1.0,1.0,1.0,4.0,225.0,100.0,1.917593


## 11.3 Define ML Features

In [33]:
# Select features
FEATURE_COLUMNS = [
    "query_time",
    "rows_returned",
    "has_sum",
    "has_group_by",
    "has_where",
    "tables_count",
    "query_length",
    "cpu_usage",
    "memory_usage"
]

X = df[FEATURE_COLUMNS]
X.describe()
# Target variable


Unnamed: 0,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2.460381,42186.36685,0.24755,0.37065,0.5622,2.2546,157.01475,7.91892,1.385129
std,128.639992,75246.922883,0.4316,0.482991,0.496128,0.908251,54.007287,16.800883,0.104171
min,0.0,0.0,0.0,0.0,0.0,1.0,45.0,0.0,0.746455
25%,0.099602,3052.0,0.0,0.0,0.0,1.0,148.0,0.0,1.345806
50%,0.574728,16520.0,0.0,0.0,1.0,2.0,181.0,0.0,1.401846
75%,0.846677,25143.0,0.0,1.0,1.0,3.0,196.0,12.5,1.462074
max,18149.735816,250000.0,1.0,1.0,1.0,4.0,225.0,100.0,1.917593


## 11.4 Create Performance Labels 

In [62]:

SLOW_QUERY_THRESHOLD = 1

df["is_slow"] = (df["query_time"] >= SLOW_QUERY_THRESHOLD).astype(int)

y = df["is_slow"]

df[["query_time", "is_slow"]].head()

Unnamed: 0,query_time,is_slow
0,0.641623,0
1,1.644557,1
2,0.650696,0
3,0.620023,0
4,0.347722,0


## 11.5 Check Class Balance

In [63]:
df["is_slow"].value_counts(normalize=True)

is_slow
0    0.7786
1    0.2214
Name: proportion, dtype: float64

In [55]:
df

Unnamed: 0,query,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage,is_slow
0,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.641623,1,1,1,0,2,176,0.0,0.746455,0
1,"SELECT u.user_id, t.transaction_date, t.amount...",1.644557,250000,0,0,0,3,183,12.5,1.108117,1
2,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.650696,1,1,1,0,2,176,0.0,0.763426,0
3,"SELECT u.user_id, COUNT(t.transaction_id) AS t...",0.620023,16520,0,1,1,3,198,0.0,0.764052,0
4,SELECT DISTINCT u.user_id\nFROM user u\nJOIN a...,0.347722,16329,0,0,1,3,150,0.0,0.768193,0
...,...,...,...,...,...,...,...,...,...,...,...
19995,"SELECT u.user_id, SUM(t.amount) AS total_amoun...",0.739858,16520,1,1,0,3,196,0.0,1.499314,1
19996,SELECT *\nFROM user u\nWHERE EXISTS (\n SEL...,0.326831,16437,0,0,1,2,181,0.0,1.499314,0
19997,SELECT *\nFROM user\nWHERE email LIKE '%gmail%',0.013524,0,0,0,1,1,45,0.0,1.499314,0
19998,SELECT user_id FROM user\nUNION\nSELECT user_i...,0.083571,20000,0,0,0,1,59,0.0,1.499314,0


## 11.6 Save Feature Dataset 

In [56]:
feature_dataset_path = r"C:\Users\pc\data science\SADOP\data\ml_features.csv"
df.to_csv(feature_dataset_path, index=False)

print(f"ML-ready dataset saved to {feature_dataset_path}")

ML-ready dataset saved to C:\Users\pc\data science\SADOP\data\ml_features.csv


In [57]:
df.describe()

Unnamed: 0,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage,is_slow
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2.460381,42186.36685,0.24755,0.37065,0.5622,2.2546,157.01475,7.91892,1.385129,0.41
std,128.639992,75246.922883,0.4316,0.482991,0.496128,0.908251,54.007287,16.800883,0.104171,0.491846
min,0.0,0.0,0.0,0.0,0.0,1.0,45.0,0.0,0.746455,0.0
25%,0.099602,3052.0,0.0,0.0,0.0,1.0,148.0,0.0,1.345806,0.0
50%,0.574728,16520.0,0.0,0.0,1.0,2.0,181.0,0.0,1.401846,0.0
75%,0.846677,25143.0,0.0,1.0,1.0,3.0,196.0,12.5,1.462074,1.0
max,18149.735816,250000.0,1.0,1.0,1.0,4.0,225.0,100.0,1.917593,1.0
