# 11. Data Pipeline for AI 

## 11.1 Load Performance Metrics Dataset 

In [69]:
import pandas as pd

metrics_path = r"C:\Users\pc\data science\SADOP\data\slow_query_metrics_final.csv"
df = pd.read_csv(metrics_path)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (21000, 11)


Unnamed: 0,query,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
0,SUM transactions (simulated missing index),0.115103,250188,1,1,1,0,2,42,11.3,190.453125
1,SUM transactions (simulated missing index),0.082442,250188,1,1,1,0,2,42,0.0,190.453125
2,SUM transactions (simulated missing index),0.083628,250188,1,1,1,0,2,42,0.0,190.453125
3,SUM transactions for user_id=141,0.027882,15,2,1,1,1,3,38,0.0,190.453125
4,SUM transactions for user_id=141,0.0248,15,2,1,1,1,3,38,0.0,190.453125


## 11.2 Inspect Feature Distributions

In [70]:
df.describe()

Unnamed: 0,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage
count,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0
mean,0.046609,71500.254714,1.714286,1.0,1.0,0.714286,2.714286,39.142857,0.69101,191.063795
std,0.031961,113014.744349,0.451765,0.0,0.0,0.451765,0.451765,1.807059,7.722702,1.015428
min,0.018242,5.0,1.0,1.0,1.0,0.0,2.0,38.0,0.0,190.453125
25%,0.024632,18.0,1.0,1.0,1.0,0.0,2.0,38.0,0.0,190.453125
50%,0.028639,32.0,2.0,1.0,1.0,1.0,3.0,38.0,0.0,190.527344
75%,0.078004,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,0.0,191.317383
max,0.22935,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,100.0,194.011719


## 11.3 Define ML Features

In [71]:
df["query_length"] = df["query"].astype(str).apply(len)

# Select features
features = ["rows_examined", "joins", "query_length"]
X = df[features]

# Target variable
y = df["query_time"]

X.head() 

Unnamed: 0,rows_examined,joins,query_length
0,250188,1,42
1,250188,1,42
2,250188,1,42
3,15,2,32
4,15,2,32


In [31]:
y.head()

0    0.030133
1    0.031161
2    0.034487
3    0.076301
4    0.095708
Name: query_time, dtype: float64

## 11.4 Create Performance Labels 

In [72]:
SLOW_THRESHOLD = 0.04
df["is_slow"] = (
    df["query_time"] >= SLOW_THRESHOLD
).astype(int)


## 11.5 Check Class Balance

In [73]:
df["is_slow"].value_counts(normalize=True)

is_slow
0    0.684952
1    0.315048
Name: proportion, dtype: float64

## 11.6 Save Feature Dataset 

In [76]:
feature_dataset_path = r"C:\Users\pc\data science\SADOP\data\ml_features.csv"
df.to_csv(feature_dataset_path, index=False)

print(f"ML-ready dataset saved to {feature_dataset_path}")

ML-ready dataset saved to C:\Users\pc\data science\SADOP\data\ml_features.csv


In [75]:
df.describe()

Unnamed: 0,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage,is_slow
count,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0
mean,0.046609,71500.254714,1.714286,1.0,1.0,0.714286,2.714286,35.487143,0.69101,191.063795,0.315048
std,0.031961,113014.744349,0.451765,0.0,0.0,0.451765,0.451765,4.130142,7.722702,1.015428,0.464546
min,0.018242,5.0,1.0,1.0,1.0,0.0,2.0,30.0,0.0,190.453125,0.0
25%,0.024632,18.0,1.0,1.0,1.0,0.0,2.0,33.0,0.0,190.453125,0.0
50%,0.028639,32.0,2.0,1.0,1.0,1.0,3.0,33.0,0.0,190.527344,0.0
75%,0.078004,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,0.0,191.317383,1.0
max,0.22935,250188.0,2.0,1.0,1.0,1.0,3.0,42.0,100.0,194.011719,1.0
