# 13. Normalize Numeric Features & Train/Test Split


## 13.1 Import Libraries


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
feature_dataset_path = r"C:\Users\pc\data science\SADOP\data\ml_features.csv"
df = pd.read_csv(feature_dataset_path)
df

Unnamed: 0,query,query_time,rows_returned,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage,is_slow
0,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.641623,1,1,1,0,2,176,0.0,0.746455,0
1,"SELECT u.user_id, t.transaction_date, t.amount...",1.644557,250000,0,0,0,3,183,12.5,1.108117,1
2,SELECT AVG(total_amount)\nFROM (\n SELECT S...,0.650696,1,1,1,0,2,176,0.0,0.763426,0
3,"SELECT u.user_id, COUNT(t.transaction_id) AS t...",0.620023,16520,0,1,1,3,198,0.0,0.764052,0
4,SELECT DISTINCT u.user_id\nFROM user u\nJOIN a...,0.347722,16329,0,0,1,3,150,0.0,0.768193,0
...,...,...,...,...,...,...,...,...,...,...,...
19995,"SELECT u.user_id, SUM(t.amount) AS total_amoun...",0.739858,16520,1,1,0,3,196,0.0,1.499314,1
19996,SELECT *\nFROM user u\nWHERE EXISTS (\n SEL...,0.326831,16437,0,0,1,2,181,0.0,1.499314,0
19997,SELECT *\nFROM user\nWHERE email LIKE '%gmail%',0.013524,0,0,0,1,1,45,0.0,1.499314,0
19998,SELECT user_id FROM user\nUNION\nSELECT user_i...,0.083571,20000,0,0,0,1,59,0.0,1.499314,0


## 3.2 Log-Transform query_time


In [10]:
# Avoid log(0) by adding a tiny value
df['query_time_log'] = np.log1p(df['query_time'])
df[['query_time', 'query_time_log']].describe()


Unnamed: 0,query_time,query_time_log
count,20000.0,20000.0
mean,2.460381,0.558713
std,128.639992,0.651073
min,0.0,0.0
25%,0.099602,0.094948
50%,0.574728,0.454083
75%,0.846677,0.613388
max,18149.735816,9.806466


## 3.3 Select Features & Targets


In [14]:
FEATURES = [
    "rows_returned",
    "tables_count",
    "query_length",
    "has_sum",
    "has_group_by",
    "has_where",
    "cpu_usage",
    "memory_usage"
]

X = df[FEATURES]
y_reg = df["query_time_log"]      # regression target
y_cls = df["is_slow"]             # classification target

## 3.4 Train/Test Split


In [15]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_reg_train, y_reg_test = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

_, _, y_cls_train, y_cls_test = train_test_split(
    X, y_cls, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (16000, 8)
Test size: (4000, 8)


## 3.5 Normalize numeric features

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3.6 Save Preprocessed Datasets 


In [19]:
pd.DataFrame(X_train_scaled, columns=FEATURES).to_csv("../data/X_train.csv", index=False)
pd.DataFrame(X_test_scaled, columns=FEATURES).to_csv("../data/X_test.csv", index=False)

y_reg_train.to_csv("../data/y_train_reg.csv", index=False)
y_reg_test.to_csv("../data/y_test_reg.csv", index=False)

y_cls_train.to_csv("../data/y_train_cls.csv", index=False)
y_cls_test.to_csv("../data/y_test_cls.csv", index=False)

print("✅ Preprocessed datasets saved")

✅ Preprocessed datasets saved
