# 13. Normalize Numeric Features & Train/Test Split


## 13.1 Import Libraries


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Location of the feature table consumed by this section.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this folder exists.
feature_dataset_path = r"C:\Users\pc\data science\SADOP\data\ml_features.csv"

# Read the CSV into the frame every later cell in this section works from,
# then show it as the cell output.
df_spike_metrics_final = pd.read_csv(filepath_or_buffer=feature_dataset_path)
df_spike_metrics_final

Unnamed: 0,query,query_time,rows_examined,joins,has_sum,has_group_by,has_where,tables_count,query_length,cpu_usage,memory_usage,is_slow
0,SUM transactions (simulated missing index),0.115103,250188,1,1,1,0,2,42,11.3,190.453125,1
1,SUM transactions (simulated missing index),0.082442,250188,1,1,1,0,2,42,0.0,190.453125,1
2,SUM transactions (simulated missing index),0.083628,250188,1,1,1,0,2,42,0.0,190.453125,1
3,SUM transactions for user_id=141,0.027882,15,2,1,1,1,3,32,0.0,190.453125,0
4,SUM transactions for user_id=141,0.024800,15,2,1,1,1,3,32,0.0,190.453125,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20995,SUM transactions (simulated missing index),0.082286,250188,1,1,1,0,2,42,0.0,194.011719,1
20996,SUM transactions (simulated missing index),0.082259,250188,1,1,1,0,2,42,0.0,194.011719,1
20997,SUM transactions for user_id=6854,0.026642,9,2,1,1,1,3,33,0.0,194.011719,0
20998,SUM transactions for user_id=6854,0.021574,9,2,1,1,1,3,33,0.0,194.011719,0


## 13.2 Log-Transform query_time


In [3]:
# np.log1p(x) computes log(1 + x): it adds 1 (not a tiny epsilon) before taking the log,
# so a query_time of 0 maps to 0 instead of -inf.
# Use np.expm1 to map values on this scale back to the original query_time scale.
df_spike_metrics_final['query_time_log'] = df_spike_metrics_final['query_time'].pipe(np.log1p)

# Side-by-side preview of the raw and transformed columns.
df_spike_metrics_final[['query_time', 'query_time_log']].head(5)


Unnamed: 0,query_time,query_time_log
0,0.115103,0.108947
1,0.082442,0.079219
2,0.083628,0.080314
3,0.027882,0.027501
4,0.0248,0.024497


## 13.3 Select Features & Targets


In [4]:
# Column groups: continuous/count-like columns vs. the has_* indicator columns.
numeric_features = ['rows_examined', 'joins', 'tables_count', 'query_length', 'cpu_usage', 'memory_usage']
categorical_features = ['has_sum', 'has_group_by', 'has_where']

# Full feature matrix: numeric columns first, then the indicator columns.
X = df_spike_metrics_final[numeric_features + categorical_features]

# Per-group views of the same rows, used separately by the cells that follow.
X_numeric = X[numeric_features]
X_categorical = X[categorical_features]

# Targets: log-transformed query time for regression, is_slow label for classification.
y_regression = df_spike_metrics_final['query_time_log']
y_classification = df_spike_metrics_final['is_slow']


## 13.4 Normalize Numeric Features


In [5]:
# Standardize the numeric columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row of X_numeric, i.e. on the full dataset.
# If these values feed a model that is evaluated on held-out rows, fit on the
# training rows only to avoid leaking test-set statistics.
scaler = StandardScaler().fit(X_numeric)
X_numeric_scaled = scaler.transform(X_numeric)

# Rebuild the feature matrix: scaled numeric columns followed by the untouched indicator columns.
# The indicator frame's index is reset so both parts align on a fresh 0..n-1 index.
X_scaled = pd.concat(
    [
        pd.DataFrame(X_numeric_scaled, columns=numeric_features),
        X_categorical.reset_index(drop=True),
    ],
    axis=1,
)
X_scaled.head()


Unnamed: 0,rows_examined,joins,tables_count,query_length,cpu_usage,memory_usage,has_sum,has_group_by,has_where
0,1.581139,-1.581139,-1.581139,1.576946,1.373773,-0.601406,1,1,0
1,1.581139,-1.581139,-1.581139,1.576946,-0.08948,-0.601406,1,1,0
2,1.581139,-1.581139,-1.581139,1.576946,-0.08948,-0.601406,1,1,0
3,-0.632545,0.632456,0.632456,-0.844336,-0.08948,-0.601406,1,1,1
4,-0.632545,0.632456,0.632456,-0.844336,-0.08948,-0.601406,1,1,1


## 13.5 Train/Test Split


In [6]:
# 80% train, 20% test.
# Split the *unscaled* feature matrix first so that the scaler below only ever sees
# training rows. Fitting a scaler on all rows and splitting afterwards leaks the
# test rows' mean/variance into the training features (data leakage).
X_train_raw, X_test_raw, y_train_reg, y_test_reg, y_train_cls, y_test_cls = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42
)

# Learn mean/std from the training rows only; the same fitted scaler is then applied
# to both splits. `scaler` is rebound so later code uses the train-fitted transform.
scaler = StandardScaler().fit(X_train_raw[numeric_features])

# Scaled numeric columns followed by the untouched indicator columns
# (same column order as the unsplit feature matrix). The original row index is
# kept so the two parts align row-for-row inside each split.
X_train = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_train_raw[numeric_features]),
            columns=numeric_features,
            index=X_train_raw.index,
        ),
        X_train_raw[categorical_features],
    ],
    axis=1,
)
X_test = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test_raw[numeric_features]),
            columns=numeric_features,
            index=X_test_raw.index,
        ),
        X_test_raw[categorical_features],
    ],
    axis=1,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (16800, 9)
Test shape: (4200, 9)


## 13.6 Save Preprocessed Datasets 


In [8]:
# All six splits are written to the same folder; keep that location in one place
# so it only has to be changed once.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
output_dir = r"C:\Users\pc\data science\SADOP\data"

# File stem -> object to persist. Each is written as "<stem>.csv" without the index column.
datasets_to_save = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train_reg": y_train_reg,
    "y_test_reg": y_test_reg,
    "y_train_cls": y_train_cls,
    "y_test_cls": y_test_cls,
}

for file_stem, dataset in datasets_to_save.items():
    dataset.to_csv(f"{output_dir}\\{file_stem}.csv", index=False)

print("✅ Preprocessed datasets saved for ML")


✅ Preprocessed datasets saved for ML
