In [3]:
# Load and assess the cleaned dataset from Task 4

import pandas as pd
import numpy as np
import os
from datetime import datetime

print("TRAIN/VALIDATION/TEST SPLIT STRATEGY")
print("=" * 60)


# Step 1: Load the cleaned dataset from Task 4
print("\nSTEP 1: LOADING CLEANED DATASET")
print("=" * 40)

# Path of the cleaned CSV, resolved against the parent of the current working directory.
project_root = os.path.abspath('..')
cleaned_dataset_path = os.path.join(project_root, 'data', 'processed', 'cleaned_sql_injection_dataset.csv')

# Record whether THIS run found the file. The overview below is gated on this flag
# rather than on `'df_cleaned' in locals()`: at cell top level locals() is the kernel
# namespace, so a df_cleaned left over from an earlier run would pass that check even
# though the load in this run failed, and stale data would be reported as fresh.
dataset_loaded = os.path.exists(cleaned_dataset_path)

# Load dataset
if dataset_loaded:
    df_cleaned = pd.read_csv(cleaned_dataset_path)
    print(f"Dataset loaded successfully from: {cleaned_dataset_path}")
    print(f"Dataset shape: {df_cleaned.shape}")
else:
    print(f"Dataset not found at: {cleaned_dataset_path}")
    print("Please ensure Task 4 (Dataset Preprocessing & Cleaning) is completed first")

# Human-readable names for the label values this cell reports on. Any other value is
# printed as "Unknown" instead of being silently presented as "Malicious".
label_names = {0: "Normal", 1: "Malicious"}

# Display basic dataset information
if dataset_loaded:
    print(f"\nDATASET OVERVIEW:")
    print(f"   Total records: {len(df_cleaned):,}")
    print(f"   Total columns: {len(df_cleaned.columns)}")
    # Only the first 10 column names are listed; an ellipsis marks truncation.
    print(f"   Column names: {list(df_cleaned.columns)[:10]}{'...' if len(df_cleaned.columns) > 10 else ''}")
    print(f"   Memory usage: {df_cleaned.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Check for required columns
    if 'query' in df_cleaned.columns and 'label' in df_cleaned.columns:
        print(f"Required columns found: 'query' and 'label'")

        # Display class distribution
        print(f"\nCLASS DISTRIBUTION ANALYSIS:")
        class_counts = df_cleaned['label'].value_counts().sort_index()
        total_samples = len(df_cleaned)

        for label, count in class_counts.items():
            label_name = label_names.get(label, "Unknown")
            percentage = (count / total_samples) * 100
            print(f"   {label} ({label_name}): {count:,} samples ({percentage:.1f}%)")

        # Balance ratio = largest class size / smallest class size (1.0 is perfectly balanced).
        balance_ratio = class_counts.max() / class_counts.min()
        print(f"   Balance ratio: {balance_ratio:.2f}:1")

        # Assess balance quality
        if balance_ratio <= 1.2:
            balance_quality = "EXCELLENT"
        elif balance_ratio <= 2.0:
            balance_quality = "GOOD"
        else:
            balance_quality = "NEEDS ATTENTION"

        print(f"   Balance quality: {balance_quality}")

        # Display sample data
        print(f"\nSAMPLE DATA:")
        print(df_cleaned.head(3))

    else:
        print(f"Missing required columns. Found: {list(df_cleaned.columns)}")
        print(f"Expected: 'query' and 'label' columns")


TRAIN/VALIDATION/TEST SPLIT STRATEGY

STEP 1: LOADING CLEANED DATASET
Dataset loaded successfully from: c:\Users\nisha\OneDrive\Desktop\Major-Project\Malicious-Query-detection-and-prevention\data\processed\cleaned_sql_injection_dataset.csv
Dataset shape: (132397, 56)

DATASET OVERVIEW:
   Total records: 132,397
   Total columns: 56
   Column names: ['query', 'label', 'query_length', 'word_count', 'avg_word_length', 'special_char_count', 'special_char_ratio', 'numeric_char_count', 'numeric_char_ratio', 'uppercase_count']...
   Memory usage: 112.5 MB
Required columns found: 'query' and 'label'

CLASS DISTRIBUTION ANALYSIS:
   0 (Normal): 65,656 samples (49.6%)
   1 (Malicious): 66,741 samples (50.4%)
   Balance ratio: 1.02:1
   Balance quality: EXCELLENT

SAMPLE DATA:
                                               query  label  query_length  \
0                      " or pg_sleep ( __TIME__ ) --      1            29   
1  create user name identified by pass123 tempora...      1          

In [6]:
# Plan and implement stratified splitting strategy

from sklearn.model_selection import train_test_split
import numpy as np
import os

print("\nSPLIT STRATEGY PLANNING AND IMPLEMENTATION")
print("=" * 50)

# Define split strategy based on dataset size and balance
import pandas as pd

# Single source of truth for the values that were previously repeated as literals.
LABEL_COLUMN = 'label'   # column used as the target and for stratification
SPLIT_SEED = 42          # shared seed so both split stages are reproducible

# Load cleaned dataset into DataFrame
# NOTE: `project_root` is expected to have been defined by the dataset-loading cell.
df_cleaned = pd.read_csv(os.path.join(project_root, 'data', 'processed', 'cleaned_sql_injection_dataset.csv'))
total_samples = len(df_cleaned)
print(f"Total samples available: {total_samples:,}")

# Determine optimal split ratios based on dataset size
if total_samples >= 100000:
    # Large dataset: 70% train, 15% val, 15% test
    train_ratio, val_ratio, test_ratio = 0.70, 0.15, 0.15
    split_strategy = "70/15/15 (Large Dataset Strategy)"
elif total_samples >= 10000:
    # Medium dataset: 80% train, 10% val, 10% test
    train_ratio, val_ratio, test_ratio = 0.80, 0.10, 0.10
    split_strategy = "80/10/10 (Medium Dataset Strategy)"
else:
    # Small dataset: 75% train, 15% val, 10% test
    train_ratio, val_ratio, test_ratio = 0.75, 0.15, 0.10
    split_strategy = "75/15/10 (Small Dataset Strategy)"

# Guard against a future edit of the ratios above that no longer partitions the data.
assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-9, "split ratios must sum to 1"

print(f"Selected strategy: {split_strategy}")
# The "~" counts are truncated estimates; the actual sizes are printed after splitting.
print(f"Train ratio: {train_ratio:.0%} (~{int(total_samples * train_ratio):,} samples)")
print(f"Validation ratio: {val_ratio:.0%} (~{int(total_samples * val_ratio):,} samples)")
print(f"Test ratio: {test_ratio:.0%} (~{int(total_samples * test_ratio):,} samples)")

# Step 1: Split into train+val and test sets
print(f"\nImplementing stratified splitting...")
X_temp, X_test, y_temp, y_test = train_test_split(
    df_cleaned.drop(LABEL_COLUMN, axis=1),
    df_cleaned[LABEL_COLUMN],
    test_size=test_ratio,
    random_state=SPLIT_SEED,
    stratify=df_cleaned[LABEL_COLUMN]
)

# Step 2: Split train+val into train and validation sets
# The validation share is re-expressed as a fraction of the remaining train+val pool,
# so that it still corresponds to `val_ratio` of the full dataset.
val_size_adjusted = val_ratio / (train_ratio + val_ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_size_adjusted,
    random_state=SPLIT_SEED,
    stratify=y_temp
)

# Add labels back to feature sets for complete datasets
# (assignment aligns on the index, which both halves of each split share).
train_set = X_train.copy()
train_set[LABEL_COLUMN] = y_train
val_set = X_val.copy()
val_set[LABEL_COLUMN] = y_val
test_set = X_test.copy()
test_set[LABEL_COLUMN] = y_test

# Invariants: the three subsets must cover the dataset exactly once, with no row
# shared between any two of them (shared rows would leak data across splits).
assert len(train_set) + len(val_set) + len(test_set) == total_samples, "splits do not cover the dataset"
assert train_set.index.intersection(val_set.index).empty, "train/validation overlap"
assert train_set.index.intersection(test_set.index).empty, "train/test overlap"
assert val_set.index.intersection(test_set.index).empty, "validation/test overlap"

# Display split results
print(f"\nSPLIT RESULTS:")
print(f"Training set: {len(train_set):,} samples ({len(train_set)/total_samples:.1%})")
print(f"Validation set: {len(val_set):,} samples ({len(val_set)/total_samples:.1%})")
print(f"Test set: {len(test_set):,} samples ({len(test_set)/total_samples:.1%})")

# Verify class balance preservation
print(f"\nCLASS BALANCE VERIFICATION:")
sets_info = [
    ("Original", df_cleaned[LABEL_COLUMN]),
    ("Training", train_set[LABEL_COLUMN]),
    ("Validation", val_set[LABEL_COLUMN]),
    ("Test", test_set[LABEL_COLUMN])
]

for set_name, labels in sets_info:
    counts = labels.value_counts().sort_index()
    # .get(..., 0) reports 0% for a class that is absent from a subset instead of
    # raising KeyError the way counts[0] / counts[1] would.
    normal_pct = (counts.get(0, 0) / len(labels)) * 100
    malicious_pct = (counts.get(1, 0) / len(labels)) * 100
    balance = counts.max() / counts.min()
    print(f"{set_name:>12}: Normal {normal_pct:5.1f}% | Malicious {malicious_pct:5.1f}% | Ratio {balance:.2f}:1")




SPLIT STRATEGY PLANNING AND IMPLEMENTATION
Total samples available: 132,397
Selected strategy: 70/15/15 (Large Dataset Strategy)
Train ratio: 70% (~92,677 samples)
Validation ratio: 15% (~19,859 samples)
Test ratio: 15% (~19,859 samples)

Implementing stratified splitting...

SPLIT RESULTS:
Training set: 92,677 samples (70.0%)
Validation set: 19,860 samples (15.0%)
Test set: 19,860 samples (15.0%)

CLASS BALANCE VERIFICATION:
    Original: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
    Training: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
  Validation: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
        Test: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
