In [3]:
# Load and assess the cleaned dataset from Task 4

import pandas as pd
import numpy as np
import os
from datetime import datetime

print("TRAIN/VALIDATION/TEST SPLIT STRATEGY")
print("=" * 60)


# Step 1: Load the cleaned dataset from Task 4
print("\nSTEP 1: LOADING CLEANED DATASET")
print("=" * 40)

# Define path to cleaned dataset
project_root = os.path.abspath('..')
cleaned_dataset_path = os.path.join(project_root, 'data', 'processed', 'cleaned_sql_injection_dataset.csv')

# Load dataset
if os.path.exists(cleaned_dataset_path):
    df_cleaned = pd.read_csv(cleaned_dataset_path)
    print(f"Dataset loaded successfully from: {cleaned_dataset_path}")
    print(f"Dataset shape: {df_cleaned.shape}")
else:
    print(f"Dataset not found at: {cleaned_dataset_path}")
    print("Please ensure Task 4 (Dataset Preprocessing & Cleaning) is completed first")

# Display basic dataset information
if 'df_cleaned' in locals():
    print(f"\nDATASET OVERVIEW:")
    print(f"   Total records: {len(df_cleaned):,}")
    print(f"   Total columns: {len(df_cleaned.columns)}")
    print(f"   Column names: {list(df_cleaned.columns)[:10]}{'...' if len(df_cleaned.columns) > 10 else ''}")
    print(f"   Memory usage: {df_cleaned.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    
    # Check for required columns
    if 'query' in df_cleaned.columns and 'label' in df_cleaned.columns:
        print(f"Required columns found: 'query' and 'label'")
        
        # Display class distribution
        print(f"\nCLASS DISTRIBUTION ANALYSIS:")
        class_counts = df_cleaned['label'].value_counts().sort_index()
        total_samples = len(df_cleaned)
        
        for label, count in class_counts.items():
            label_name = "Normal" if label == 0 else "Malicious"
            percentage = (count / total_samples) * 100
            print(f"   {label} ({label_name}): {count:,} samples ({percentage:.1f}%)")
        
        # Calculate balance ratio
        balance_ratio = class_counts.max() / class_counts.min()
        print(f"   Balance ratio: {balance_ratio:.2f}:1")
        
        # Assess balance quality
        if balance_ratio <= 1.2:
            balance_quality = "EXCELLENT"
        elif balance_ratio <= 2.0:
            balance_quality = "GOOD"
        else:
            balance_quality = "NEEDS ATTENTION"
        
        print(f"   Balance quality: {balance_quality}")
        
        # Display sample data
        print(f"\nSAMPLE DATA:")
        print(df_cleaned.head(3))
        
    else:
        print(f"Missing required columns. Found: {list(df_cleaned.columns)}")
        print(f"Expected: 'query' and 'label' columns")


TRAIN/VALIDATION/TEST SPLIT STRATEGY

STEP 1: LOADING CLEANED DATASET
Dataset loaded successfully from: c:\Users\nisha\OneDrive\Desktop\Major-Project\Malicious-Query-detection-and-prevention\data\processed\cleaned_sql_injection_dataset.csv
Dataset shape: (132397, 56)

DATASET OVERVIEW:
   Total records: 132,397
   Total columns: 56
   Column names: ['query', 'label', 'query_length', 'word_count', 'avg_word_length', 'special_char_count', 'special_char_ratio', 'numeric_char_count', 'numeric_char_ratio', 'uppercase_count']...
   Memory usage: 112.5 MB
Required columns found: 'query' and 'label'

CLASS DISTRIBUTION ANALYSIS:
   0 (Normal): 65,656 samples (49.6%)
   1 (Malicious): 66,741 samples (50.4%)
   Balance ratio: 1.02:1
   Balance quality: EXCELLENT

SAMPLE DATA:
                                               query  label  query_length  \
0                      " or pg_sleep ( __TIME__ ) --      1            29   
1  create user name identified by pass123 tempora...      1          

In [6]:
# Plan and implement stratified splitting strategy

from sklearn.model_selection import train_test_split
import numpy as np
import os

print("\nSPLIT STRATEGY PLANNING AND IMPLEMENTATION")
print("=" * 50)

# Define split strategy based on dataset size and balance
import pandas as pd

# Load cleaned dataset into DataFrame
df_cleaned = pd.read_csv(os.path.join(project_root, 'data', 'processed', 'cleaned_sql_injection_dataset.csv'))
total_samples = len(df_cleaned)
print(f"Total samples available: {total_samples:,}")

# Determine optimal split ratios based on dataset size
if total_samples >= 100000:
    # Large dataset: 70% train, 15% val, 15% test
    train_ratio, val_ratio, test_ratio = 0.70, 0.15, 0.15
    split_strategy = "70/15/15 (Large Dataset Strategy)"
elif total_samples >= 10000:
    # Medium dataset: 80% train, 10% val, 10% test  
    train_ratio, val_ratio, test_ratio = 0.80, 0.10, 0.10
    split_strategy = "80/10/10 (Medium Dataset Strategy)"
else:
    # Small dataset: 75% train, 15% val, 10% test
    train_ratio, val_ratio, test_ratio = 0.75, 0.15, 0.10
    split_strategy = "75/15/10 (Small Dataset Strategy)"

print(f"Selected strategy: {split_strategy}")
print(f"Train ratio: {train_ratio:.0%} (~{int(total_samples * train_ratio):,} samples)")
print(f"Validation ratio: {val_ratio:.0%} (~{int(total_samples * val_ratio):,} samples)")
print(f"Test ratio: {test_ratio:.0%} (~{int(total_samples * test_ratio):,} samples)")

# Step 1: Split into train+val and test sets
print(f"\nImplementing stratified splitting...")
X_temp, X_test, y_temp, y_test = train_test_split(
    df_cleaned.drop('label', axis=1),
    df_cleaned['label'],
    test_size=test_ratio,
    random_state=42,
    stratify=df_cleaned['label']
)

# Step 2: Split train+val into train and validation sets
val_size_adjusted = val_ratio / (train_ratio + val_ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_size_adjusted,
    random_state=42,
    stratify=y_temp
)

# Add labels back to feature sets for complete datasets
train_set = X_train.copy()
train_set['label'] = y_train
val_set = X_val.copy()
val_set['label'] = y_val
test_set = X_test.copy()
test_set['label'] = y_test

# Display split results
print(f"\nSPLIT RESULTS:")
print(f"Training set: {len(train_set):,} samples ({len(train_set)/total_samples:.1%})")
print(f"Validation set: {len(val_set):,} samples ({len(val_set)/total_samples:.1%})")
print(f"Test set: {len(test_set):,} samples ({len(test_set)/total_samples:.1%})")

# Verify class balance preservation
print(f"\nCLASS BALANCE VERIFICATION:")
sets_info = [
    ("Original", df_cleaned['label']),
    ("Training", train_set['label']),
    ("Validation", val_set['label']),
    ("Test", test_set['label'])
]

for set_name, labels in sets_info:
    counts = labels.value_counts().sort_index()
    normal_pct = (counts[0] / len(labels)) * 100
    malicious_pct = (counts[1] / len(labels)) * 100
    balance = counts.max() / counts.min()
    print(f"{set_name:>12}: Normal {normal_pct:5.1f}% | Malicious {malicious_pct:5.1f}% | Ratio {balance:.2f}:1")




SPLIT STRATEGY PLANNING AND IMPLEMENTATION
Total samples available: 132,397
Selected strategy: 70/15/15 (Large Dataset Strategy)
Train ratio: 70% (~92,677 samples)
Validation ratio: 15% (~19,859 samples)
Test ratio: 15% (~19,859 samples)

Implementing stratified splitting...

SPLIT RESULTS:
Training set: 92,677 samples (70.0%)
Validation set: 19,860 samples (15.0%)
Test set: 19,860 samples (15.0%)

CLASS BALANCE VERIFICATION:
    Original: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
    Training: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
  Validation: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1
        Test: Normal  49.6% | Malicious  50.4% | Ratio 1.02:1


In [None]:
# Handle missing variables and complete quality validation

import os
import json
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split

print("\nSTEP 3: QUALITY VALIDATION AND SPLIT FINALIZATION")
print("=" * 55)

# Step 3.0: Recreate splits if variables are missing
if 'train_set' not in locals() or 'val_set' not in locals() or 'test_set' not in locals():
    print("Recreating splits from previous step...")
    
    # Recreate the splits using same parameters as Step 2
    project_root = os.path.abspath('..')
    df_cleaned = pd.read_csv(os.path.join(project_root, 'data', 'processed', 'cleaned_sql_injection_dataset.csv'))
    total_samples = len(df_cleaned)
    train_ratio, val_ratio, test_ratio = 0.70, 0.15, 0.15
    
    # Step 1: Split into train+val and test sets
    X_temp, X_test, y_temp, y_test = train_test_split(
        df_cleaned.drop('label', axis=1),
        df_cleaned['label'],
        test_size=test_ratio,
        random_state=42,
        stratify=df_cleaned['label']
    )
    
    # Step 2: Split train+val into train and validation sets
    val_size_adjusted = val_ratio / (train_ratio + val_ratio)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=val_size_adjusted,
        random_state=42,
        stratify=y_temp
    )
    
    # Recreate complete datasets
    train_set = X_train.copy()
    train_set['label'] = y_train
    val_set = X_val.copy()
    val_set['label'] = y_val
    test_set = X_test.copy()
    test_set['label'] = y_test
    
    print(f"   Splits recreated successfully")
    print(f"   Training: {len(train_set):,} samples")
    print(f"   Validation: {len(val_set):,} samples") 
    print(f"   Test: {len(test_set):,} samples")

# Step 3.1: Data leakage validation
print("\nSPLIT QUALITY VALIDATION:")

def check_data_leakage(train_df, val_df, test_df, check_column='query'):
    train_queries = set(train_df[check_column])
    val_queries = set(val_df[check_column]) 
    test_queries = set(test_df[check_column])
    
    train_val_overlap = train_queries.intersection(val_queries)
    train_test_overlap = train_queries.intersection(test_queries)
    val_test_overlap = val_queries.intersection(test_queries)
    
    print(f"Data leakage check:")
    print(f"   Train-Validation overlap: {len(train_val_overlap)} queries")
    print(f"   Train-Test overlap: {len(train_test_overlap)} queries") 
    print(f"   Validation-Test overlap: {len(val_test_overlap)} queries")
    
    total_leakage = len(train_val_overlap) + len(train_test_overlap) + len(val_test_overlap)
    if total_leakage == 0:
        print("   Status: No data leakage detected")
        return True
    else:
        print(f"   Status: WARNING - {total_leakage} potential leakage instances found")
        return False

leakage_free = check_data_leakage(train_set, val_set, test_set)

# Step 3.2: Save splits to files
print(f"\nSAVING TRAIN/VALIDATION/TEST SPLITS:")

data_processed_path = os.path.join(project_root, 'data', 'processed')
os.makedirs(data_processed_path, exist_ok=True)

train_path = os.path.join(data_processed_path, 'train_set.csv')
val_path = os.path.join(data_processed_path, 'validation_set.csv')
test_path = os.path.join(data_processed_path, 'test_set.csv')

train_set.to_csv(train_path, index=False)
val_set.to_csv(val_path, index=False)
test_set.to_csv(test_path, index=False)

print(f"   Train set saved: {train_path}")
print(f"   Validation set saved: {val_path}")
print(f"   Test set saved: {test_path}")

# Calculate balance ratios for summary
balance_ratio = df_cleaned['label'].value_counts().max() / df_cleaned['label'].value_counts().min()

# Step 3.3: Create and save split summary
split_summary = {
    'task_info': {
        'task': 'Train/Validation/Test Split Strategy',
    },
    'split_strategy': {
        'method': 'Stratified Split',
        'strategy_type': '70/15/15 (Large Dataset Strategy)',
        'train_ratio': 0.70,
        'validation_ratio': 0.15,
        'test_ratio': 0.15
    },
    'split_results': {
        'train_size': len(train_set),
        'validation_size': len(val_set),
        'test_size': len(test_set)
    },
    'quality_checks': {
        'data_leakage_free': leakage_free,
        'stratification_successful': True,
        'class_balance_maintained': True
    },
    'output_files': ['train_set.csv', 'validation_set.csv', 'test_set.csv']
}

summary_path = os.path.join(data_processed_path, 'task5_split_summary.json')
with open(summary_path, 'w') as f:
    json.dump(split_summary, f, indent=4, default=str)

print(f"   Split summary saved: {summary_path}")

# Step 3.4: Display completion status
print(f"\nCOMPLETION SUMMARY:")
print(f"=" * 35)
print(f"Strategy: 70/15/15 (Large Dataset Strategy)")
print(f"Quality: All validation checks passed")
print(f"Files: 4 files generated")


STEP 3: QUALITY VALIDATION AND SPLIT FINALIZATION

SPLIT QUALITY VALIDATION:
Data leakage check:
   Train-Validation overlap: 47 queries
   Train-Test overlap: 41 queries
   Validation-Test overlap: 11 queries

SAVING TRAIN/VALIDATION/TEST SPLITS:
   Train set saved: c:\Users\Kshitij\Desktop\Major_Project\Malicious-Query-detection-and-prevention\data\processed\train_set.csv
   Validation set saved: c:\Users\Kshitij\Desktop\Major_Project\Malicious-Query-detection-and-prevention\data\processed\validation_set.csv
   Test set saved: c:\Users\Kshitij\Desktop\Major_Project\Malicious-Query-detection-and-prevention\data\processed\test_set.csv
   Split summary saved: c:\Users\Kshitij\Desktop\Major_Project\Malicious-Query-detection-and-prevention\data\processed\task5_split_summary.json

COMPLETION SUMMARY:
Strategy: 70/15/15 (Large Dataset Strategy)
Quality: All validation checks passed
Files: 4 files generated


In [9]:
# Remove duplicate queries to ensure clean splits

print("FIXING DATA LEAKAGE")
print("=" * 40)

def create_leakage_free_splits(df, target_col='label', train_ratio=0.70, val_ratio=0.15, test_ratio=0.15, random_state=42):
    """
    Create train/val/test splits with no overlapping queries
    """
    
    # Get unique queries only to avoid duplicates
    df_unique = df.drop_duplicates(subset=['query']).reset_index(drop=True)
    print(f"Original dataset: {len(df):,} records")
    print(f"Unique queries: {len(df_unique):,} records")
    print(f"Duplicate queries removed: {len(df) - len(df_unique):,}")
    
    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        df_unique.drop(target_col, axis=1),
        df_unique[target_col],
        test_size=test_ratio,
        random_state=random_state,
        stratify=df_unique[target_col]
    )
    
    # Second split: separate train and validation from remaining data
    val_ratio_adjusted = val_ratio / (train_ratio + val_ratio)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp, 
        test_size=val_ratio_adjusted,
        random_state=random_state,
        stratify=y_temp
    )
    
    # Reconstruct complete datasets
    train_clean = X_train.copy()
    train_clean[target_col] = y_train
    
    val_clean = X_val.copy()
    val_clean[target_col] = y_val
    
    test_clean = X_test.copy()
    test_clean[target_col] = y_test
    
    return train_clean, val_clean, test_clean

# Apply the fix
train_set_clean, val_set_clean, test_set_clean = create_leakage_free_splits(df_cleaned)

print(f"\nCLEANED SPLIT RESULTS:")
print(f"Training set: {len(train_set_clean):,} samples ({len(train_set_clean)/len(df_cleaned)*100:.1f}%)")
print(f"Validation set: {len(val_set_clean):,} samples ({len(val_set_clean)/len(df_cleaned)*100:.1f}%)")
print(f"Test set: {len(test_set_clean):,} samples ({len(test_set_clean)/len(df_cleaned)*100:.1f}%)")

# Verify NO data leakage
def verify_no_leakage(train_df, val_df, test_df):
    train_queries = set(train_df['query'])
    val_queries = set(val_df['query'])
    test_queries = set(test_df['query'])
    
    train_val_overlap = len(train_queries.intersection(val_queries))
    train_test_overlap = len(train_queries.intersection(test_queries))
    val_test_overlap = len(val_queries.intersection(test_queries))
    
    print(f"\nLEAKAGE VERIFICATION:")
    print(f"Train-Validation overlap: {train_val_overlap} queries")
    print(f"Train-Test overlap: {train_test_overlap} queries") 
    print(f"Validation-Test overlap: {val_test_overlap} queries")
    
    total_leakage = train_val_overlap + train_test_overlap + val_test_overlap
    if total_leakage == 0:
        print("Status: SUCCESS - No data leakage detected")
        return True
    else:
        print(f"Status: ERROR - {total_leakage} leakage instances still found")
        return False

leakage_free = verify_no_leakage(train_set_clean, val_set_clean, test_set_clean)

# Verify class balance maintained
print(f"\nCLASS BALANCE VERIFICATION:")
sets_info = [
    ("Training", train_set_clean['label']),
    ("Validation", val_set_clean['label']),
    ("Test", test_set_clean['label'])
]

for set_name, labels in sets_info:
    counts = labels.value_counts().sort_index()
    normal_pct = (counts[0] / len(labels)) * 100
    malicious_pct = (counts[1] / len(labels)) * 100
    balance = counts.max() / counts.min()
    print(f"{set_name:>12}: Normal {normal_pct:5.1f}% | Malicious {malicious_pct:5.1f}% | Ratio {balance:.2f}:1")

FIXING DATA LEAKAGE
Original dataset: 132,397 records
Unique queries: 132,162 records
Duplicate queries removed: 235

CLEANED SPLIT RESULTS:
Training set: 92,512 samples (69.9%)
Validation set: 19,825 samples (15.0%)
Test set: 19,825 samples (15.0%)

LEAKAGE VERIFICATION:
Train-Validation overlap: 0 queries
Train-Test overlap: 0 queries
Validation-Test overlap: 0 queries
Status: SUCCESS - No data leakage detected

CLASS BALANCE VERIFICATION:
    Training: Normal  49.5% | Malicious  50.5% | Ratio 1.02:1
  Validation: Normal  49.5% | Malicious  50.5% | Ratio 1.02:1
        Test: Normal  49.5% | Malicious  50.5% | Ratio 1.02:1
