In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
import os
import sys
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

print(" TASK 4: DATA PREPROCESSING")

print(" Building EDA-driven preprocessing pipeline...")

# Setup paths
project_root = os.path.abspath('..')
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Load data using our established pipeline
from preprocessing.data_loader import DatasetLoader
config_path = os.path.join(project_root, 'config.json')
loader = DatasetLoader(config_path=config_path)
df = loader.load_sql_injection_dataset()

print(f" Dataset loaded: {df.shape}")
print(f" Columns: {list(df.columns)}")


 TASK 4: DATA PREPROCESSING
 Building EDA-driven preprocessing pipeline...
LOADING SQL INJECTION DATASET
Auto-detected dataset: clean_sql_dataset.csv
 Loading from: c:\Users\nisha\OneDrive\Desktop\Major-Project\Malicious-Query-detection-and-prevention\data\raw\clean_sql_dataset.csv
 Successfully loaded 148,326 records
 Columns: ['query', 'label']
 Dataset loaded: (148326, 2)
 Columns: ['query', 'label']


In [3]:
#Apply EDA-driven data cleaning
print("\n EDA-DRIVEN DATA CLEANING:")

original_count = len(df)
print(f" Starting with: {len(df):,} records")
cleaning_log = []

# Cleaning Operation 1: Handle missing values
print(f"\n1️ Handling missing values...")
if 'query' in df.columns and 'label' in df.columns:
    before_missing = len(df)
    df = df.dropna(subset=['query', 'label']).copy()
    missing_removed = before_missing - len(df)
    
    if missing_removed > 0:
        print(f"    Removed {missing_removed:,} records with missing query/label")
        cleaning_log.append(f"Missing values removed: {missing_removed:,}")
    else:
        print(f"    No missing values in critical columns")

# Cleaning Operation 2: Remove complete duplicates
print(f"\n Removing complete duplicates...")
before_dedup = len(df)
df = df.drop_duplicates().reset_index(drop=True)
complete_dups_removed = before_dedup - len(df)

if complete_dups_removed > 0:
    print(f"    Removed {complete_dups_removed:,} complete duplicate records")
    cleaning_log.append(f"Complete duplicates removed: {complete_dups_removed:,}")
else:
    print(f"    No complete duplicates found")

# Cleaning Operation 3: Remove query duplicates (keep first occurrence)
print(f"\n Removing duplicate queries...")
before_query_dedup = len(df)
df = df.drop_duplicates(subset=['query'], keep='first').reset_index(drop=True)
query_dups_removed = before_query_dedup - len(df)

if query_dups_removed > 0:
    print(f"    Removed {query_dups_removed:,} duplicate queries")
    cleaning_log.append(f"Query duplicates removed: {query_dups_removed:,}")
else:
    print(f"    No duplicate queries found")

# Cleaning Operation 4: EDA Recommendation #7 - Filter very short queries
print(f"\n EDA Recommendation #7: Filtering short queries (<10 chars)...")
if 'query' in df.columns:
    df['query_length'] = df['query'].astype(str).str.len()
    before_filter = len(df)
    
    short_queries = df[df['query_length'] < 10]
    print(f"   Short queries found: {len(short_queries):,}")
    
    # Show examples before removal
    if len(short_queries) > 0:
        print(f"   Examples to be removed:")
        for i, (idx, row) in enumerate(short_queries.head(3).iterrows()):
            query = str(row['query'])
            label_name = "Normal" if row['label'] == 0 else "Malicious"
            print(f"     {i+1}. '{query}' (length: {len(query)}, class: {label_name})")
    
    # Remove short queries
    df = df[df['query_length'] >= 10].copy()
    short_removed = before_filter - len(df)
    
    if short_removed > 0:
        print(f"    Removed {short_removed:,} queries shorter than 10 characters")
        cleaning_log.append(f"Short queries removed: {short_removed:,}")
    else:
        print(f"    No queries shorter than 10 characters")

# Cleaning Operation 5: Handle edge cases
print(f"\n Cleaning edge cases...")
before_edge = len(df)

# Remove queries that are only whitespace after stripping
if 'query' in df.columns:
    df = df[df['query'].astype(str).str.strip().str.len() > 0].copy()
    
    # Standardize whitespace in queries
    df['query'] = df['query'].astype(str).str.strip()
    df['query'] = df['query'].str.replace(r'\s+', ' ', regex=True)

edge_removed = before_edge - len(df)
if edge_removed > 0:
    print(f"    Cleaned {edge_removed:,} edge cases (whitespace-only queries)")
    cleaning_log.append(f"Edge cases cleaned: {edge_removed:,}")
else:
    print(f"    No edge cases found")

# Final cleaning summary
final_count = len(df)
total_removed = original_count - final_count
retention_rate = (final_count / original_count) * 100

print(f"\n CLEANING SUMMARY:")

print(f"   Original dataset: {original_count:,} records")
print(f"   Final dataset: {final_count:,} records")
print(f"   Total removed: {total_removed:,} records ({(total_removed/original_count)*100:.1f}%)")
print(f"   Data retention: {retention_rate:.1f}%")

print(f"\n Cleaning log:")
for i, log_entry in enumerate(cleaning_log, 1):
    print(f"   {i}. {log_entry}")

# Check class balance after cleaning
if 'label' in df.columns:
    print(f"\n Class Distribution (After Cleaning):")
    class_counts_after = df['label'].value_counts().sort_index()
    
    for label, count in class_counts_after.items():
        label_name = "Normal" if label == 0 else "Malicious"
        percentage = (count / len(df)) * 100
        print(f"   • {label} ({label_name}): {count:,} ({percentage:.1f}%)")
    
    balance_ratio_after = class_counts_after.max() / class_counts_after.min()
    print(f"   Balance ratio after cleaning: {balance_ratio_after:.2f}:1")
    
    if balance_ratio_after < 1.5:
        print(f"    Excellent class balance maintained!")
    elif balance_ratio_after < 2.0:
        print(f"    Good class balance maintained")
    else:
        print(f"   Class balance affected by cleaning")

df_cleaned = df.copy()
print(f"\n Data cleaning completed successfully!")



 EDA-DRIVEN DATA CLEANING:
 Starting with: 143,210 records

1️ Handling missing values...
    No missing values in critical columns

 Removing complete duplicates...
    Removed 4 complete duplicate records

 Removing duplicate queries...
    Removed 10,775 duplicate queries

 EDA Recommendation #7: Filtering short queries (<10 chars)...
   Short queries found: 34
   Examples to be removed:
     1. 'or true--' (length: 9, class: Malicious)
     2. 'or 3 = 3' (length: 8, class: Malicious)
     3. 'or '' = '' (length: 9, class: Malicious)
    Removed 34 queries shorter than 10 characters

 Cleaning edge cases...
    No edge cases found

 CLEANING SUMMARY:
   Original dataset: 143,210 records
   Final dataset: 132,397 records
   Total removed: 10,813 records (7.6%)
   Data retention: 92.4%

 Cleaning log:
   1. Complete duplicates removed: 4
   2. Query duplicates removed: 10,775
   3. Short queries removed: 34

 Class Distribution (After Cleaning):
   • 0 (Normal): 65,656 (49.6%)
   • 1