In [2]:
import pandas as pd

# Define the input and output file paths
csv_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn.csv'  # Your 637 MB file
parquet_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn.parquet'

print(f"Reading {csv_file} into memory...")
# Read the entire CSV file into a Pandas DataFrame
df = pd.read_csv(csv_file, low_memory=False)

# --- ALTERNATIVE FIX ---
# Convert column to a numeric type. 'coerce' turns any errors into NaN (missing value).
print("Converting 'SimillarHTTP' to numeric, coercing errors...")
df['SimillarHTTP'] = pd.to_numeric(df['SimillarHTTP'], errors='coerce')
# ------------------------

print(f"DataFrame created with {len(df)} rows.")
print(f"Converting to {parquet_file}...")

# Save the DataFrame to a Parquet file
# 'pyarrow' is the engine that handles the conversion
# 'index=False' prevents Pandas from writing the DataFrame index as a column
df.to_parquet(parquet_file, engine='pyarrow', index=False)

print("✅ Conversion complete!")

Reading C:/Users/COOL/sdn_ddos_backend/dataset/Syn.csv into memory...
Converting 'SimillarHTTP' to numeric, coercing errors...
DataFrame created with 1582681 rows.
Converting to C:/Users/COOL/sdn_ddos_backend/dataset/Syn.parquet...
✅ Conversion complete!


In [1]:
import pandas as pd

csv_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn.csv'
final_parquet_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn-optimized.parquet'

print("Reading CSV...")
df = pd.read_csv(csv_file, low_memory=False)

print("Optimizing data types...")

# --- OPTIMIZATION STEPS ---
# 1. Convert low-cardinality string columns to category (add your column names here)
#    First, identify them, then convert them.
#    Example:
#    df['some_column'] = df['some_column'].astype('category')
#    df['another_column'] = df['another_column'].astype('category')
df['SimillarHTTP'] = df['SimillarHTTP'].astype(str).astype('category') # Fix from before + optimize


# 2. Downcast numeric types
for col in df.select_dtypes(include=['int']).columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')
for col in df.select_dtypes(include=['float']).columns:
    df[col] = pd.to_numeric(df[col], downcast='float')

print("\nOriginal memory usage:")
print(df.info(memory_usage='deep'))

# 3. Save with a strong compression algorithm
print(f"\nSaving to {final_parquet_file} with gzip compression...")
df.to_parquet(final_parquet_file, engine='pyarrow', index=False, compression='gzip')

print("✅ Optimized conversion complete!")

Reading CSV...
Optimizing data types...

Original memory usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1582681 entries, 0 to 1582680
Data columns (total 88 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   Unnamed: 0                    1582681 non-null  int32   
 1   Flow ID                       1582681 non-null  object  
 2    Source IP                    1582681 non-null  object  
 3    Source Port                  1582681 non-null  int32   
 4    Destination IP               1582681 non-null  object  
 5    Destination Port             1582681 non-null  int32   
 6    Protocol                     1582681 non-null  int8    
 7    Timestamp                    1582681 non-null  object  
 8    Flow Duration                1582681 non-null  int32   
 9    Total Fwd Packets            1582681 non-null  int16   
 10   Total Backward Packets       1582681 non-null  int16   
 11  Total Length

In [3]:
import pandas as pd

# Define the path to your Parquet file
parquet_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn-optimized.parquet'


print(f"Analyzing {parquet_file}...")

try:
    df = pd.read_parquet(parquet_file)

    # --- Basic Data Exploration ---
    print("\n✅ File loaded successfully.")
    print(f"The dataset contains {len(df)} rows and {len(df.columns)} columns.")

    # --- Check for Attack and Benign Labels ---
    # CORRECTED: Use ' Label' with the leading space
    label_column = ' Label'

    if label_column in df.columns:
        print(f"\n--- Analysis of '{label_column}' column ---")
        
        # Count the occurrences of each label
        print("\nDistribution of traffic types:")
        label_counts = df[label_column].value_counts()
        print(label_counts)

        # --- Final Conclusion ---
        print("\n--- Conclusion ---")
        if 'BENIGN' in label_counts.index:
            print(f"✔️ The dataset contains BENIGN traffic ({label_counts['BENIGN']} rows).")
        else:
            print("❌ The dataset does NOT contain BENIGN traffic.")
            
        if 'Syn' in label_counts.index:
            print(f"✔️ The dataset contains Syn (TCP SYN Flood) attacks ({label_counts['Syn']} rows).")
        else:
            print("❌ The dataset does NOT contain Syn attacks.")

    else:
        # This part should not run now, but it's good practice to keep it
        print(f"\n⚠️ Could not find the expected label column.")

except FileNotFoundError:
    print(f"Error: The file '{parquet_file}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Analyzing C:/Users/COOL/sdn_ddos_backend/dataset/Syn-optimized.parquet...

✅ File loaded successfully.
The dataset contains 1582681 rows and 88 columns.

--- Analysis of ' Label' column ---

Distribution of traffic types:
 Label
Syn       1582289
BENIGN        392
Name: count, dtype: int64

--- Conclusion ---
✔️ The dataset contains BENIGN traffic (392 rows).
✔️ The dataset contains Syn (TCP SYN Flood) attacks (1582289 rows).


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# --- 1. Load the Dataset ---
print("Loading the dataset...")
df = pd.read_parquet(r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn-optimized.parquet')

# --- 2. Initial Data Cleaning ---
df.columns = df.columns.str.strip()
print("✅ Column names cleaned.")

# --- THE FINAL FIX: Replace Infinite Values ---
# This is the key step. We find all occurrences of infinity (both positive and negative)
# and replace them with NaN. The imputer will then handle these NaN values.
print("Replacing infinite values with NaN...")
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("✅ Infinite values handled.")


# --- 3. Isolate Numeric Data for Training ---
# Separate features (X) from the target label (y)
X = df.drop('Label', axis=1)
y = df['Label']

# Identify and select only numeric columns
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
X_numeric = X[numeric_cols]
print(f"✅ Identified {len(numeric_cols)} numeric columns for training.")


# --- 4. Split Data into Training and Test Sets ---
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=y)
print(f"Original training set distribution: {Counter(y_train)}")


# --- 5. Impute Missing Values (NaN) ---
# This imputer will now handle both original NaNs and the ones we just created from infinite values.
print("Imputing missing (NaN) values using the median strategy...")
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
print("✅ Missing values imputed.")


# --- 6. Apply Faster Balancing Strategy ---
print("Balancing the dataset using the fast hybrid method...")

# Step 1: Undersample the majority class
under = RandomUnderSampler(sampling_strategy={'Syn': 300000}, random_state=42)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)
print(f"After undersampling: {Counter(y_train_under)}")

# Step 2: Run SMOTE on the smaller, undersampled data
over = SMOTE(sampling_strategy={'BENIGN': 300000}, random_state=42)
X_train_balanced, y_train_balanced = over.fit_resample(X_train_under, y_train_under)
print(f"New balanced training set distribution: {Counter(y_train_balanced)}")


# --- 7. Save the Final, Balanced Dataset ---
print("\nSaving the balanced dataset...")

# Convert the NumPy array back to a DataFrame with the correct column names
X_train_balanced_df = pd.DataFrame(X_train_balanced, columns=numeric_cols)
y_train_balanced.reset_index(drop=True, inplace=True)
balanced_df = pd.concat([X_train_balanced_df, y_train_balanced], axis=1)

# Save the final dataset
output_file = r'C:/Users/COOL/sdn_ddos_backend/dataset/Syn-balanced-training-data.parquet'
balanced_df.to_parquet(output_file, index=False)

print(f"🎉 Success! The fully cleaned and balanced training data has been saved to '{output_file}'")

Loading the dataset...
✅ Column names cleaned.
Replacing infinite values with NaN...
✅ Infinite values handled.
✅ Identified 82 numeric columns for training.
Original training set distribution: Counter({'Syn': 1265830, 'BENIGN': 314})
Imputing missing (NaN) values using the median strategy...
✅ Missing values imputed.
Balancing the dataset using the fast hybrid method...
After undersampling: Counter({'Syn': 300000, 'BENIGN': 314})
New balanced training set distribution: Counter({'BENIGN': 300000, 'Syn': 300000})

Saving the balanced dataset...
🎉 Success! The fully cleaned and balanced training data has been saved to 'C:/Users/COOL/sdn_ddos_backend/dataset/Syn-balanced-training-data.parquet'
