In [None]:
pip install graphviz

In [None]:
# Cell 1: Import libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
warnings.filterwarnings('ignore')
print("Libraries imported successfully!")

In [None]:
# Cell 2: Load and explore data
def load_data(file_path, sample_size=100000):
    """
    Load the TII-SSRC-23 Dataset, thinning large files while they are read.

    When the file holds more than ``sample_size`` data rows, only every
    ``total_rows // sample_size``-th data row is kept. This is a systematic
    (deterministic, evenly spaced) sample rather than a random one, and it
    yields at least ``sample_size`` rows and fewer than twice that many.
    Smaller files are loaded in full.

    Args:
        file_path: Path to the dataset CSV file
        sample_size: Number of rows to sample for analysis (default: 100000)

    Returns:
        df: Loaded DataFrame (sampled)

    Raises:
        ValueError: If ``sample_size`` is smaller than 1.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be a positive integer, got {sample_size!r}")

    print(f"Loading dataset from {file_path}...")
    # Count lines without loading the file. Binary mode avoids decoding errors
    # from the platform default codec, and the context manager guarantees the
    # handle is closed again.
    with open(file_path, 'rb') as handle:
        total_rows = sum(1 for _ in handle) - 1  # subtract header
    print(f"Total rows in dataset: {total_rows}")

    if total_rows > sample_size:
        skip_rate = total_rows // sample_size
        print(f"Using a sample of approximately {sample_size} rows...")
        # Row 0 is the header and is always kept; of the data rows only the
        # multiples of skip_rate survive. A callable avoids materialising a
        # list with one entry per skipped row.
        df = pd.read_csv(
            file_path,
            skiprows=lambda row_idx: row_idx > 0 and row_idx % skip_rate != 0,
        )
    else:
        df = pd.read_csv(file_path)

    print(f"Dataset loaded successfully with shape: {df.shape}")
    return df

def explore_data(df):
    """
    Print an overview of the dataset and chart its class balance.

    Reports shape, head, dtypes, numeric summary statistics and the value
    counts of the 'Label', 'Traffic Type' and 'Traffic Subtype' columns,
    draws bar charts for the first two, and finally checks a subset of
    columns for missing values.

    Args:
        df: DataFrame to explore
    """
    def _bar_chart(counts, figsize, color, title, xlabel):
        # Shared layout for the class-distribution bar charts.
        plt.figure(figsize=figsize)
        counts.plot(kind='bar', color=color)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    print("\n===== DATASET EXPLORATION =====")

    print(f"\nDataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nData types:")
    print(df.dtypes)

    print("\nSummary statistics of numerical features:")
    # Restrict describe() to the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    numeric_view = df[numeric_cols]
    print(numeric_view.describe())

    # Value counts of the three target columns, printed in turn
    distributions = {}
    for heading, column in (
        ("\nTraffic label distribution:", 'Label'),
        ("\nTraffic Type distribution:", 'Traffic Type'),
        ("\nTraffic Subtype distribution:", 'Traffic Subtype'),
    ):
        print(heading)
        distributions[column] = df[column].value_counts()
        print(distributions[column])

    # Charts for the label and traffic-type balance
    _bar_chart(distributions['Label'], (12, 6), 'skyblue', 'Label Distribution', 'Label')
    _bar_chart(distributions['Traffic Type'], (14, 6), 'lightgreen',
               'Traffic Type Distribution', 'Traffic Type')

    # Missing-value check on the targets plus the first 20 numeric columns
    print("\nChecking for missing values in key columns...")
    key_columns = ['Label', 'Traffic Type', 'Traffic Subtype']
    key_columns = key_columns + numeric_cols[:20].tolist()
    missing_values = df[key_columns].isnull().sum()
    nothing_missing = not (missing_values.sum() > 0)
    if nothing_missing:
        print("\nNo missing values found in the selected columns.")
    else:
        print("\nMissing values in selected columns:")
        print(missing_values[missing_values > 0])

# Load the dataset with sampling.
# The CSV location can be overridden through the TRAFFIC_DATA_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used.
file_path = os.environ.get(
    "TRAFFIC_DATA_CSV",
    "C:\\Users\\vansh\\Downloads\\INTEL-CIC-DIS-2017-18-main\\data.csv",
)
df = load_data(file_path)
explore_data(df)

In [None]:
# Cell 3: Preprocess data
print("Preprocessing data...")

# Work on a copy so the sampled frame in `df` stays untouched
df_cleaned = df.copy()

# Remove leading and trailing spaces from column names FIRST: every later step
# addresses columns by name, so the names captured in `numeric_cols` must be
# the final ones (stripping afterwards would leave stale names behind).
df_cleaned.columns = df_cleaned.columns.str.strip()

# Handle missing values
print(f"Missing values before cleaning: {df_cleaned.isnull().sum().sum()}")
df_cleaned = df_cleaned.fillna(0)
print(f"Missing values after cleaning: {df_cleaned.isnull().sum().sum()}")

# Handle infinite values
# Process numeric columns only, one at a time, to save memory. NaNs are already
# gone at this point, so +/-inf can be mapped to 0 directly.
numeric_cols = df_cleaned.select_dtypes(include=['number']).columns
for col in numeric_cols:
    df_cleaned[col] = df_cleaned[col].replace([np.inf, -np.inf], 0)

# Encode the target variables
label_encoder = LabelEncoder()
df_cleaned['Label_Encoded'] = label_encoder.fit_transform(df_cleaned['Label'])

traffic_type_encoder = LabelEncoder()
df_cleaned['Traffic_Type_Encoded'] = traffic_type_encoder.fit_transform(df_cleaned['Traffic Type'])

traffic_subtype_encoder = LabelEncoder()
df_cleaned['Traffic_Subtype_Encoded'] = traffic_subtype_encoder.fit_transform(df_cleaned['Traffic Subtype'])

# Print the label mapping for reference (the position of a class in
# `classes_` is the integer code the encoder assigns to it)
print("\nLabel mapping:")
for code, label in enumerate(label_encoder.classes_):
    print(f"{label} -> {code}")

print("\nTraffic Type mapping:")
for code, label in enumerate(traffic_type_encoder.classes_):
    print(f"{label} -> {code}")

print("\nTraffic Subtype mapping:")
for code, label in enumerate(traffic_subtype_encoder.classes_):
    print(f"{label} -> {code}")

# Memory optimization: convert to more memory-efficient data types where this
# is lossless. Columns holding only whole numbers are downcast to the smallest
# signed integer type; columns with fractional values stay floating point
# (casting them to an integer type would truncate the fractions) and are only
# narrowed to float32 when pandas judges the narrower type accurate enough.
for col in numeric_cols:
    compact = pd.to_numeric(df_cleaned[col], downcast='integer')
    if pd.api.types.is_float_dtype(compact):
        compact = pd.to_numeric(compact, downcast='float')
    df_cleaned[col] = compact

print("\nMemory usage after optimization:")
print(f"{df_cleaned.memory_usage().sum() / 1024**2:.2f} MB")

In [None]:
# Cell 4: Feature selection
# Show which columns are available after preprocessing
print("Available columns in the dataset:")
print(df_cleaned.columns.tolist())

# Preferred features for traffic classification. Names that do not occur in
# the dataframe are dropped by the filter below, so adjust the list if the
# dataset uses different column names.
preferred_traffic_features = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
    'Flow Packets/s', 'Flow Bytes/s', 'Flow IAT Mean',
    'Fwd Packet Length Mean', 'Bwd Packet Length Mean',
    'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg',
    'Packet Length Mean', 'Packet Length Std'
]
traffic_features = [name for name in preferred_traffic_features if name in df_cleaned.columns]

# With fewer than 10 preferred features present, fall back to the first
# numeric columns that are neither targets nor identifiers
if len(traffic_features) < 10:
    print("Some key features are missing. Selecting top numeric features instead.")
    exclude_cols = ['Label', 'Traffic Type', 'Traffic Subtype',
                   'Label_Encoded', 'Traffic_Type_Encoded', 'Traffic_Subtype_Encoded',
                   'Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
    numeric_candidates = df_cleaned.select_dtypes(include=['number']).columns
    potential_features = [name for name in numeric_candidates if name not in exclude_cols]
    # At most 15 features (slicing copes with shorter lists)
    traffic_features = potential_features[:15]

print(f"\nSelected {len(traffic_features)} features for traffic classification:")
print(traffic_features)

# Feature matrix and the three encoded targets for traffic classification
X_traffic = df_cleaned[traffic_features]
y_traffic_label = df_cleaned['Label_Encoded']
y_traffic_type = df_cleaned['Traffic_Type_Encoded']
y_traffic_subtype = df_cleaned['Traffic_Subtype_Encoded']

print(f"\nX shape: {X_traffic.shape}")
print(f"y_traffic_label shape: {y_traffic_label.shape}")
print(f"y_traffic_type shape: {y_traffic_type.shape}")
print(f"y_traffic_subtype shape: {y_traffic_subtype.shape}")

# Preferred features for threat detection, filtered to existing columns the
# same way as above
preferred_threat_features = [
    'Flow Duration', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
    'Packet Length Mean', 'Flow Packets/s', 'Flow Bytes/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Mean', 'Bwd IAT Mean',
    'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
    'FWD Init Win Bytes', 'Bwd Init Win Bytes'
]
threat_features = [name for name in preferred_threat_features if name in df_cleaned.columns]

# With fewer than 10 of them present, reuse the traffic feature list
if len(threat_features) < 10:
    print("Some key threat detection features are missing. Using traffic features instead.")
    threat_features = traffic_features

print(f"\nSelected {len(threat_features)} features for threat detection:")
print(threat_features)

# Feature matrix and target for threat detection (the target is the same
# encoded 'Label' column used for traffic label classification)
X_threat = df_cleaned[threat_features]
y_threat = df_cleaned['Label_Encoded']

print(f"\nX shape: {X_threat.shape}, y shape: {y_threat.shape}")

In [None]:
# Cell 5: Feature scaling
# Standardise both feature matrices. Each scaler is fitted on the complete
# matrix (the train/test split happens in a later cell) and kept around so the
# same transformation can be applied to new samples at prediction time.
scaler_traffic = StandardScaler().fit(X_traffic)
X_traffic_scaled = scaler_traffic.transform(X_traffic)

scaler_threat = StandardScaler().fit(X_threat)
X_threat_scaled = scaler_threat.transform(X_threat)

print("Features scaled successfully!")

In [None]:
# Cell 6: Split data
# Every task uses the same 70/30 stratified split settings; only the feature
# matrix and the target being stratified on differ.
split_options = {"test_size": 0.3, "random_state": 42}

# Traffic classification (Label)
(X_train_traffic_label, X_test_traffic_label,
 y_train_traffic_label, y_test_traffic_label) = train_test_split(
    X_traffic_scaled, y_traffic_label, stratify=y_traffic_label, **split_options
)

# Traffic type classification
(X_train_traffic_type, X_test_traffic_type,
 y_train_traffic_type, y_test_traffic_type) = train_test_split(
    X_traffic_scaled, y_traffic_type, stratify=y_traffic_type, **split_options
)

# Traffic subtype classification
(X_train_traffic_subtype, X_test_traffic_subtype,
 y_train_traffic_subtype, y_test_traffic_subtype) = train_test_split(
    X_traffic_scaled, y_traffic_subtype, stratify=y_traffic_subtype, **split_options
)

# Threat detection
(X_train_threat, X_test_threat,
 y_train_threat, y_test_threat) = train_test_split(
    X_threat_scaled, y_threat, stratify=y_threat, **split_options
)

print("Data split complete!")
print(f"Traffic Label Classification - Training set: {X_train_traffic_label.shape}, Test set: {X_test_traffic_label.shape}")
print(f"Traffic Type Classification - Training set: {X_train_traffic_type.shape}, Test set: {X_test_traffic_type.shape}")
print(f"Traffic Subtype Classification - Training set: {X_train_traffic_subtype.shape}, Test set: {X_test_traffic_subtype.shape}")
print(f"Threat Detection - Training set: {X_train_threat.shape}, Test set: {X_test_threat.shape}")

In [None]:
# Cell 7: Define model training and evaluation functions
def train_and_evaluate_model(X_train, X_test, y_train, y_test, class_names, model_type="traffic"):
    """Train a Random Forest and a depth-limited Decision Tree and report test metrics.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded test labels.
        class_names: Display names of the classes, ordered so that position
            ``i`` names encoded label ``i`` (e.g. ``LabelEncoder.classes_``).
        model_type: Short task description used only in the progress messages.

    Returns:
        Tuple ``(rf_model, dt_model, y_pred_rf, y_pred_dt)`` with the fitted
        models and their predictions for ``X_test``.
    """

    # Initialize models
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)

    # Train Random Forest model
    print(f"Training Random Forest model for {model_type} classification...")
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # Train Decision Tree model
    print(f"Training Decision Tree model for {model_type} classification...")
    dt_model.fit(X_train, y_train)
    y_pred_dt = dt_model.predict(X_test)

    # Evaluate Random Forest model
    print("\nRandom Forest Model Evaluation:")
    rf_accuracy = accuracy_score(y_test, y_pred_rf)
    print(f"Accuracy: {rf_accuracy:.4f}")

    # Evaluate Decision Tree model
    print("\nDecision Tree Model Evaluation:")
    dt_accuracy = accuracy_score(y_test, y_pred_dt)
    print(f"Accuracy: {dt_accuracy:.4f}")

    # Print classification report for Random Forest.
    # Passing the full list of encoded labels keeps the report aligned with
    # class_names even when a class is absent from both y_test and the
    # predictions; without it the name count can disagree with the labels
    # found in the data and the call fails.
    encoded_labels = list(range(len(class_names)))
    print("\nRandom Forest Classification Report:")
    print(classification_report(
        y_test,
        y_pred_rf,
        labels=encoded_labels,
        target_names=[str(name) for name in class_names],
    ))

    return rf_model, dt_model, y_pred_rf, y_pred_dt

def plot_confusion_matrix(y_true, y_pred, class_names, title):
    """Plot the confusion matrix of integer-encoded labels as an annotated heatmap.

    Args:
        y_true: Integer-encoded ground-truth labels.
        y_pred: Integer-encoded predicted labels.
        class_names: Display names of the classes, ordered so that position
            ``i`` names encoded label ``i``; used as tick labels on both axes.
        title: Figure title.
    """
    # Fix the label set explicitly so the matrix always has one row/column per
    # entry of class_names, even if some class occurs in neither y_true nor
    # y_pred; otherwise the tick labels could be attached to the wrong classes.
    encoded_labels = list(range(len(class_names)))
    cm = confusion_matrix(y_true, y_pred, labels=encoded_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def plot_feature_importance(model, feature_names, title):
    """Draw a bar chart of the model's feature importances and list the top ten.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names of the features, in the column order the model
            was trained on.
        title: Figure title.
    """
    importances = model.feature_importances_
    # Feature positions ordered from most to least important
    indices = np.argsort(importances)[::-1]
    bar_positions = range(len(importances))

    plt.figure(figsize=(12, 8))
    plt.title(title)
    plt.bar(bar_positions, importances[indices], align='center')
    plt.xticks(bar_positions, [feature_names[pos] for pos in indices], rotation=90)
    plt.tight_layout()
    plt.show()

    # Text summary of the ten strongest features (fewer if the model has fewer)
    print("Top 10 important features:")
    for rank, pos in enumerate(indices[:10], start=1):
        print(f"{rank}. {feature_names[pos]}: {importances[pos]:.4f}")

In [None]:
# Cell 8: Train and evaluate traffic label classification model
# Class display names in encoded order (position i names encoded label i)
label_class_names = label_encoder.classes_
rf_label, dt_label, y_pred_rf_label, y_pred_dt_label = train_and_evaluate_model(
    X_train_traffic_label, X_test_traffic_label, y_train_traffic_label, y_test_traffic_label,
    label_class_names, "traffic label"
)

# Plot confusion matrix for traffic label classification (Random Forest)
plot_confusion_matrix(
    y_test_traffic_label,
    y_pred_rf_label,
    label_class_names,
    "Traffic Label Classification - Random Forest Confusion Matrix"
)

# Plot traffic label classification feature importance
plot_feature_importance(
    rf_label,
    traffic_features,
    "Traffic Label Classification Feature Importance"
)

# Plot decision tree.
# class_names is handed over as a plain list: `classes_` is a NumPy array, and
# some scikit-learn releases appear to accept only a list here (TODO confirm
# against the installed version). The tree was trained on the standardised
# features, so the split thresholds in the figure are in scaled units.
plt.figure(figsize=(20, 15))
plot_tree(
    dt_label,
    feature_names=traffic_features,
    class_names=list(label_class_names),
    filled=True,
    rounded=True
)
plt.title("AI-Powered Traffic Label Classification Decision Tree")
plt.show()

In [None]:
# Cell 9: Train and evaluate traffic type classification model
# Class display names in encoded order for the traffic type target
type_class_names = traffic_type_encoder.classes_

# Fit both models on the traffic-type split and keep models and predictions
rf_type, dt_type, y_pred_rf_type, y_pred_dt_type = train_and_evaluate_model(
    X_train=X_train_traffic_type,
    X_test=X_test_traffic_type,
    y_train=y_train_traffic_type,
    y_test=y_test_traffic_type,
    class_names=type_class_names,
    model_type="traffic type",
)

# Random Forest confusion matrix for traffic type classification
plot_confusion_matrix(
    y_true=y_test_traffic_type,
    y_pred=y_pred_rf_type,
    class_names=type_class_names,
    title="Traffic Type Classification - Random Forest Confusion Matrix",
)

# Random Forest feature importance for traffic type classification
plot_feature_importance(
    model=rf_type,
    feature_names=traffic_features,
    title="Traffic Type Classification Feature Importance",
)

In [None]:
# Cell 10: Train and evaluate traffic subtype classification model
# Class display names in encoded order for the traffic subtype target
subtype_class_names = traffic_subtype_encoder.classes_

# Fit both models on the traffic-subtype split and keep models and predictions
rf_subtype, dt_subtype, y_pred_rf_subtype, y_pred_dt_subtype = train_and_evaluate_model(
    X_train=X_train_traffic_subtype,
    X_test=X_test_traffic_subtype,
    y_train=y_train_traffic_subtype,
    y_test=y_test_traffic_subtype,
    class_names=subtype_class_names,
    model_type="traffic subtype",
)

# Random Forest confusion matrix for traffic subtype classification
plot_confusion_matrix(
    y_true=y_test_traffic_subtype,
    y_pred=y_pred_rf_subtype,
    class_names=subtype_class_names,
    title="Traffic Subtype Classification - Random Forest Confusion Matrix",
)

# Random Forest feature importance for traffic subtype classification
plot_feature_importance(
    model=rf_subtype,
    feature_names=traffic_features,
    title="Traffic Subtype Classification Feature Importance",
)

In [None]:
# Cell 11: Train and evaluate threat detection model
# Threat detection predicts the same encoded 'Label' target as the traffic
# label task, but from the threat feature set; the class names are therefore
# the ones taken from label_encoder.
rf_threat, dt_threat, y_pred_rf_threat, y_pred_dt_threat = train_and_evaluate_model(
    X_train_threat, X_test_threat, y_train_threat, y_test_threat,
    label_class_names, "threat"
)

# Plot confusion matrix for threat detection (Random Forest)
plot_confusion_matrix(
    y_test_threat,
    y_pred_rf_threat,
    label_class_names,
    "Threat Detection - Random Forest Confusion Matrix"
)

# Plot threat detection feature importance
plot_feature_importance(
    rf_threat,
    threat_features,
    "Threat Detection Feature Importance"
)

# Plot the decision tree.
# class_names is handed over as a plain list: label_class_names is a NumPy
# array, and some scikit-learn releases appear to accept only a list here
# (TODO confirm against the installed version). The tree was trained on the
# standardised features, so the split thresholds are in scaled units.
plt.figure(figsize=(20, 15))
plot_tree(
    dt_threat,
    feature_names=threat_features,
    class_names=list(label_class_names),
    filled=True,
    rounded=True
)
plt.title("Threat Detection & Anomaly Identification Decision Tree")
plt.show()

In [None]:
# Cell 12: Create prediction functions
def predict_traffic(model, scaler, features, feature_names, class_names):
    """
    Classify one sample and return its class name and class probabilities.

    Args:
        model: Fitted classifier providing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer providing ``transform``.
        features: Feature values of the sample, ordered like ``feature_names``.
        feature_names: Column names matching ``features``.
        class_names: Sequence indexed by the model's predicted label.

    Returns:
        Tuple ``(predicted_class, probabilities)``: the entry of
        ``class_names`` selected by the prediction, and the first row of
        ``model.predict_proba``.
    """
    # Single-row frame so the scaler receives named columns
    sample = pd.DataFrame([features], columns=feature_names)
    sample_scaled = scaler.transform(sample)

    # Predicted label -> display name
    predicted_code = model.predict(sample_scaled)[0]
    predicted_class = class_names[predicted_code]

    # Class probabilities for the same sample
    class_probabilities = model.predict_proba(sample_scaled)[0]

    return predicted_class, class_probabilities

In [None]:
# Cell 13: Example prediction
# Example new traffic data (replace with actual values).
# The values are keyed by feature name and then arranged in the order of
# traffic_features, so the sample stays aligned with the trained models even
# when feature selection kept only part of the preferred feature list.
example_flow = {
    'Flow Duration': 5.0,
    'Total Fwd Packet': 50,
    'Total Bwd packets': 30,
    'Total Length of Fwd Packet': 10000,
    'Total Length of Bwd Packet': 5000,
    'Flow Packets/s': 16.0,
    'Flow Bytes/s': 3000.0,
    'Flow IAT Mean': 0.1,
    'Fwd Packet Length Mean': 200,
    'Bwd Packet Length Mean': 166.6,
    'Average Packet Size': 185.7,
    'Fwd Segment Size Avg': 200,
    'Bwd Segment Size Avg': 166.6,
    'Packet Length Mean': 185.7,
    'Packet Length Std': 20,
}

# Features selected for training that have no example value above (possible
# when the fallback feature selection was used) get the median observed in
# X_traffic, so the cell still runs with a sensible value for them.
features_without_example = [name for name in traffic_features if name not in example_flow]
if features_without_example:
    print(f"No example value for {features_without_example}; using the column median instead.")

new_traffic_data = [
    example_flow[name] if name in example_flow else float(X_traffic[name].median())
    for name in traffic_features
]

# Make predictions for the new traffic data
print("\n--- Traffic Label Prediction ---")
predicted_label, label_probabilities = predict_traffic(
    rf_label,
    scaler_traffic,
    new_traffic_data,
    traffic_features,
    label_class_names
)

print(f"Predicted traffic label: {predicted_label}")
print("Prediction probabilities:")
for i, class_name in enumerate(label_class_names):
    print(f"{class_name}: {label_probabilities[i]:.4f}")

print("\n--- Traffic Type Prediction ---")
predicted_type, type_probabilities = predict_traffic(
    rf_type,
    scaler_traffic,
    new_traffic_data,
    traffic_features,
    type_class_names
)

print(f"Predicted traffic type: {predicted_type}")
print("Prediction probabilities:")
for i, class_name in enumerate(type_class_names):
    print(f"{class_name}: {type_probabilities[i]:.4f}")

print("\n--- Traffic Subtype Prediction ---")
predicted_subtype, subtype_probabilities = predict_traffic(
    rf_subtype,
    scaler_traffic,
    new_traffic_data,
    traffic_features,
    subtype_class_names
)

print(f"Predicted traffic subtype: {predicted_subtype}")
print("Prediction probabilities:")
for i, class_name in enumerate(subtype_class_names):
    print(f"{class_name}: {subtype_probabilities[i]:.4f}")

In [None]:
# Cell 14: Save models
# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Every object to persist, keyed by its file name inside 'models/'.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
artifacts = {
    # Traffic classification models
    'rf_label_model.pkl': rf_label,
    'dt_label_model.pkl': dt_label,
    'rf_type_model.pkl': rf_type,
    'dt_type_model.pkl': dt_type,
    'rf_subtype_model.pkl': rf_subtype,
    'dt_subtype_model.pkl': dt_subtype,
    # Threat detection models
    'rf_threat_model.pkl': rf_threat,
    'dt_threat_model.pkl': dt_threat,
    # Scalers
    'scaler_traffic.pkl': scaler_traffic,
    'scaler_threat.pkl': scaler_threat,
    # Encoders
    'label_encoder.pkl': label_encoder,
    'traffic_type_encoder.pkl': traffic_type_encoder,
    'traffic_subtype_encoder.pkl': traffic_subtype_encoder,
}

for file_name, artifact in artifacts.items():
    with open(f'models/{file_name}', 'wb') as f:
        pickle.dump(artifact, f)

print("Models, scalers, and encoders saved successfully in the 'models' directory!")