<a href="https://colab.research.google.com/github/Raziasultan-786/machine-learning-01/blob/main/CICIDS_2017_ML_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CMP7239 Applied Machine Learning Assignment
## Network Intrusion Detection using CICIDS 2017 Dataset

**Student:** [Your Name]  
**University:** Birmingham City University  
**Course:** CMP7239 Applied Machine Learning  
**Domain:** Cybersecurity - Network Intrusion Detection  

### Assignment Overview
This notebook implements a complete machine learning pipeline for network intrusion detection using the CICIDS 2017 dataset. The analysis includes data preprocessing, exploratory data analysis, implementation of multiple ML algorithms, and comprehensive performance evaluation.

### Dataset Information
The CICIDS 2017 dataset contains labelled network traffic, covering both normal (benign) activity and various types of attacks, including:
- DDoS attacks
- Port Scan attacks
- Web attacks
- Infiltration attacks
- Normal traffic


## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)

# Machine Learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, auc
)

# Optional dependency: XGBoost is used only when it can be imported;
# XGBOOST_AVAILABLE records the outcome for the cells that follow
try:
    import xgboost as xgb
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available, will use Logistic Regression instead")
else:
    XGBOOST_AVAILABLE = True
    print("XGBoost is available")

# Fix NumPy's global random seed for reproducibility
np.random.seed(42)

print("All libraries imported successfully!")

## 2. Data Loading and Initial Exploration

In [None]:
def load_cicids_data():
    """
    Read the CICIDS 2017 CSV files and stack them into a single DataFrame.

    Each file is attempted independently: a file that fails to load is
    reported and skipped, and the result is built from the files that
    did load.

    Returns: Combined DataFrame, or None when no file could be loaded
    """
    # Per-day capture files, looked up relative to the working directory
    csv_paths = (
        'Dataset/Monday-WorkingHours.pcap_ISCX.csv',
        'Dataset/Tuesday-WorkingHours.pcap_ISCX.csv',
        'Dataset/Wednesday-workingHours.pcap_ISCX.csv',
        'Dataset/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        'Dataset/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        'Dataset/Friday-WorkingHours-Morning.pcap_ISCX.csv',
        'Dataset/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
        'Dataset/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    )

    frames = []
    for path in csv_paths:
        try:
            print(f"Loading {path}...")
            frame = pd.read_csv(path)
            print(f"  Shape: {frame.shape}")
            frames.append(frame)
        except Exception as error:
            # Best effort: report the failure and carry on with the next file
            print(f"Error loading {path}: {error}")

    # Nothing could be read, so there is nothing to combine
    if not frames:
        print("No data loaded successfully")
        return None

    # Stack the per-file frames and renumber the rows from zero
    combined_df = pd.concat(frames, ignore_index=True)
    print(f"\nCombined dataset shape: {combined_df.shape}")
    return combined_df

# Load the dataset: load_cicids_data() reads and combines the CSV files;
# df is None when none of the files could be loaded
print("=== LOADING CICIDS 2017 DATASET ===")
df = load_cicids_data()

In [None]:
# First look at the loaded data: size, column names, dtype summary, sample rows
if df is None:
    print("Dataset not loaded properly")
else:
    print("=== INITIAL DATA EXPLORATION ===")
    print(f"Dataset shape: {df.shape}")

    # Numbered listing of every column (numbering starts at 1)
    print(f"\nColumn names:")
    for i, col in enumerate(df.columns):
        print(f"{i+1:2d}. {col}")

    # How many columns there are of each dtype
    print(f"\nData types:")
    print(df.dtypes.value_counts())

    # display() is the rich-output helper available inside notebooks
    print(f"\nFirst few rows:")
    display(df.head())

## 3. Data Cleaning and Preprocessing

In [None]:
def clean_dataset(df):
    """
    Clean the CICIDS dataset by handling missing values, duplicates, and
    infinite values.

    The input DataFrame is not modified; all work happens on a copy.
    Steps, in order:
      1. Report missing values, then fill numeric gaps with the column
         median and object (text) gaps with the column mode.
      2. Drop exact duplicate rows.
      3. Replace +inf / -inf in numeric columns with the largest /
         smallest finite value of that column.

    Args:
        df: DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    print("=== DATA CLEANING ===")

    # Make a copy to avoid modifying original
    df_clean = df.copy()

    # Check for missing values
    print("Missing values per column:")
    missing_values = df_clean.isnull().sum()
    missing_percent = (missing_values / len(df_clean)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing_values,
        'Percentage': missing_percent
    })
    print(missing_df[missing_df['Missing Count'] > 0])

    # Handle missing values
    if missing_values.sum() > 0:
        print("\nHandling missing values...")
        # For numerical columns, fill with median.
        # The result is assigned back to the frame: calling
        # fillna(inplace=True) on a column selection is chained assignment,
        # which is deprecated and has no effect under pandas Copy-on-Write.
        numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
        for col in numerical_cols:
            if df_clean[col].isnull().any():
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())

        # For categorical columns, fill with mode
        categorical_cols = df_clean.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            if df_clean[col].isnull().any():
                modes = df_clean[col].mode()
                # An entirely missing column has no mode; leave it as is
                if not modes.empty:
                    df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    # Check for duplicates
    duplicates = df_clean.duplicated().sum()
    print(f"\nDuplicate rows: {duplicates}")
    if duplicates > 0:
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {duplicates} duplicate rows")

    # Handle infinite values
    print("\nChecking for infinite values...")
    inf_cols = []
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        values = df_clean[col]
        if not np.isinf(values).any():
            continue
        inf_cols.append(col)
        # The bounds must come from the finite entries only: the plain
        # max/min of a column that contains +/-inf is itself +/-inf, which
        # would make the replacement a no-op.
        # If a column has no finite entries at all, the bounds are NaN and
        # the infinities become NaN.
        finite_values = values[np.isfinite(values)]
        df_clean[col] = values.replace(
            {np.inf: finite_values.max(), -np.inf: finite_values.min()}
        )

    if inf_cols:
        print(f"Handled infinite values in columns: {inf_cols}")
    else:
        print("No infinite values found")

    print(f"\nCleaned dataset shape: {df_clean.shape}")
    return df_clean

# Run the cleaning step only when the raw data was actually loaded
if df is None:
    print("Cannot clean dataset - data not loaded")
else:
    df_clean = clean_dataset(df)

In [None]:
def prepare_features_and_target(df):
    """
    Split a DataFrame into an encoded feature matrix and an encoded target.

    The target column is the first of 'Label', 'label', 'Label ', ' Label'
    that exists in df; when none of them exists, the last column is used.
    Object-typed feature columns and the target are cast to str and encoded
    with LabelEncoder.

    Returns: (X, y_encoded, target_encoder, label_encoders), where
    label_encoders maps each encoded feature column to its fitted encoder
    """
    print("=== FEATURE PREPARATION ===")

    # Spellings of the label column that are tried, in priority order
    label_candidates = ('Label', 'label', 'Label ', ' Label')
    target_col = next(
        (name for name in label_candidates if name in df.columns),
        None,
    )

    if target_col is None:
        # No known spelling matched: fall back to the last column
        target_col = df.columns[-1]
        print(f"No standard label column found, using last column: {target_col}")

    print(f"Target column: {target_col}")

    # Show how many rows fall into each target value
    print(f"\nTarget distribution:")
    print(df[target_col].value_counts())

    # Everything except the target column is treated as a feature
    y = df[target_col]
    X = df.drop(columns=[target_col])

    # Object-typed features need encoding before modelling
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    print(f"\nCategorical features: {categorical_features}")

    # One encoder per categorical feature, kept so it can be reused later
    label_encoders = {}
    for feature in categorical_features:
        encoder = LabelEncoder()
        X[feature] = encoder.fit_transform(X[feature].astype(str))
        label_encoders[feature] = encoder

    # The target is encoded the same way, with its own encoder
    target_encoder = LabelEncoder()
    y_encoded = target_encoder.fit_transform(y.astype(str))
    class_names = target_encoder.classes_

    print(f"\nFeature matrix shape: {X.shape}")
    print(f"Target vector shape: {y_encoded.shape}")
    print(f"Number of classes: {len(class_names)}")
    print(f"Classes: {class_names}")

    return X, y_encoded, target_encoder, label_encoders

# Build the model inputs only if the cleaning cell produced df_clean
if 'df_clean' not in locals():
    print("Cannot prepare features - cleaned data not available")
else:
    X, y, target_encoder, label_encoders = prepare_features_and_target(df_clean)