<a href="https://colab.research.google.com/github/Raziasultan-786/machine-learning-01/blob/main/CICIDS_2017_ML_Analysis_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CMP7239 Applied Machine Learning Assignment - Part 2
## Exploratory Data Analysis and Machine Learning Models

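**Note:** This is a continuation of the main analysis notebook. Run the first notebook (CICIDS_2017_ML_Analysis.ipynb) before running this one: the cells below expect the variables `X`, `y`, and `target_encoder`, as well as the library imports (`pd`, `plt`, `sns`, `train_test_split`, `StandardScaler`), to already be defined in the current session.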


## 4. Exploratory Data Analysis (EDA)

In [None]:
def perform_eda(X, y, target_encoder):
    """
    Perform comprehensive exploratory data analysis.

    Prints the dataset dimensions, displays per-feature summary statistics,
    prints the count and percentage of samples per class, and draws a bar
    chart and a pie chart of the class distribution.

    Args:
        X: Feature table; must provide ``shape`` and ``describe()``
            (presumably a pandas DataFrame - confirm against the caller).
        y: Target labels, one per row of ``X``. Presumably integer codes
            produced by ``target_encoder`` - confirm against the caller.
        target_encoder: Object exposing ``classes_``, the ordered class names.

    Returns:
        tuple: ``(feature_stats, class_counts)`` where ``feature_stats`` is
        the result of ``X.describe()`` and ``class_counts`` is a Series of
        per-class sample counts sorted by label.
    """
    print("=== EXPLORATORY DATA ANALYSIS ===")

    # Summary statistics
    print("\n1. SUMMARY STATISTICS")
    print("=" * 50)
    print(f"Dataset shape: {X.shape}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of samples: {X.shape[0]}")
    print(f"Number of classes: {len(target_encoder.classes_)}")

    # Feature statistics
    print("\nFeature Statistics:")
    feature_stats = X.describe()
    display(feature_stats)

    # Class distribution
    print("\n2. CLASS DISTRIBUTION")
    print("=" * 50)
    class_counts = pd.Series(y).value_counts().sort_index()
    class_names = target_encoder.classes_

    # Resolve a display name for every label that actually occurs in y.
    # Looking names up by the label value (rather than by position in
    # class_counts) keeps names and counts aligned even when some encoded
    # class has no samples in y. Labels that are not valid integer codes are
    # shown as-is.
    present_labels = [
        class_names[label]
        if pd.api.types.is_integer(label) and 0 <= label < len(class_names)
        else label
        for label in class_counts.index
    ]

    for label, count in zip(present_labels, class_counts):
        percentage = (count / len(y)) * 100
        print(f"{label}: {count:,} samples ({percentage:.2f}%)")

    # Visualizations
    print("\n3. VISUALIZATIONS")
    print("=" * 50)

    # Class distribution plot
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    class_counts.plot(kind='bar')
    plt.title('Class Distribution (Count)')
    plt.xlabel('Class')
    plt.ylabel('Count')
    # One tick per plotted bar, labelled with the class that bar represents
    plt.xticks(range(len(present_labels)), present_labels, rotation=45)

    plt.subplot(1, 2, 2)
    class_percentages = (class_counts / len(y)) * 100
    # labels must have exactly one entry per wedge
    plt.pie(class_percentages, labels=present_labels, autopct='%1.1f%%', startangle=90)
    plt.title('Class Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

    return feature_stats, class_counts

# Perform EDA only when everything perform_eda needs is already defined:
# the features, the target, and the encoder whose classes_ it reads.
# Checking target_encoder too avoids a NameError in a partly prepared session.
if 'X' in locals() and 'y' in locals() and 'target_encoder' in locals():
    feature_stats, class_counts = perform_eda(X, y, target_encoder)
else:
    print("Cannot perform EDA - features and target not prepared")

In [None]:
def plot_correlation_analysis(X, sample_size=10000):
    """
    Plot a correlation heatmap and report highly correlated feature pairs.

    When ``X`` has more than ``sample_size`` rows, a fixed-seed random sample
    of ``sample_size`` rows is used for all calculations. When ``X`` has more
    than 20 columns, the heatmap is restricted to the 20 columns with the
    highest variance in the sample; otherwise the full annotated matrix is
    drawn. Up to ten feature pairs with |correlation| > 0.8 are printed,
    strongest first.

    Args:
        X: Feature table providing ``sample``, ``corr``, ``var`` and
            ``columns`` (presumably a pandas DataFrame - confirm against
            the caller).
        sample_size: Maximum number of rows used for the calculations.

    Returns:
        The correlation matrix computed from the (possibly sampled) data.
    """
    print("\n4. CORRELATION ANALYSIS")
    print("=" * 50)

    # Sample data if too large for correlation matrix
    if len(X) > sample_size:
        print(f"Sampling {sample_size} rows for correlation analysis...")
        X_sample = X.sample(n=sample_size, random_state=42)
    else:
        X_sample = X

    # Calculate correlation matrix
    correlation_matrix = X_sample.corr()

    # Plot correlation heatmap
    plt.figure(figsize=(15, 12))

    # If too many features, show only top correlations
    if len(X.columns) > 20:
        # Get features with highest variance
        feature_variance = X_sample.var().sort_values(ascending=False)
        top_features = feature_variance.head(20).index
        correlation_subset = correlation_matrix.loc[top_features, top_features]

        sns.heatmap(correlation_subset, annot=False, cmap='coolwarm', center=0,
                   square=True, linewidths=0.5)
        plt.title('Correlation Heatmap (Top 20 Features by Variance)')
    else:
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                   square=True, linewidths=0.5, fmt='.2f')
        plt.title('Feature Correlation Heatmap')

    plt.tight_layout()
    plt.show()

    # Find highly correlated features. Only the upper triangle is scanned so
    # each unordered pair is reported once and the diagonal is skipped.
    # NaN correlations never satisfy the threshold comparison, so they are
    # left out.
    column_names = correlation_matrix.columns
    corr_values = correlation_matrix.to_numpy()
    high_corr_pairs = []
    for i, first_name in enumerate(column_names):
        for j in range(i + 1, len(column_names)):
            corr = corr_values[i, j]
            if abs(corr) > 0.8:
                high_corr_pairs.append((first_name, column_names[j], corr))

    # Rank by strength so the slice below really is the "top 10" rather than
    # the first ten pairs encountered in column order.
    high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    if high_corr_pairs:
        print("\nHighly correlated feature pairs (|correlation| > 0.8):")
        for feat1, feat2, corr in high_corr_pairs[:10]:  # Show top 10
            print(f"{feat1} - {feat2}: {corr:.3f}")
    else:
        print("\nNo highly correlated feature pairs found (|correlation| > 0.8)")

    return correlation_matrix

# Run the correlation analysis, unless the feature table has not been defined
if 'X' not in locals():
    print("Cannot perform correlation analysis - features not prepared")
else:
    correlation_matrix = plot_correlation_analysis(X)

## 5. Feature Scaling and Data Splitting

In [None]:
def prepare_data_for_ml(X, y, test_size=0.2, random_state=42):
    """
    Split the data into train/test partitions and scale the features.

    The split is stratified on ``y``. The ``StandardScaler`` is fitted on the
    training partition only and then applied to both partitions.

    Args:
        X: Feature table to split and scale.
        y: Target labels, also used as the stratification key.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``,
        where the first two items are the scaler's outputs and ``scaler`` is
        the fitted ``StandardScaler``.
    """
    print("=== DATA PREPARATION FOR MACHINE LEARNING ===")

    # Stratified hold-out split
    partitions = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    train_features, test_features, train_labels, test_labels = partitions

    print(f"Training set shape: {train_features.shape}")
    print(f"Test set shape: {test_features.shape}")
    print("Training set class distribution:")
    train_class_counts = pd.Series(train_labels).value_counts().sort_index()
    print(train_class_counts)

    # Fit on the training partition only; reuse the fitted scaler for test
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    print("\nFeatures scaled using StandardScaler")
    # Aggregates are taken over the whole scaled training output (no axis)
    overall_mean = train_scaled.mean()
    overall_std = train_scaled.std()
    print(f"Training set mean: {overall_mean:.6f}")
    print(f"Training set std: {overall_std:.6f}")

    return train_scaled, test_scaled, train_labels, test_labels, scaler

# Split and scale the data, unless the features or the target are missing
if 'X' not in locals() or 'y' not in locals():
    print("Cannot prepare data for ML - features and target not available")
else:
    X_train, X_test, y_train, y_test, scaler = prepare_data_for_ml(X, y)