In [1]:
import pandas as pd
from scipy.stats import f_oneway
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

drive.mount('/content/drive')

# Directory on the mounted drive that holds the input file (note the trailing slash).
path = "/content/drive/MyDrive/"

# Read the combined CSV into the DataFrame that the rest of the notebook works on.
combined_csv_location = path + 'Combined.csv'
combinedDf = pd.read_csv(combined_csv_location)


Mounted at /content/drive


  combinedDf = pd.read_csv(path + 'Combined.csv')


In [None]:
# Information about data types and missing values.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
combinedDf.info()

# Summary statistics for numerical features (only int64/float64 columns are considered)
numerical_features = combinedDf.select_dtypes(include=['int64', 'float64']).columns
print(combinedDf[numerical_features].describe())

# Distributions of continuous features: one histogram with a KDE overlay per column
for feature in numerical_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(combinedDf[feature], kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

# Value counts for categorical features (columns stored with object dtype)
categorical_features = combinedDf.select_dtypes(include=['object']).columns
for feature in categorical_features:
    print(combinedDf[feature].value_counts())


In [7]:
# Report columns with high cardinality (the actual drop below is left disabled).
high_cardinality_threshold = 100000
# nunique() scans every column, so compute it once and reuse the result.
unique_counts = combinedDf.nunique()
high_cardinality_columns = unique_counts[unique_counts > high_cardinality_threshold].index
#combinedDf = combinedDf.drop(high_cardinality_columns, axis=1)
print(high_cardinality_columns)

# Select features based on ANOVA F-scores and singular value dominance.
# A numeric (int64/float64) column is kept when
#   1. a one-way ANOVA across the 'Label' groups gives p < 0.05, and
#   2. its most frequent value covers at most 80% of the rows.
# The grouping is loop-invariant, so it is built once instead of once per column.
groups_by_label = combinedDf.groupby('Label')
selected_features = []
for column in combinedDf.columns:
    if combinedDf[column].dtype in ['int64', 'float64']:
        # Selecting the column on the groupby yields one Series per label
        # without slicing the whole frame for every group.
        # NOTE(review): missing values are not handled before this test; with
        # f_oneway's default NaN handling a column containing NaN presumably
        # gets a NaN p-value and is therefore not selected - confirm this is intended.
        f_statistic, p_value = f_oneway(*[values for name, values in groups_by_label[column]])
        if p_value < 0.05:
            dominant_value_percentage = combinedDf[column].value_counts(normalize=True).max() * 100
            if dominant_value_percentage <= 80:
                selected_features.append(column)

# The label/target columns are always kept, but never counted as features.
label_columns = ['Label', 'Attack Type', 'Attack Tool']
remaining_features = [featureName for featureName in selected_features if featureName not in label_columns]
# .copy() makes the reduced frame independent of the original, so the later
# column assignment during scaling cannot trigger SettingWithCopyWarning.
combinedDf = combinedDf[remaining_features + label_columns].copy()


In [8]:
# Missing-value handling is still a placeholder: no imputation or row dropping
# happens in this cell (implement an appropriate strategy here).

# Scale the selected feature columns with StandardScaler; the label columns are
# not part of remaining_features and are therefore left untouched.
scaler = StandardScaler()
scaled_feature_values = scaler.fit_transform(combinedDf[remaining_features])
combinedDf[remaining_features] = scaled_feature_values


In [None]:
# Correlation heatmap.
# combinedDf still carries the label columns; corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict the input to numeric columns explicitly.
plt.figure(figsize=(10, 8))
numeric_columns_frame = combinedDf.select_dtypes(include='number')
sns.heatmap(numeric_columns_frame.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Pairwise scatter plots for numerical features
#sns.pairplot(combinedDf[numerical_features])
#plt.show()

# Distributions of numerical features (after scaling).
# Iterate over remaining_features: these are the columns that survived feature
# selection and were scaled. numerical_features was computed before selection,
# so it can name columns that no longer exist in combinedDf (KeyError).
for feature in remaining_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(combinedDf[feature], kde=True)
    plt.title(f'Distribution of {feature} (Scaled)')
    plt.show()

# Bar plots for categorical features.
# categorical_features also predates feature selection, so skip any column
# that has since been dropped from combinedDf.
available_categorical_features = [
    feature for feature in categorical_features if feature in combinedDf.columns
]
for feature in available_categorical_features:
    plt.figure(figsize=(8, 6))
    sns.countplot(data=combinedDf, x=feature, palette='Set2')
    plt.title(f'Bar Plot of {feature}')
    plt.xticks(rotation=45)
    plt.show()
