In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Project layout: the project root is taken to be the parent of the current
# working directory, with "data" and "output" folders directly beneath it.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR, OUTPUT_DIR = PROJECT_ROOT / "data", PROJECT_ROOT / "output"

# Create the output folder if it is missing; a no-op when it already exists.
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Load the summary tables stored as CSV files in OUTPUT_DIR.
# The files are read in the order listed and bound to the names on the left.
missing_df, high_corr, const_cols, nzv_cols, id_like_cols = (
    pd.read_csv(OUTPUT_DIR / csv_name)
    for csv_name in (
        "missing_data_summary.csv",
        "high_correlation_pairs.csv",
        "constant_features.csv",
        "nzv_features.csv",
        "id_like_features.csv",
    )
)

In [None]:
# Horizontal bar chart of the features with the largest share of missing values.
# NOTE(review): the names and percentages below are hard-coded even though
# missing_df is loaded earlier in the notebook - confirm they still match
# missing_data_summary.csv.
feature_names = [
    "Fwd Header Length",
    "Fwd Packet Length Min",
    "Fwd Avg Bytes/Bulk",
    "Fwd Packet Length Max",
    "Total Length of Fwd Packets ",
    "Total Backward Packets",
    "Bwd IAT Mean",
]
missing_percentages = [3.79, 2.84, 2.35, 2.23, 1.73, 1.59, 0.12]
top_missing = pd.DataFrame(
    list(zip(feature_names, missing_percentages)),
    columns=["feature", "missing_percent"],
)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    data=top_missing,
    x="missing_percent",
    y="feature",
    palette="crest",
    orient="h",
    ax=ax,
)
ax.set_title("top features with missing values")
ax.set_xlabel("missing percentage (%)")
ax.set_ylabel("feature name")

# Write the percentage just to the right of each bar (row index == bar position).
for row, pct in enumerate(top_missing["missing_percent"]):
    ax.text(pct + 0.05, row, f"{pct:.2f}%", va="center", fontsize=8)

fig.tight_layout()
plt.show()

In [None]:
# Feature type distribution.
# Each per-type summary CSV is looked up in OUTPUT_DIR first (where the other
# summary files of this notebook are read from) and, failing that, relative to
# the working directory as before. The row count of each file is used as the
# number of features of that type.
def _summary_csv(filename):
    """Return the path of a summary CSV, preferring OUTPUT_DIR over the working directory."""
    in_output_dir = OUTPUT_DIR / filename
    return in_output_dir if in_output_dir.exists() else Path(filename)


type_counts = {
    "numeric": len(pd.read_csv(_summary_csv("numeric_summary.csv"))),
    "categorical": len(pd.read_csv(_summary_csv("categorical_features.csv"))),
    "boolean": len(pd.read_csv(_summary_csv("boolean_features.csv"))),
    "datetime-like": len(pd.read_csv(_summary_csv("datetime_features.csv"))),
}

# Share of each type in percent. When every summary is empty, total is 0;
# report 0.0 for each type instead of failing with ZeroDivisionError.
total = sum(type_counts.values())
percentages = [v / total * 100 if total else 0.0 for v in type_counts.values()]

plt.figure(figsize=(6, 4))
ax = sns.barplot(x=list(type_counts.keys()), y=list(type_counts.values()), palette="pastel")
plt.title("distribution of feature types")
plt.ylabel("number of features")
plt.xlabel("feature type")

# Label every bar with its percentage, lifted above the bar top by 1% of the total.
for i, (count, pct) in enumerate(zip(type_counts.values(), percentages)):
    ax.text(i, count + total * 0.01, f"{pct:.1f}%",
            ha="center", va="bottom", fontsize=9, fontweight="bold", color="black")
plt.tight_layout()
plt.show()

In [None]:
# Constant, near-zero variance, and ID-like features: number of rows in each
# of the corresponding tables loaded earlier.
flagged_tables = [
    ("constant", const_cols),
    ("near-zero variance", nzv_cols),
    ("id-like", id_like_cols),
]
counts = {label: len(table) for label, table in flagged_tables}

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=list(counts), y=list(counts.values()), palette="flare", ax=ax)
ax.set_title("counts of removed or flagged features")
ax.set_ylabel("number of features")
fig.tight_layout()
plt.show()

In [None]:
# Keep the first 15 rows of the pair list.
# NOTE(review): presumably high_corr is already sorted by correlation strength,
# so these are the strongest pairs - confirm against the file that produces it.
top_corr = high_corr.head(15)

# Pivot the pair list into a feature_a x feature_b grid (not symmetric: rows are
# feature_a, columns are feature_b). Combinations that are not among the selected
# pairs are left as NaN rather than filled with 0: their correlation is simply
# not listed here, and a 0 would be annotated as "0.00" and stretch the colour
# scale down to zero.
matrix = top_corr.pivot(index="feature_a", columns="feature_b", values="pearson_r")

# Create a heatmap; seaborn masks NaN cells automatically, so only the listed
# pairs are coloured and annotated.
plt.figure(figsize=(8, 6))
sns.heatmap(matrix, annot=True, cmap="coolwarm", cbar=True, square=True, fmt=".2f", linewidths=0.5)
plt.title("highly correlated feature pairs (heatmap)")
plt.xlabel("feature b")
plt.ylabel("feature a")
plt.tight_layout()
plt.show()