In [1]:
# --- Setup Environment ---

import sys
import os

# Add Code folder (where sarima_retraining.py is) to Python path
CODE_DIR = r"X:\\DataSet\\Code"
DATA_DIR = r"X:\\DataSet\\Data"

if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

# Now you can import custom scripts like sarima_retraining.py
try:
    import sarima_retraining
    print("✅ Successfully imported sarima_retraining.py")
except ImportError as e:
    print(f"❌ Failed to import: {e}")

# Global path to dataset
print(f"📂 Data folder is set to: {DATA_DIR}")

# --- After this cell, you can proceed to run analyze-results.ipynb normally ---

✅ Successfully imported sarima_retraining.py
📂 Data folder is set to: X:\\DataSet\\Data


In [2]:
# Second cell: Perform the analysis after setting up paths and importing Sarina

import os
import pandas as pd
import matplotlib.pyplot as plt
import sarina  # This assumes sarina.py is in the correct folder

# Settings
DATASET_DIR = r"X:\DataSet"
DATA_DIR = os.path.join(DATASET_DIR, "Data")
INSTITUTION_DIR = os.path.join(DATASET_DIR, "institutions")
SUBNET_DIR = os.path.join(DATASET_DIR, "institution_subnets")

# Load the institution information
institution_df = sarina.load_institutions(INSTITUTION_DIR)
subnet_df = sarina.load_subnets(SUBNET_DIR)

# Load the dataset (sample or full depending on your needs)
all_files = []
for root, dirs, files in os.walk(DATA_DIR):
    for file in files:
        if file.endswith(".csv"):
            all_files.append(os.path.join(root, file))

print(f"Loaded {len(all_files)} CSV files.")

# Merge all CSVs into one DataFrame
list_df = []
for file in all_files:
    try:
        df = pd.read_csv(file)
        list_df.append(df)
    except Exception as e:
        print(f"Failed to load {file}: {e}")

data = pd.concat(list_df, ignore_index=True)
print(f"Merged dataset shape: {data.shape}")

# Prepare data
features = ['n_flows', 'n_packets', 'n_bytes']
data = data.dropna(subset=features)

# Run anomaly detection using Isolation Forest
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(data[features])

iso_model = IsolationForest(contamination=0.01, random_state=42)
iso_model.fit(X_scaled)

preds = iso_model.predict(X_scaled)

data['anomaly'] = preds

# Plot graphs
fig, axes = plt.subplots(len(features), 1, figsize=(15, 8), sharex=True)
if len(features) == 1:
    axes = [axes]

for idx, feat in enumerate(features):
    ax = axes[idx]
    ax.plot(data.index, data[feat], label=feat, color='blue')
    anomaly_idx = data[data['anomaly'] == -1].index
    ax.scatter(anomaly_idx, data.loc[anomaly_idx, feat], color='red', label='Anomaly', s=10)
    ax.set_ylabel(feat)
    ax.legend()
    ax.grid(True)

plt.xlabel('Index')
plt.suptitle('Isolation Forest Anomaly Detection on CESNET Dataset', fontsize=16)
plt.tight_layout()
plt.show()

print("✅ Analysis and plotting complete!")


ModuleNotFoundError: No module named 'sarina'