# In [None]:
import os


class DDoSDataset(Dataset):
    """Map-style dataset that pairs each feature row with its class label.

    Args:
        features: Array-like of per-sample feature values; converted with
            ``torch.FloatTensor``.
        labels: Array-like of integer class labels; converted with
            ``torch.LongTensor``.
    """

    def __init__(self, features, labels):
        feature_tensor = torch.FloatTensor(features)
        label_tensor = torch.LongTensor(labels)
        self.features = feature_tensor
        self.labels = label_tensor

    def __len__(self):
        """Return the number of stored labels (one per sample)."""
        return self.labels.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

class DDoSTransformer(nn.Module):
    """Transformer-encoder classifier that emits two logits per sample.

    Args:
        input_dim: Number of input features per sample (or per sequence step).
        num_heads: Attention heads in every encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Width of the model (used as ``d_model``); the encoder
            layers' internal feed-forward width is ``dim_feedforward * 2``.
        dropout: Dropout probability used throughout the network.
    """

    def __init__(
        self, input_dim, num_heads=4, num_layers=2, dim_feedforward=128, dropout=0.1
    ):
        super().__init__()

        self.input_projection = nn.Linear(input_dim, dim_feedforward)

        # Despite its name this is a Linear -> ReLU -> Dropout block, not a
        # positional encoding. The attribute name is kept so that existing
        # checkpoints (state_dict keys) still load.
        self.pos_encoder = nn.Sequential(
            nn.Linear(dim_feedforward, dim_feedforward), nn.ReLU(), nn.Dropout(dropout)
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_feedforward,
            nhead=num_heads,
            dim_feedforward=dim_feedforward * 2,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        self.output_layer = nn.Sequential(
            nn.Linear(dim_feedforward, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2),
        )

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, input_dim)`` or
                ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, 2)`` with unnormalised class scores.
        """
        # A 2-D tensor handed to the encoder is interpreted as ONE unbatched
        # sequence, which would let the samples of a batch attend to each
        # other and make every prediction depend on batch composition.
        # Give each sample its own length-1 sequence instead.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        x = self.input_projection(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)

        # Average over the sequence axis (a no-op reduction for length-1 input).
        x = x.mean(dim=1)
        return self.output_layer(x)


def discover_datasets(data_dir="datasets", patterns=None):
    """
    Automatically discover datasets in the specified directory

    Files are visited pattern by pattern in sorted order, so the result is
    deterministic. When two files share a base name (e.g. ``a.csv`` and
    ``a.parquet``) the first keeps the plain name and later ones are stored
    under ``<name>_<extension>`` instead of silently replacing it.

    Args:
        data_dir: Directory to search for datasets (created if missing)
        patterns: List of file patterns to match (e.g., ["*.csv", "*.parquet"])

    Returns:
        Dictionary with dataset names as keys and file paths as values
    """
    if patterns is None:
        patterns = ["*.csv", "*.parquet", "*.pkl", "*.pickle"]

    dataset_paths = {}

    if not os.path.exists(data_dir):
        print(f"Dataset directory '{data_dir}' not found. Creating it.")
        os.makedirs(data_dir, exist_ok=True)
        return dataset_paths

    seen_files = set()
    for pattern in patterns:
        # Escape the directory so characters such as "[" in its name are not
        # treated as glob syntax; only `pattern` should act as a wildcard.
        search_pattern = os.path.join(glob.escape(data_dir), pattern)

        # glob returns matches in arbitrary order; sort for reproducible runs.
        for file_path in sorted(glob.glob(search_pattern)):
            # A file can match several patterns; directories are never datasets.
            if file_path in seen_files or not os.path.isfile(file_path):
                continue
            seen_files.add(file_path)

            stem, extension = os.path.splitext(os.path.basename(file_path))
            dataset_name = stem
            if dataset_name in dataset_paths:
                suffix = extension.lstrip(".") or "dup"
                dataset_name = f"{stem}_{suffix}"
                counter = 2
                while dataset_name in dataset_paths:
                    dataset_name = f"{stem}_{suffix}_{counter}"
                    counter += 1
                print(
                    f"Warning: dataset name '{stem}' already used by "
                    f"{dataset_paths[stem]}; registering {file_path} as '{dataset_name}'"
                )
            dataset_paths[dataset_name] = file_path

    if not dataset_paths:
        print(f"No datasets found in '{data_dir}' matching patterns: {patterns}")
    else:
        print(f"Discovered {len(dataset_paths)} datasets: {list(dataset_paths.keys())}")

    return dataset_paths


def load_datasets(dataset_paths, label_column="Label", feature_columns=None):
    """
    Load multiple datasets from the provided paths

    Each file is read with pandas, reduced to numeric feature columns, cleaned
    of rows with missing labels or NaN/inf features, and its labels are encoded
    as int64. A dataset that cannot be loaded is reported and skipped rather
    than aborting the whole run.

    Args:
        dataset_paths: Dictionary with dataset names as keys and file paths as values
        label_column: Name of the label column in the datasets
        feature_columns: Specific feature columns to use (None for all)

    Returns:
        Dictionary with dataset names as keys and (features, labels) tuples as values;
        features are float64 arrays, labels are int64 arrays
    """
    datasets = {}

    for dataset_name, file_path in dataset_paths.items():
        print(f"Loading dataset: {dataset_name} from {file_path}")

        try:
            # Compare the extension case-insensitively so "DATA.CSV" also loads.
            extension = os.path.splitext(str(file_path))[1].lower()
            if extension == ".csv":
                data = pd.read_csv(file_path)
            elif extension == ".parquet":
                data = pd.read_parquet(file_path)
            elif extension in (".pkl", ".pickle"):
                # NOTE(security): unpickling can execute arbitrary code; only
                # point this at files from a trusted source.
                data = pd.read_pickle(file_path)
            else:
                print(f"Unsupported file format for {file_path}. Skipping.")
                continue

            if label_column not in data.columns:
                print(f"Label column '{label_column}' not found in {dataset_name}. Skipping.")
                continue

            if feature_columns is not None:
                missing_cols = [col for col in feature_columns if col not in data.columns]
                if missing_cols:
                    print(f"Warning: Columns {missing_cols} not found in {dataset_name}")
                available_cols = [col for col in feature_columns if col in data.columns]
                feature_frame = data[available_cols]
            else:
                feature_frame = data.drop(label_column, axis=1)

            # Text columns (IPs, timestamps, ...) cannot be scaled or fed to the
            # model, so keep only numeric/boolean features.
            numeric_frame = feature_frame.select_dtypes(include=["number", "bool"])
            dropped_cols = [
                col for col in feature_frame.columns if col not in numeric_frame.columns
            ]
            if dropped_cols:
                print(f"Warning: Dropping non-numeric columns {dropped_cols} from {dataset_name}")
            if numeric_frame.shape[1] == 0:
                print(f"No numeric feature columns available in {dataset_name}. Skipping.")
                continue

            X = numeric_frame.to_numpy(dtype=np.float64, na_value=np.nan)
            y = data[label_column].to_numpy()

            # NaN/inf features break scaling and training; unlabeled rows are unusable.
            keep_rows = np.isfinite(X).all(axis=1) & ~pd.isna(y)
            num_removed = int((~keep_rows).sum())
            if num_removed:
                print(
                    f"Warning: Dropping {num_removed} rows with missing labels or "
                    f"non-finite features from {dataset_name}"
                )
                X = X[keep_rows]
                y = y[keep_rows]
            if X.shape[0] == 0:
                print(f"No usable rows left in {dataset_name}. Skipping.")
                continue

            if set(np.unique(y).tolist()) <= {0, 1}:
                # Already binary (possibly stored as float or bool): normalise dtype.
                y = y.astype(np.int64)
            else:
                print(f"Converting labels for {dataset_name} to binary format")
                # Sorted unique labels are numbered 0..k-1 in one vectorised pass.
                # NOTE(review): the numbering is per dataset, so the same integer
                # may denote different classes in different datasets - confirm
                # label vocabularies match before cross-dataset evaluation.
                classes, encoded = np.unique(y, return_inverse=True)
                y = encoded.reshape(-1).astype(np.int64)
                if len(classes) > 2:
                    print(
                        f"Warning: {dataset_name} has {len(classes)} label classes "
                        f"{classes.tolist()}; labels were encoded as 0..{len(classes) - 1}, "
                        "not as a binary target"
                    )

            datasets[dataset_name] = (X, y)
            print(f"Successfully loaded {dataset_name}: {X.shape[0]} samples, {X.shape[1]} features")
        except Exception as e:
            # Best effort by design: one unreadable file must not stop the others.
            print(f"Error loading {dataset_name}: {str(e)}")

    return datasets


# Feature alignment for cross-dataset compatibility
def align_features(datasets, feature_dim=None, strategy="pad_truncate"):
    """
    Bring every dataset to the same number of feature columns

    Narrow datasets are always right-padded with zero columns. Wide datasets
    are cut to their first ``feature_dim`` columns when ``strategy`` is
    "pad_truncate"; any other strategy value reduces them with a PCA fitted on
    that dataset alone.

    Args:
        datasets: Dictionary with dataset names as keys and (features, labels) tuples as values
        feature_dim: Target feature dimension (None to use the max dimension)
        strategy: Strategy to use for alignment ("pad_truncate" or "pca")

    Returns:
        Tuple ``(aligned_datasets, feature_dim)``; ``({}, 0)`` when no datasets are given
    """
    if not datasets:
        return {}, 0

    widths = [features.shape[1] for features, _ in datasets.values()]
    if feature_dim is None:
        feature_dim = max(widths)

    print(f"Aligning all datasets to {feature_dim} features using strategy: {strategy}")

    use_pad_truncate = strategy == "pad_truncate"
    aligned_datasets = {}

    for name, (X, y) in datasets.items():
        current_dim = X.shape[1]

        # Already the right width: pass the arrays through untouched.
        if current_dim == feature_dim:
            aligned_datasets[name] = (X, y)
            continue

        # Too narrow: zero-pad on the right regardless of strategy.
        if current_dim < feature_dim:
            if not use_pad_truncate:
                print(f"Cannot use PCA to increase dimensions. Padding {name} instead.")
            filler = np.zeros((X.shape[0], feature_dim - current_dim))
            widened = np.hstack((X, filler))
            if use_pad_truncate:
                print(f"Padded {name} from {current_dim} to {feature_dim} features")
            aligned_datasets[name] = (widened, y)
            continue

        # Too wide: keep the leading columns, or project with PCA.
        if use_pad_truncate:
            narrowed = X[:, :feature_dim]
            print(f"Truncated {name} from {current_dim} to {feature_dim} features")
        else:
            from sklearn.decomposition import PCA
            reducer = PCA(n_components=feature_dim)
            narrowed = reducer.fit_transform(X)
            print(f"Reduced {name} from {current_dim} to {feature_dim} features using PCA")
        aligned_datasets[name] = (narrowed, y)

    return aligned_datasets, feature_dim


def create_train_test_combinations(datasets, test_size=0.2, random_state=42, cross_dataset=True):
    """
    Build the list of train/test experiments for the given datasets

    Every dataset contributes one stratified internal split. When
    ``cross_dataset`` is true and more than one dataset exists, every ordered
    pair of distinct datasets adds a "train on one, test on the other"
    experiment that uses both datasets in full.

    Args:
        datasets: Dictionary with dataset names as keys and (features, labels) tuples as values
        test_size: Proportion to use for testing when splitting individual datasets
        random_state: Random seed for reproducibility
        cross_dataset: Whether to create cross-dataset combinations

    Returns:
        List of dictionaries with train/test dataset configurations
    """

    def _describe(features, targets, source, split_label):
        # One side (train or test) of an experiment.
        return {
            'X': features,
            'y': targets,
            'dataset': source,
            'split': split_label
        }

    combinations = []

    # Within-dataset experiments: stratified hold-out split of each dataset.
    for dataset_name, (X, y) in datasets.items():
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        combinations.append({
            'name': f"{dataset_name}_internal_split",
            'train': _describe(X_train, y_train, dataset_name, 'train'),
            'test': _describe(X_test, y_test, dataset_name, 'test')
        })

    if not cross_dataset or len(datasets) <= 1:
        return combinations

    # Cross-dataset experiments: every ordered (train, test) pair.
    for train_name, test_name in itertools.permutations(list(datasets.keys()), 2):
        train_features, train_targets = datasets[train_name]
        test_features, test_targets = datasets[test_name]
        combinations.append({
            'name': f"train_{train_name}_test_{test_name}",
            'train': _describe(train_features, train_targets, train_name, 'full'),
            'test': _describe(test_features, test_targets, test_name, 'full')
        })

    return combinations

def prepare_dataset_combination(combination, batch_size=32):
    """
    Turn one train/test configuration into scaled data loaders

    The scaler is fitted on the training features only and then applied to
    both splits. The training loader shuffles; the test loader does not.

    Args:
        combination: Dictionary with train/test dataset configuration
        batch_size: Batch size for DataLoader

    Returns:
        Dictionary with prepared data loaders and related information
    """
    train_part = combination['train']
    X_train, y_train = train_part['X'], train_part['y']
    test_part = combination['test']
    X_test, y_test = test_part['X'], test_part['y']

    # Fit on train only so no test statistics leak into the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    train_loader = DataLoader(
        DDoSDataset(X_train_scaled, y_train), batch_size=batch_size, shuffle=True
    )
    test_loader = DataLoader(
        DDoSDataset(X_test_scaled, y_test), batch_size=batch_size, shuffle=False
    )

    return {
        'name': combination['name'],
        'train_loader': train_loader,
        'test_loader': test_loader,
        'scaler': scaler,
        'input_dim': X_train.shape[1],
        'train_info': combination['train'],
        'test_info': combination['test'],
        'X_test': X_test_scaled,
        'y_test': y_test
    }

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``. The loss is NaN and the
        accuracy 0.0 when the loader yields no batches.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    total = 0
    num_batches = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for batch_features, batch_labels in loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
            num_batches += 1

    mean_loss = loss_sum / num_batches if num_batches else float("nan")
    accuracy = 100.0 * correct / total if total else 0.0
    return mean_loss, accuracy


def train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device="cuda",
    experiment_name="unnamed", early_stopping_patience=5, model_dir="models"
):
    """Train ``model`` with early stopping and checkpoint the best weights.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation loss improves, the state dict is written to a timestamped file
    in ``model_dir``. Training stops early after ``early_stopping_patience``
    consecutive epochs without improvement. The per-epoch loss/accuracy history
    is written next to the checkpoint as JSON.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Iterable of ``(features, labels)`` training batches.
        val_loader: Iterable of ``(features, labels)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Maximum number of epochs.
        device: Device on which training runs.
        experiment_name: Label used in log messages and output file names.
        early_stopping_patience: Allowed epochs without validation improvement.
        model_dir: Directory for the checkpoint and history file.

    Returns:
        Path of the saved checkpoint. The file always exists on return: if the
        validation loss never improved (e.g. it was NaN, or ``num_epochs`` is
        0), the final weights are saved instead.
    """
    model = model.to(device)
    best_val_loss = float("inf")

    os.makedirs(model_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    best_model_path = os.path.join(model_dir, f"best_model_{experiment_name}_{timestamp}.pth")

    checkpoint_saved = False
    patience_counter = 0

    training_history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    for epoch in range(num_epochs):
        epoch_train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, device, optimizer
        )
        epoch_val_loss, val_accuracy = _run_epoch(model, val_loader, criterion, device)

        training_history['train_loss'].append(epoch_train_loss)
        training_history['train_acc'].append(train_accuracy)
        training_history['val_loss'].append(epoch_val_loss)
        training_history['val_acc'].append(val_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs} for {experiment_name}:")
        print(
            f"Train Loss: {epoch_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%"
        )
        print(
            f"Val Loss: {epoch_val_loss:.4f}, Accuracy: {val_accuracy:.2f}%"
        )

        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"Best model saved with validation loss: {best_val_loss:.4f}")
            patience_counter = 0
        else:
            # A NaN loss also lands here because NaN never compares as smaller.
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print(f"No improvement for {early_stopping_patience} epochs. Early stopping.")
                print(f"Early stopping triggered after {epoch + 1} epochs")
                break

    if not checkpoint_saved:
        # Callers load the returned path, so it must exist even when no epoch
        # produced an improving (finite) validation loss.
        print(
            f"Warning: validation loss never improved for {experiment_name}; "
            "saving the final model state instead."
        )
        torch.save(model.state_dict(), best_model_path)

    print(f"Training complete for {experiment_name}. Best model saved to '{best_model_path}'.")

    history_path = os.path.join(model_dir, f"training_history_{experiment_name}_{timestamp}.json")
    with open(history_path, 'w', encoding='utf-8') as f:
        json.dump(training_history, f)

    return best_model_path


def evaluate_model(model, test_loader, criterion, device="cuda"):
    """Evaluate a binary classifier on ``test_loader``.

    Args:
        model: Trained network; moved to ``device`` and put in eval mode.
        test_loader: Iterable of ``(features, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device on which inference runs.

    Returns:
        Dictionary with ``test_loss`` (mean of the per-batch losses),
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``confusion_matrix`` (always a 2x2 nested list over labels 0 and 1).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    # Make sure model and batches live on the same device even when the
    # caller did not move the model beforehand.
    model = model.to(device)
    model.eval()

    test_loss = 0.0
    num_batches = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)

            test_loss += loss.item()
            num_batches += 1
            _, predicted = outputs.max(1)

            all_predictions.extend(predicted.cpu().tolist())
            all_targets.extend(batch_labels.cpu().tolist())

    if not all_targets:
        raise ValueError("test_loader yielded no samples to evaluate")

    accuracy = accuracy_score(all_targets, all_predictions)
    precision = precision_score(all_targets, all_predictions, zero_division=0)
    recall = recall_score(all_targets, all_predictions, zero_division=0)
    f1 = f1_score(all_targets, all_predictions, zero_division=0)
    # Fix the label set so the matrix stays 2x2 even if only one class
    # occurs among the targets and predictions.
    conf_matrix = confusion_matrix(all_targets, all_predictions, labels=[0, 1])

    results = {
        'test_loss': test_loss / num_batches,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix.tolist()
    }

    return results


def _make_train_val_loaders(prepared_data, batch_size, val_fraction, seed=42):
    """Split the prepared training data into train and validation loaders.

    Falls back to ``(train_loader, test_loader)`` - validating on the test
    set - when ``val_fraction`` is not positive or the training set has fewer
    than two samples.
    """
    from torch.utils.data import random_split

    train_dataset = prepared_data['train_loader'].dataset
    num_samples = len(train_dataset)

    num_val = 0
    if val_fraction and val_fraction > 0:
        # Keep at least one sample on each side of the split.
        num_val = min(max(int(round(num_samples * val_fraction)), 1), num_samples - 1)

    if num_val < 1:
        print(
            "Warning: no validation split available; falling back to the test set "
            "for model selection, so reported test metrics will be optimistic."
        )
        return prepared_data['train_loader'], prepared_data['test_loader']

    # Seeded generator so the same samples are held out on every run.
    split_generator = torch.Generator().manual_seed(seed)
    train_subset, val_subset = random_split(
        train_dataset, [num_samples - num_val, num_val], generator=split_generator
    )
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader


def run_experiments(data_dir="datasets", num_epochs=100, device_str=None,
                   model_dir="models", results_dir="results", cross_dataset=True,
                   feature_alignment="pad_truncate", batch_size=32, val_fraction=0.1):
    """Discover datasets, train one model per train/test combination, and save the metrics.

    Args:
        data_dir: Directory searched for datasets; demo datasets are generated
            there when none are found.
        num_epochs: Maximum number of training epochs per experiment.
        device_str: Device name passed to ``torch.device``; None selects CUDA
            when available, otherwise CPU.
        model_dir: Directory for model checkpoints and training histories.
        results_dir: Directory for the JSON file with all results.
        cross_dataset: Whether to include train-on-A / test-on-B experiments.
        feature_alignment: Strategy passed to ``align_features``.
        batch_size: Batch size of all data loaders.
        val_fraction: Share of each training set held out for early stopping
            and checkpoint selection, so the test set is only used for the
            final evaluation. A value of 0 or None validates on the test set
            instead.

    Returns:
        Dictionary mapping experiment name to its results; empty when no
        dataset could be loaded.
    """
    if device_str is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device_str)

    print(f"Using device: {device}")

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    dataset_paths = discover_datasets(data_dir)
    if not dataset_paths:
        print("No datasets were discovered. Creating demo datasets for testing.")
        os.makedirs(data_dir, exist_ok=True)
        dataset_paths = create_demo_datasets(num_datasets=3, samples_per_dataset=1000, output_dir=data_dir)

    datasets = load_datasets(dataset_paths)
    if not datasets:
        print("No datasets were successfully loaded. Exiting.")
        # Return an empty mapping (not None) so callers can iterate the result.
        return {}

    aligned_datasets, common_feature_dim = align_features(datasets, strategy=feature_alignment)

    combinations = create_train_test_combinations(aligned_datasets, cross_dataset=cross_dataset)
    print(f"Created {len(combinations)} train/test combinations")

    all_results = {}

    for combination in combinations:
        experiment_name = combination['name']
        print(f"\n=== Running experiment: {experiment_name} ===")
        print(f"Training on: {combination['train']['dataset']} ({combination['train']['split']})")
        print(f"Testing on: {combination['test']['dataset']} ({combination['test']['split']})")

        prepared_data = prepare_dataset_combination(combination, batch_size=batch_size)

        # Select the best checkpoint on held-out TRAINING data; choosing it on
        # the test loader would leak test information into the reported scores.
        train_loader, val_loader = _make_train_val_loaders(
            prepared_data, batch_size, val_fraction
        )

        input_dim = prepared_data['input_dim']
        model = DDoSTransformer(input_dim=input_dim)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        best_model_path = train_model(
            model,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            num_epochs=num_epochs,
            device=device,
            experiment_name=experiment_name,
            model_dir=model_dir
        )

        # map_location keeps loading robust when the checkpoint was written
        # from a different device than the one used now.
        model.load_state_dict(torch.load(best_model_path, map_location=device))

        # Evaluate model
        evaluation_results = evaluate_model(model, prepared_data['test_loader'], criterion, device=device)

        print(f"\nResults for {experiment_name}:")
        print(f"Accuracy: {evaluation_results['accuracy'] * 100:.2f}%")
        print(f"Precision: {evaluation_results['precision'] * 100:.2f}%")
        print(f"Recall: {evaluation_results['recall'] * 100:.2f}%")
        print(f"F1 Score: {evaluation_results['f1_score'] * 100:.2f}%")

        # Store results
        all_results[experiment_name] = {
            'evaluation': evaluation_results,
            'best_model_path': best_model_path,
            'train_dataset': combination['train']['dataset'],
            'train_split': combination['train']['split'],
            'test_dataset': combination['test']['dataset'],
            'test_split': combination['test']['split'],
            'input_dim': input_dim
        }

    # Save all results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_path = os.path.join(results_dir, f"experiment_results_{timestamp}.json")

    # Convert numpy values to native Python types for JSON serialization
    serializable_results = {}
    for experiment_name, results in all_results.items():
        evaluation = results['evaluation']
        serializable_results[experiment_name] = {
            'evaluation': {
                'test_loss': float(evaluation['test_loss']),
                'accuracy': float(evaluation['accuracy']),
                'precision': float(evaluation['precision']),
                'recall': float(evaluation['recall']),
                'f1_score': float(evaluation['f1_score']),
                'confusion_matrix': evaluation['confusion_matrix']
            },
            'best_model_path': results['best_model_path'],
            'train_dataset': results['train_dataset'],
            'train_split': results['train_split'],
            'test_dataset': results['test_dataset'],
            'test_split': results['test_split'],
            'input_dim': int(results['input_dim'])
        }

    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(serializable_results, f, indent=4)

    print(f"\nAll experiment results saved to {results_path}")
    return all_results


# Function to create a demonstration with dummy data if no real datasets are available
def create_demo_datasets(num_datasets=3, samples_per_dataset=1000, output_dir="datasets", seed=None):
    """Create demo datasets for testing purposes

    Each dataset is a CSV with 20-49 standard-normal feature columns
    (``feature_0`` ...) and an integer ``Label`` column in which roughly
    10-30% of the rows are 1. Labels are random and unrelated to the features.

    Args:
        num_datasets: Number of CSV files to write.
        samples_per_dataset: Number of rows per file.
        output_dir: Directory receiving the files (created if missing).
        seed: Optional seed for reproducible output; None draws from NumPy's
            global random state.

    Returns:
        Dictionary mapping ``"Dataset_<i>"`` to the path of ``dataset_<i>.csv``.
        NOTE: the keys are capitalised while the file stems are lower-case.
    """
    dataset_paths = {}
    os.makedirs(output_dir, exist_ok=True)

    # RandomState exposes the same sampling functions as the np.random module,
    # so the generation code below works unchanged for both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    for i in range(1, num_datasets + 1):
        # Create random feature dimensions to simulate different datasets
        feature_dim = rng.randint(20, 50)

        # Generate random data
        X = rng.randn(samples_per_dataset, feature_dim)

        # Generate labels with some class imbalance
        imbalance_ratio = rng.uniform(0.1, 0.3)  # 10-30% minority class
        num_positive = int(samples_per_dataset * imbalance_ratio)
        y = np.zeros(samples_per_dataset)
        y[:num_positive] = 1
        rng.shuffle(y)

        cols = [f"feature_{j}" for j in range(feature_dim)]
        df = pd.DataFrame(X, columns=cols)
        df['Label'] = y.astype(int)

        filename = os.path.join(output_dir, f"dataset_{i}.csv")
        df.to_csv(filename, index=False)

        dataset_paths[f"Dataset_{i}"] = filename

    print(f"Created {num_datasets} demo datasets in {output_dir}")
    return dataset_paths


if __name__ == "__main__":
    # Run every within-dataset and cross-dataset experiment with the
    # script's default settings.
    results = run_experiments(
        data_dir="datasets",
        num_epochs=50,
        model_dir="models",
        results_dir="results",
        cross_dataset=True,
        feature_alignment="pad_truncate",
        batch_size=32
    )

    # run_experiments yields nothing usable when no dataset could be loaded;
    # guard so the summary does not fail on a missing result.
    if not results:
        print("\nNo experiment results to summarize.")
    else:
        print("\nSummary of cross-dataset performance:")
        for experiment_name, experiment_results in results.items():
            # Only experiments that train and test on different datasets.
            if experiment_results['train_dataset'] != experiment_results['test_dataset']:
                eval_results = experiment_results['evaluation']
                print(f"\n{experiment_name}:")
                print(f"  Train: {experiment_results['train_dataset']}")
                print(f"  Test: {experiment_results['test_dataset']}")
                print(f"  Accuracy: {eval_results['accuracy'] * 100:.2f}%")
                print(f"  F1 Score: {eval_results['f1_score'] * 100:.2f}%")