In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler


# Columns of the CSV that the GAN is trained on.
features = [
    'Machine', 'DebugSize', 'MajorImageVersion', 'ExportSize',
    'IatVRA', 'NumberOfSections', 'SizeOfStackReserve',
    'DllCharacteristics', 'ResourceSize', 'BitcoinAddresses',
]

# Load the dataset and keep only the selected feature columns.
df = pd.read_csv("data.csv")
df = df.loc[:, features]

# Rescale every feature with MinMaxScaler (default range); the fitted scaler
# is reused later to map generated samples back to original units.
scaler = MinMaxScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)

# Convert to PyTorch tensors
data = torch.tensor(df_scaled, dtype=torch.float32)

# --------------------------
# Define Generator
# --------------------------
class Generator(nn.Module):
    """Fully connected generator that maps latent noise to feature vectors.

    Layer stack: latent_dim -> 128 -> 256 -> feature_dim, with ReLU after the
    two hidden layers and a Sigmoid on the output, so every generated value
    lies in [0, 1].

    Args:
        latent_dim: Width of the input noise vector.
        feature_dim: Number of values produced per sample.
    """

    def __init__(self, latent_dim, feature_dim):
        super().__init__()
        stack = [
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, feature_dim),
            nn.Sigmoid(),
        ]
        # Stored as ``self.model`` so parameter names stay ``model.<idx>.*``.
        self.model = nn.Sequential(*stack)

    def forward(self, z):
        """Return the generated batch for the latent batch ``z``."""
        generated = self.model(z)
        return generated

# --------------------------
# Define Discriminator
# --------------------------
class Discriminator(nn.Module):
    """Fully connected discriminator that scores feature vectors.

    Layer stack: feature_dim -> 256 -> 128 -> 1, with LeakyReLU(0.2) after the
    two hidden layers and a Sigmoid on the output, so each score lies in
    [0, 1].

    Args:
        feature_dim: Number of values in each input sample.
    """

    def __init__(self, feature_dim):
        super().__init__()
        stack = [
            nn.Linear(feature_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        # Stored as ``self.model`` so parameter names stay ``model.<idx>.*``.
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return one score per row of ``x``."""
        score = self.model(x)
        return score

# --------------------------
# Initialize Models and Optimizers
# --------------------------
# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# latent sampling repeat from run to run (Restart Kernel -> Run All).
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_dim = 10
feature_dim = len(features)
G = Generator(latent_dim, feature_dim).to(device)
D = Discriminator(feature_dim).to(device)
optimizer_G = optim.Adam(G.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.Adam(D.parameters(), lr=0.0002, betas=(0.5, 0.999))
criterion = nn.BCELoss()

# --------------------------
# Train GAN
# --------------------------
n_epochs = 100
batch_size = 64
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

for epoch in range(n_epochs):
    for real_data in data_loader:
        real_data = real_data.to(device)
        # Size of *this* batch (the last one may be smaller). Kept in its own
        # name so the `batch_size` setting above is never overwritten.
        n_real = real_data.size(0)

        # Targets are created directly on the training device to avoid a
        # host-to-device copy on every step.
        real_labels = torch.ones(n_real, 1, device=device)
        fake_labels = torch.zeros(n_real, 1, device=device)

        # Train Discriminator: real rows -> 1, generated rows -> 0.
        optimizer_D.zero_grad()
        real_loss = criterion(D(real_data), real_labels)
        z = torch.randn(n_real, latent_dim, device=device)
        fake_data = G(z)
        # detach() keeps the discriminator update from back-propagating into G.
        fake_loss = criterion(D(fake_data.detach()), fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator: try to make D label the generated rows as real.
        optimizer_G.zero_grad()
        gen_loss = criterion(D(fake_data), real_labels)
        gen_loss.backward()
        optimizer_G.step()

    # Losses reported here are those of the last batch of the epoch.
    print(f"Epoch [{epoch+1}/{n_epochs}] | D Loss: {d_loss.item():.4f} | G Loss: {gen_loss.item():.4f}")

# Save the trained models
# The modules are moved to the CPU while being pickled so the files can be
# loaded on machines without CUDA, then moved back for the sampling below.
# The files still contain whole pickled modules, so loading them requires the
# Generator / Discriminator classes to be defined; only unpickle trusted files.
for path, model in (("generator.pkl", G), ("discriminator.pkl", D)):
    model.cpu()
    with open(path, "wb") as f:
        pickle.dump(model, f)
    model.to(device)

print("Models saved as generator.pkl and discriminator.pkl")

# Generate and Print Synthetic Data
# No gradients are needed for sampling, so skip building the autograd graph.
with torch.no_grad():
    z = torch.randn(10, latent_dim, device=device)
    generated_data = G(z).cpu().numpy()
print("Generated Data:")
print(scaler.inverse_transform(generated_data))


Epoch [1/100] | D Loss: 1.2242 | G Loss: 0.7832
Epoch [2/100] | D Loss: 2.2649 | G Loss: 0.6394
Epoch [3/100] | D Loss: 0.5141 | G Loss: 1.4556
Epoch [4/100] | D Loss: 0.8721 | G Loss: 1.0370
Epoch [5/100] | D Loss: 1.1507 | G Loss: 0.9059
Epoch [6/100] | D Loss: 1.1648 | G Loss: 1.0345
Epoch [7/100] | D Loss: 1.6258 | G Loss: 0.8151
Epoch [8/100] | D Loss: 0.9099 | G Loss: 1.0377
Epoch [9/100] | D Loss: 1.1266 | G Loss: 1.0647
Epoch [10/100] | D Loss: 1.0379 | G Loss: 1.1276
Epoch [11/100] | D Loss: 1.2596 | G Loss: 0.8560
Epoch [12/100] | D Loss: 1.0583 | G Loss: 0.9820
Epoch [13/100] | D Loss: 1.3213 | G Loss: 0.8313
Epoch [14/100] | D Loss: 1.2403 | G Loss: 0.8025
Epoch [15/100] | D Loss: 1.2772 | G Loss: 0.7735
Epoch [16/100] | D Loss: 1.2901 | G Loss: 0.8173
Epoch [17/100] | D Loss: 1.3058 | G Loss: 0.7530
Epoch [18/100] | D Loss: 1.2322 | G Loss: 0.7997
Epoch [19/100] | D Loss: 1.2193 | G Loss: 0.7995
Epoch [20/100] | D Loss: 1.0952 | G Loss: 0.9630
Epoch [21/100] | D Loss: 1.10

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# --------------------------
# Load and Prepare the Data
# --------------------------
# Columns fed to the GAN, and the label column kept alongside them.
features = [
    'Machine', 'DebugSize', 'MajorImageVersion', 'ExportSize',
    'IatVRA', 'NumberOfSections', 'SizeOfStackReserve',
    'DllCharacteristics', 'ResourceSize', 'BitcoinAddresses',
]
target = 'Benign'  # Define the target column

# Ensure the dataset is in the correct path
df = pd.read_csv("data.csv")
df = df.loc[:, [*features, target]]

# Fit the scaler on the feature columns only; the target is left unscaled.
scaler = MinMaxScaler()
scaler.fit(df[features])
df_scaled = scaler.transform(df[features])

# Convert to PyTorch tensors
data = torch.tensor(df_scaled, dtype=torch.float32)
labels = torch.tensor(df[target].to_numpy(), dtype=torch.float32)

# --------------------------
# Define Generator
# --------------------------
class Generator(nn.Module):
    """Fully connected generator that maps latent noise to feature vectors.

    Layer stack: latent_dim -> 128 -> 256 -> feature_dim, with ReLU after the
    two hidden layers and a Sigmoid on the output, so every generated value
    lies in [0, 1].

    Args:
        latent_dim: Width of the input noise vector.
        feature_dim: Number of values produced per sample.
    """

    def __init__(self, latent_dim, feature_dim):
        super().__init__()
        stack = [
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, feature_dim),
            nn.Sigmoid(),
        ]
        # Stored as ``self.model`` so parameter names stay ``model.<idx>.*``.
        self.model = nn.Sequential(*stack)

    def forward(self, z):
        """Return the generated batch for the latent batch ``z``."""
        generated = self.model(z)
        return generated

# --------------------------
# Define Discriminator
# --------------------------
class Discriminator(nn.Module):
    """Fully connected discriminator that scores feature vectors.

    Layer stack: feature_dim -> 256 -> 128 -> 1, with LeakyReLU(0.2) after the
    two hidden layers and a Sigmoid on the output, so each score lies in
    [0, 1].

    Args:
        feature_dim: Number of values in each input sample.
    """

    def __init__(self, feature_dim):
        super().__init__()
        stack = [
            nn.Linear(feature_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        # Stored as ``self.model`` so parameter names stay ``model.<idx>.*``.
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return one score per row of ``x``."""
        score = self.model(x)
        return score

# --------------------------
# Initialize Models and Optimizers
# --------------------------
# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# latent sampling repeat from run to run (Restart Kernel -> Run All). The same
# value is reused for the pandas / scikit-learn random_state arguments below.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_dim = 10
feature_dim = len(features)
G = Generator(latent_dim, feature_dim).to(device)
D = Discriminator(feature_dim).to(device)
optimizer_G = optim.Adam(G.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.Adam(D.parameters(), lr=0.0002, betas=(0.5, 0.999))
criterion = nn.BCELoss()

# --------------------------
# Train GAN
# --------------------------
n_epochs = 100
batch_size = 64
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

for epoch in range(n_epochs):
    for real_data in data_loader:
        real_data = real_data.to(device)
        # Size of *this* batch (the last one may be smaller). Kept in its own
        # name so the `batch_size` setting above is never overwritten.
        n_real = real_data.size(0)

        # Targets are created directly on the training device to avoid a
        # host-to-device copy on every step.
        real_labels = torch.ones(n_real, 1, device=device)
        fake_labels = torch.zeros(n_real, 1, device=device)

        # Train Discriminator: real rows -> 1, generated rows -> 0.
        optimizer_D.zero_grad()
        real_loss = criterion(D(real_data), real_labels)
        z = torch.randn(n_real, latent_dim, device=device)
        fake_data = G(z)
        # detach() keeps the discriminator update from back-propagating into G.
        fake_loss = criterion(D(fake_data.detach()), fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator: try to make D label the generated rows as real.
        optimizer_G.zero_grad()
        gen_loss = criterion(D(fake_data), real_labels)
        gen_loss.backward()
        optimizer_G.step()

    # Losses reported here are those of the last batch of the epoch.
    print(f"Epoch [{epoch+1}/{n_epochs}] | D Loss: {d_loss.item():.4f} | G Loss: {gen_loss.item():.4f}")

# Save the trained models
# The modules are moved to the CPU while being pickled so the files can be
# loaded on machines without CUDA, then moved back for the sampling below.
# The files still contain whole pickled modules, so loading them requires the
# Generator / Discriminator classes to be defined; only unpickle trusted files.
for path, model in (("generator.pkl", G), ("discriminator.pkl", D)):
    model.cpu()
    with open(path, "wb") as f:
        pickle.dump(model, f)
    model.to(device)

print("Models saved as generator.pkl and discriminator.pkl")

# --------------------------
# Generate Synthetic Data
# --------------------------
# One synthetic row per real row; no gradients are needed for sampling.
with torch.no_grad():
    z = torch.randn(len(df), latent_dim, device=device)
    generated_data = G(z).cpu().numpy()
generated_data = scaler.inverse_transform(generated_data)

# --------------------------
# Train ML Model on Synthetic Data
# --------------------------
df_generated = pd.DataFrame(generated_data, columns=features)
# NOTE(review): G is unconditional (its only input is the noise z), so the
# shuffled real labels attached here have no relationship to the synthetic
# rows they are paired with. A classifier fitted on these pairs cannot learn
# the feature -> label mapping; fixing this needs a label-aware generator
# (e.g. a conditional GAN or one GAN per class). Behaviour is left unchanged.
df_generated[target] = df[target].sample(frac=1, random_state=SEED).values  # Assign real labels

# Split Data
# NOTE(review): both splits come from the synthetic frame, so the metrics
# below do not measure performance on the real dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df_generated[features], df_generated[target], test_size=0.2, random_state=SEED
)

# Train RandomForest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate
print("\nMachine Learning Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the ML model
with open("ml_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

print("Machine learning model saved as ml_model.pkl")


Epoch [1/100] | D Loss: 1.0834 | G Loss: 1.0853
Epoch [2/100] | D Loss: 1.2615 | G Loss: 0.7242
Epoch [3/100] | D Loss: 1.0132 | G Loss: 1.0315
Epoch [4/100] | D Loss: 1.1721 | G Loss: 0.9152
Epoch [5/100] | D Loss: 1.0872 | G Loss: 0.9221
Epoch [6/100] | D Loss: 1.3243 | G Loss: 0.9060
Epoch [7/100] | D Loss: 0.9844 | G Loss: 1.0177
Epoch [8/100] | D Loss: 0.8376 | G Loss: 1.0926
Epoch [9/100] | D Loss: 0.9388 | G Loss: 1.1214
Epoch [10/100] | D Loss: 1.9248 | G Loss: 0.8697
Epoch [11/100] | D Loss: 0.8560 | G Loss: 1.0711
Epoch [12/100] | D Loss: 1.0466 | G Loss: 0.9560
Epoch [13/100] | D Loss: 1.1328 | G Loss: 0.8988
Epoch [14/100] | D Loss: 1.1131 | G Loss: 0.9054
Epoch [15/100] | D Loss: 1.2714 | G Loss: 0.7616
Epoch [16/100] | D Loss: 1.2759 | G Loss: 0.7979
Epoch [17/100] | D Loss: 1.1479 | G Loss: 0.8356
Epoch [18/100] | D Loss: 1.2984 | G Loss: 0.7895
Epoch [19/100] | D Loss: 1.1788 | G Loss: 0.9093
Epoch [20/100] | D Loss: 1.2161 | G Loss: 0.8694
Epoch [21/100] | D Loss: 1.10