In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop

In [2]:
# Load the preprocessed dataset
data = pd.read_csv("wgan_dataset.csv")

# Convert the dataset to a numpy array
data_array = data.to_numpy()

In [3]:
data.shape

(11430, 47)

In [4]:
data['status'].value_counts()

status
0    5715
1    5715
Name: count, dtype: int64

In [5]:
def gradient_penalty(critic, real_samples, fake_samples):
    alpha = tf.random.uniform((real_samples.shape[0], 1), 0.0, 1.0)
    interpolates = alpha * real_samples + (1 - alpha) * fake_samples
    with tf.GradientTape() as tape:
        tape.watch(interpolates)
        critic_output = critic(interpolates)
    gradients = tape.gradient(critic_output, interpolates)
    gradient_norm = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=1))
    penalty = tf.reduce_mean((gradient_norm - 1.0) ** 2)
    return penalty


In [6]:
def build_generator(input_dim, output_dim):
    input_layer = Input(shape=(input_dim,))
    x = Dense(128)(input_layer)
    x = LeakyReLU(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(256)(x)
    x = LeakyReLU(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(output_dim, activation='tanh')(x)
    return Model(input_layer, x)

In [7]:
def build_critic(input_dim):
    input_layer = Input(shape=(input_dim,))
    x = Dense(256)(input_layer)
    x = LeakyReLU(0.2)(x)
    x = Dense(128)(x)
    x = LeakyReLU(0.2)(x)
    x = Dense(1)(x)
    return Model(input_layer, x)

In [8]:
# Hyperparameters
input_dim = 100  # Noise dimension for generator
output_dim = data_array.shape[1]  # Number of features
batch_size = 64
epochs = 10000
critic_steps = 5  # Number of critic updates per generator update
gp_weight = 10  # Gradient penalty weight

# Initialize generator and critic
generator = build_generator(input_dim, output_dim)
critic = build_critic(output_dim)

# Optimizers
critic_optimizer = RMSprop(learning_rate=0.00005)
generator_optimizer = RMSprop(learning_rate=0.00005)

# Training loop
for epoch in range(epochs):
    for _ in range(critic_steps):
        # Train the critic
        real_samples = tf.convert_to_tensor(data_array[np.random.randint(0, data_array.shape[0], batch_size)], dtype=tf.float32)
        noise = tf.random.normal((batch_size, input_dim))
        fake_samples = generator(noise)
        with tf.GradientTape() as tape:
            real_output = critic(real_samples)
            fake_output = critic(fake_samples)
            gp = gradient_penalty(critic, real_samples, fake_samples)
            critic_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output) + gp_weight * gp
        grads = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(grads, critic.trainable_variables))

    # Train the generator
    noise = tf.random.normal((batch_size, input_dim))
    with tf.GradientTape() as tape:
        fake_samples = generator(noise)
        fake_output = critic(fake_samples)
        generator_loss = -tf.reduce_mean(fake_output)
    grads = tape.gradient(generator_loss, generator.trainable_variables)
    generator_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    # Print losses
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Critic Loss: {critic_loss.numpy()}, Generator Loss: {generator_loss.numpy()}")


Epoch 0, Critic Loss: 2.290828227996826, Generator Loss: 0.18195420503616333
Epoch 100, Critic Loss: -3.5545647144317627, Generator Loss: -2.051039218902588
Epoch 200, Critic Loss: -2.054831027984619, Generator Loss: -3.58658504486084
Epoch 300, Critic Loss: -1.7173582315444946, Generator Loss: -2.700023651123047
Epoch 400, Critic Loss: -1.3475443124771118, Generator Loss: -2.3401126861572266
Epoch 500, Critic Loss: -1.3236554861068726, Generator Loss: -2.4573609828948975
Epoch 600, Critic Loss: -1.17025625705719, Generator Loss: -2.4853532314300537
Epoch 700, Critic Loss: -1.01381516456604, Generator Loss: -2.61824631690979
Epoch 800, Critic Loss: -0.8775917887687683, Generator Loss: -2.6018965244293213
Epoch 900, Critic Loss: -0.8461257815361023, Generator Loss: -2.7395362854003906
Epoch 1000, Critic Loss: -0.6964156031608582, Generator Loss: -2.856597661972046
Epoch 1100, Critic Loss: -0.7628809809684753, Generator Loss: -2.728287696838379
Epoch 1200, Critic Loss: -0.694318115711212

In [9]:
# Generate synthetic data
num_samples = 5000  # Number of synthetic samples to generate
noise = tf.random.normal((num_samples, input_dim))
synthetic_data = generator(noise).numpy()

In [30]:
# Save the synthetic dataset
synthetic_df = pd.DataFrame(synthetic_data, columns=data.columns)
# synthetic_df.to_csv("synthetic_dataset1.csv", index=False)

In [31]:
# synthetic_df.head()
synthetic_df.head()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_slash,nb_semicolumn,...,safe_anchor,empty_title,domain_in_title,domain_with_copyright,domain_registration_length,domain_age,dns_record,google_index,page_rank,status
0,-0.975577,0.0,0.968555,-0.905084,-0.948305,-0.997928,-0.944212,-0.999067,-0.843977,-0.991314,...,0.844074,-1.0,-0.913613,-0.996481,-0.995709,-0.759644,-0.998164,1.0,-0.628078,0.891679
1,-0.999013,0.0,-1.0,-0.91654,-0.978985,-1.0,-0.993636,-0.99985,-0.855409,-0.999789,...,-0.95241,-1.0,-1.0,-0.999998,-0.99971,0.671134,-0.999999,-0.999998,0.107136,-0.282725
2,-0.99147,0.0,-1.0,-0.951744,-0.99811,-1.0,-0.999888,-0.999661,-0.953221,-0.999966,...,0.390918,-1.0,1.0,1.0,-0.999413,-0.784167,-1.0,-1.0,-0.082629,-0.007084
3,-0.846905,0.0,0.679958,-0.14043,-0.949178,-0.995959,-0.848644,-0.606338,-0.706675,-0.903564,...,0.171343,-0.995098,0.999997,-0.913248,-0.991559,0.089532,-0.996767,0.999999,-0.638453,0.940375
4,-0.563017,0.0,-1.0,-0.97669,-0.955488,-0.992141,-0.958409,-0.999761,-0.768809,-0.9985,...,-0.997128,0.484423,1.0,-1.0,-0.999732,-0.747233,-0.999977,1.0,-0.919935,0.999957


In [42]:
synthetic_df['status'] = synthetic_df['status'].apply(lambda x: 1 if x >= 0.5 else 0)

In [43]:
synthetic_df['status'].value_counts()
# synthetic_df['status'].value_counts()

status
1    2538
0    2462
Name: count, dtype: int64

In [44]:
synthetic_df.columns

Index(['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at',
       'nb_qm', 'nb_and', 'nb_slash', 'nb_semicolumn', 'nb_www', 'nb_com',
       'https_token', 'ratio_digits_url', 'ratio_digits_host',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'shortening_service', 'length_words_raw',
       'shortest_word_host', 'longest_words_raw', 'longest_word_host',
       'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints',
       'suspecious_tld', 'statistical_report', 'nb_hyperlinks',
       'ratio_intHyperlinks', 'ratio_extRedirection', 'external_favicon',
       'links_in_tags', 'ratio_intMedia', 'ratio_extMedia', 'safe_anchor',
       'empty_title', 'domain_in_title', 'domain_with_copyright',
       'domain_registration_length', 'domain_age', 'dns_record',
       'google_index', 'page_rank', 'status'],
      dtype='object')

In [45]:
# Save the synthetic dataset
# synthetic_df = pd.DataFrame(synthetic_data, columns=data.columns)
synthetic_df.to_csv("synthetic_dataset1.csv", index=False)

In [46]:
# Combine with original data
combined_data = pd.concat([data, synthetic_df], ignore_index=True)

In [47]:
combined_data.shape

(16430, 47)

In [48]:
combined_data.head()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_slash,nb_semicolumn,...,safe_anchor,empty_title,domain_in_title,domain_with_copyright,domain_registration_length,domain_age,dns_record,google_index,page_rank,status
0,-0.969306,-0.857143,-1.0,-0.826087,-1.0,-1.0,-1.0,-1.0,-0.935484,-1.0,...,-1.0,-1.0,-1.0,1.0,-0.996916,-0.998293,1.0,1.0,-0.2,0
1,-0.920196,-0.819048,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.806452,-1.0,...,1.0,-1.0,1.0,-1.0,-0.99477,-0.103058,-1.0,1.0,-0.6,1
2,-0.860037,-0.561905,1.0,-0.73913,-0.953488,-1.0,-0.333333,-0.789474,-0.806452,-1.0,...,1.0,-1.0,1.0,-1.0,-0.998994,-0.376688,-1.0,1.0,-1.0,1
3,-0.992634,-0.933333,-1.0,-0.913043,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.25,-1.0,1.0,-1.0,-0.995776,-0.998293,-1.0,-1.0,-0.4,0
4,-0.947207,-0.895238,-1.0,-0.913043,-0.906977,-1.0,-1.0,-1.0,-0.806452,-1.0,...,-1.0,-1.0,-1.0,1.0,-0.984915,0.270681,-1.0,-1.0,0.2,0


In [50]:
combined_data['status'].value_counts()

status
1    8253
0    8177
Name: count, dtype: int64

In [52]:
# Save the combined dataset to CSV
combined_data.to_csv('combined_dataset1.csv', index=False)