In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report

In [2]:
def load_and_preprocess_user_data(file_path):
    """Load the boarding-demand CSV and return standardized train/test splits.

    Steps: derive an ``hour`` column from ``timestamp``, label-encode the
    categorical columns, build a binary target (1 when
    ``hourly_boarding_count`` > 10, else 0), split 80/20 with a fixed seed,
    then standardize the features.

    The scaler is fitted on the training rows only and merely applied to the
    test rows, so no test-set statistics leak into the training features.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler, encoders)`` where
        ``X_*`` are the scaled feature arrays, ``y_*`` the binary targets,
        ``scaler`` the fitted ``StandardScaler`` and ``encoders`` a dict that
        maps each categorical column name to its fitted ``LabelEncoder``.

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    # Load data
    data = pd.read_csv(file_path)

    categorical_columns = ['bus_route', 'station_id', 'weather_condition', 'day_of_week', 'card_type', 'boarding_type']
    features = ['hour', 'bus_route', 'station_id', 'weather_condition', 'day_of_week', 'holiday_flag', 'card_type', 'boarding_type']
    target = 'hourly_boarding_count'

    # Fail early with one message listing every missing column ('hour' is derived below)
    required = ['timestamp', target] + [col for col in features if col != 'hour']
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {file_path}: {missing}")

    # Extract hour from timestamp
    data['hour'] = pd.to_datetime(data['timestamp']).dt.hour

    # Encode categorical variables (encoders are returned so the mapping can be reused)
    encoders = {}
    for col in categorical_columns:
        encoders[col] = LabelEncoder()
        data[col] = encoders[col].fit_transform(data[col])

    # Select features and target
    X = data[features]
    y = (data[target] > 10).astype(int)  # Binary classification: high (>10) or low (<=10) boarding demand

    # Train-test split BEFORE scaling, so the scaler never sees the test rows
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize features using training-set statistics only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    return X_train, X_test, y_train, y_test, scaler, encoders

In [3]:
def build_generator(input_dim, output_dim, output_activation='tanh'):
    """Build the (uncompiled) GAN generator: noise vector -> synthetic sample.

    Architecture: Dense(128, relu) -> Dense(256, relu) -> Dense(output_dim).

    Args:
        input_dim: Length of the noise vector fed to the generator.
        output_dim: Number of values in each generated sample.
        output_activation: Activation of the last layer. The default 'tanh'
            keeps the original behaviour; note that tanh confines every output
            to (-1, 1), so pass e.g. 'linear' when the real data is not
            limited to that range (standardized features usually are not).

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = models.Sequential([
        # Explicit Input layer instead of the deprecated `input_dim=` kwarg on Dense
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(output_dim, activation=output_activation)
    ])
    return model

In [4]:
def build_discriminator(input_dim):
    """Build the (uncompiled) GAN discriminator: sample -> sigmoid score.

    Architecture: Dense(256, relu) -> Dense(128, relu) -> Dense(1, sigmoid).
    The caller is responsible for compiling the returned model.

    Args:
        input_dim: Number of values in each input sample.

    Returns:
        An uncompiled ``Sequential`` model with a single sigmoid output.
    """
    model = models.Sequential([
        # Explicit Input layer instead of the deprecated `input_dim=` kwarg on Dense
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

In [6]:
def build_dnn(input_dim):
    """Build and compile the binary classifier.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model ready for ``fit``.
    """
    model = models.Sequential([
        # Explicit Input layer instead of the deprecated `input_dim=` kwarg on Dense
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
def save_model(model, file_path):
    """Persist ``model`` by delegating to its own ``save`` method.

    Args:
        model: Any object exposing ``save(path)``.
        file_path: Destination handed unchanged to ``model.save``.
    """
    persist = model.save
    persist(file_path)

In [9]:
def train_gan(generator, discriminator, gan, X_train, epochs=10000, batch_size=128):
    """Alternate discriminator and generator updates for ``epochs`` iterations.

    Each iteration:
      1. trains the discriminator on a random half-batch of real rows
         (target 1) and on a half-batch of generated rows (target 0);
      2. trains the generator through ``gan`` on a full batch of noise with
         target 1.
    Progress is printed every 500 iterations.

    Args:
        generator: Model exposing ``predict``; maps noise to samples.
        discriminator: Compiled model exposing ``train_on_batch``.
        gan: Compiled stacked model exposing ``train_on_batch``.
        X_train: 2-D array-like of real samples. The noise vectors are drawn
            with ``X_train.shape[1]`` components, so the generator must accept
            inputs of that width.
        epochs: Number of training iterations (one batch each).
        batch_size: Generator batch size; the discriminator sees half of it
            per real/fake update.

    Returns:
        None.
    """
    # Positional fancy indexing below needs an ndarray (a DataFrame would fail)
    X_train = np.asarray(X_train)
    n_samples, latent_dim = X_train.shape[0], X_train.shape[1]

    # batch_size=1 would otherwise yield an empty discriminator batch
    half_batch = max(1, batch_size // 2)

    for epoch in range(epochs):
        # Train discriminator
        idx = np.random.randint(0, n_samples, half_batch)
        real_data = X_train[idx]
        latent = np.random.normal(0, 1, (half_batch, latent_dim))
        # verbose=0: avoid printing a progress bar on every one of the iterations
        fake_data = generator.predict(latent, verbose=0)

        d_loss_real = discriminator.train_on_batch(real_data, np.ones((half_batch, 1)))
        d_loss_fake = discriminator.train_on_batch(fake_data, np.zeros((half_batch, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

        # Print progress
        if epoch % 500 == 0:
            # train_on_batch returns a scalar without metrics and a list with
            # metrics; ravel()[0] picks the loss in both cases.
            d_loss_value = float(np.ravel(d_loss)[0])
            g_loss_value = float(np.ravel(g_loss)[0])
            print(f"Epoch {epoch}/{epochs} - D loss: {d_loss_value:.4f} - G loss: {g_loss_value:.4f}")


In [10]:
if __name__ == "__main__":
    # Seed NumPy and TensorFlow so noise sampling and weight initialisation
    # are repeatable from one run to the next.
    SEED = 42
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Load and preprocess user-provided dataset
    dataset_path = "expanded_bus_boarding_demand_dataset.csv"
    X_train, X_test, y_train, y_test, scaler, encoders = load_and_preprocess_user_data(dataset_path)
    y_train_arr = np.asarray(y_train).ravel()

    # GAN setup
    # NOTE(review): the generator ends in tanh, so synthetic features are
    # confined to (-1, 1) while the standardized real features are not. That
    # makes the discriminator's job trivial (D loss -> 0, G loss growing);
    # consider an unbounded output layer.
    input_dim = X_train.shape[1]
    generator = build_generator(input_dim=input_dim, output_dim=input_dim)
    discriminator = build_discriminator(input_dim=input_dim)
    discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Combine generator and discriminator; the discriminator is frozen inside
    # the stacked model so that GAN updates only move the generator.
    discriminator.trainable = False
    gan_input = layers.Input(shape=(input_dim,))
    gan_output = discriminator(generator(gan_input))
    gan = models.Model(gan_input, gan_output)
    gan.compile(optimizer='adam', loss='binary_crossentropy')

    # Train GAN
    train_gan(generator, discriminator, gan, X_train)

    # Generate synthetic data
    noise = np.random.normal(0, 1, (X_train.shape[0], input_dim))
    synthetic_data = generator.predict(noise)

    # The GAN is unconditional: it generates feature vectors only, with no
    # class label. Labelling every synthetic row as 1 would teach the
    # classifier that all generated points are high-demand, so pseudo-label
    # them with a classifier trained on the real training data instead.
    labeler = build_dnn(input_dim=input_dim)
    labeler.fit(X_train, y_train_arr, epochs=50, batch_size=128, verbose=0)
    synthetic_labels = (labeler.predict(synthetic_data) > 0.5).astype(int).ravel()

    # Combine real and synthetic data
    X_combined = np.vstack((X_train, synthetic_data))
    y_combined = np.hstack((y_train_arr, synthetic_labels))

    # validation_split takes the LAST rows of the arrays before shuffling;
    # without this permutation the validation set would be purely synthetic.
    shuffle_idx = np.random.permutation(X_combined.shape[0])
    X_combined = X_combined[shuffle_idx]
    y_combined = y_combined[shuffle_idx]

    # Train the DNN model
    dnn = build_dnn(input_dim=input_dim)
    dnn.fit(X_combined, y_combined, epochs=50, batch_size=128, validation_split=0.2)

    # Evaluate the model on the untouched real test split
    y_pred = (dnn.predict(X_test) > 0.5).astype(int).ravel()
    print(classification_report(y_test, y_pred))

    # Save models
    save_model(generator, "generator_model.h5")
    save_model(discriminator, "discriminator_model.h5")
    save_model(dnn, "dnn_model.h5")

Epoch 0/10000 - D loss: 0.6815 - G loss: 0.6616
Epoch 500/10000 - D loss: 0.2188 - G loss: 2.8646
Epoch 1000/10000 - D loss: 0.1561 - G loss: 3.1365
Epoch 1500/10000 - D loss: 0.0813 - G loss: 2.8971
Epoch 2000/10000 - D loss: 0.0173 - G loss: 4.5717
Epoch 2500/10000 - D loss: 0.0050 - G loss: 5.7059
Epoch 3000/10000 - D loss: 0.0003 - G loss: 8.1288
Epoch 3500/10000 - D loss: 0.0002 - G loss: 8.7415
Epoch 4000/10000 - D loss: 0.0001 - G loss: 9.4553
Epoch 4500/10000 - D loss: 0.0000 - G loss: 10.6847
Epoch 5000/10000 - D loss: 0.0000 - G loss: 10.9265
Epoch 5500/10000 - D loss: 0.0000 - G loss: 10.7273
Epoch 6000/10000 - D loss: 0.0000 - G loss: 11.9874
Epoch 6500/10000 - D loss: 0.0000 - G loss: 9.8976
Epoch 7000/10000 - D loss: 0.0000 - G loss: 12.7037
Epoch 7500/10000 - D loss: 0.0000 - G loss: 12.0352
Epoch 8000/10000 - D loss: 0.0001 - G loss: 11.7769
Epoch 8500/10000 - D loss: 0.0001 - G loss: 9.5924
Epoch 9000/10000 - D loss: 0.0000 - G loss: 11.5774
Epoch 9500/10000 - D loss: 