Import Library


In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics

Import Data

In [30]:
df = pd.read_csv("traffic.csv",na_values=["NA","?"])
df.drop("time_interval",inplace=True , axis=1)

COLS_USED = ["VinNorthL","totalNorthL","VinSouthL","totalSouthL","VinEastL","totalEastL","VinWestL","totalWestL","time_interval","sequence"]
COLS_TRAIN = ["VinNorthL","totalNorthL","VinSouthL","totalSouthL","VinEastL","totalEastL","VinWestL","totalWestL","time_interval","sequence"]

# Handle missing value
# df = df[COLS_USED]
# df["VinNorthL"] = df['VinNorthL'].fillna(df['VinNorthL'].median())

# Split into training and test sets
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    df.drop("VinNorthL", axis=1),
    df["VinNorthL"],
    test_size=0.20,
    #shuffle=False,
    random_state=42,
)

# Create dataframe versions for tabular GAN
df_x_test, df_y_test = df_x_test.reset_index(drop=True), \
  df_y_test.reset_index(drop=True)
df_y_train = pd.DataFrame(df_y_train)
df_y_test = pd.DataFrame(df_y_test)

# Pandas to Numpy
x_train = df_x_train.values
x_test = df_x_test.values
y_train = df_y_train.values
y_test = df_y_test.values

In [31]:
df

Unnamed: 0,VinNorthL,totalNorthL,VinSouthL,totalSouthL,VinEastL,totalEastL,VinWestL,totalWestL,sequence
0,-6,0,0,0,1,1,3,3,0
1,0,0,3,3,-6,0,2,5,1
2,3,3,-4,0,3,3,2,7,2
3,2,5,0,0,1,4,-3,4,3
4,-6,0,0,0,3,7,2,6,0
...,...,...,...,...,...,...,...,...,...
195,1,13,0,7,1,17,-5,8,3
196,-6,7,1,8,2,19,0,8,0
197,1,8,0,8,-3,16,3,11,1
198,3,11,-6,2,0,16,0,11,2


In [32]:
print(x_train)

[[19  1 14 ... -4  0  3]
 [ 8  0  8 ...  3 11  1]
 [ 6 -6  0 ...  0  5  2]
 ...
 [17  1  7 ...  3  6  0]
 [13  1  6 ... -6  8  3]
 [18 -4  3 ...  0  9  2]]


Model


In [33]:
# Build the neural network
model = Sequential()
# Hidden 1
model.add(Dense(50, input_dim=x_train.shape[1], activation='relu')) 
model.add(Dense(25, activation='relu')) # Hidden 2
model.add(Dense(12, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
        patience=5, verbose=1, mode='auto',
        restore_best_weights=True)
model.fit(x_train,y_train,validation_data=(x_test,y_test),
        callbacks=[monitor], verbose=2,epochs=1000)

Epoch 1/1000
5/5 - 1s - loss: 10.8520 - val_loss: 11.4529 - 680ms/epoch - 136ms/step
Epoch 2/1000
5/5 - 0s - loss: 8.4884 - val_loss: 9.3408 - 27ms/epoch - 5ms/step
Epoch 3/1000
5/5 - 0s - loss: 7.3173 - val_loss: 8.1466 - 26ms/epoch - 5ms/step
Epoch 4/1000
5/5 - 0s - loss: 6.6308 - val_loss: 7.5243 - 27ms/epoch - 5ms/step
Epoch 5/1000
5/5 - 0s - loss: 6.1563 - val_loss: 7.0649 - 26ms/epoch - 5ms/step
Epoch 6/1000
5/5 - 0s - loss: 5.7246 - val_loss: 6.6000 - 27ms/epoch - 5ms/step
Epoch 7/1000
5/5 - 0s - loss: 5.2355 - val_loss: 6.0646 - 28ms/epoch - 6ms/step
Epoch 8/1000
5/5 - 0s - loss: 4.7347 - val_loss: 5.3657 - 26ms/epoch - 5ms/step
Epoch 9/1000
5/5 - 0s - loss: 4.2853 - val_loss: 4.5506 - 48ms/epoch - 10ms/step
Epoch 10/1000
5/5 - 0s - loss: 3.8119 - val_loss: 3.7429 - 25ms/epoch - 5ms/step
Epoch 11/1000
5/5 - 0s - loss: 3.5802 - val_loss: 3.1877 - 27ms/epoch - 5ms/step
Epoch 12/1000
5/5 - 0s - loss: 3.2945 - val_loss: 3.0202 - 26ms/epoch - 5ms/step
Epoch 13/1000
5/5 - 0s - loss: 

<keras.src.callbacks.History at 0x1cf43d9d510>

In [34]:
pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 1.3152434338642673


In [39]:
from tabgan.sampler import GANGenerator
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

gen_x, gen_y = GANGenerator(gen_x_times=1.1, cat_cols=None,
           bot_filter_quantile=0.001, top_filter_quantile=0.850, \
              is_post_process=True,
           adversarial_model_params={
               "metrics": "rmse", "max_depth": 2, "max_bin": 100, 
               "learning_rate": 0.02, "random_state": \
                42, "n_estimators": 500,
           }, pregeneration_frac=2, only_generated_data=False).generate_data_pipe(df_x_train,df_y_train,\
         #   gan_params = {"batch_size": 500, "patience": 25, \
         #  "epochs" : 500,}).generate_data_pipe(df_x_train, df_y_train,\
          df_x_test, deep_copy=True, only_adversarial=False, \
          use_adversarial=True)


Fitting CTGAN transformers for each column:   0%|          | 0/9 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 32, number of negative: 32
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32, number of negative: 32
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32, number of negative: 32
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing

In [40]:
gen_x

Unnamed: 0,totalNorthL,VinSouthL,totalSouthL,VinEastL,totalEastL,VinWestL,totalWestL,sequence
0,11,-6,2,0,16,0,11,2
1,3,1,6,-3,16,0,17,3
2,0,0,3,1,20,3,9,3
3,9,1,6,3,19,2,11,0
4,0,-3,6,2,19,-4,13,2
...,...,...,...,...,...,...,...,...
187,15,-3,10,0,10,0,5,2
188,14,0,11,-3,12,2,4,1
189,15,1,10,1,15,1,5,0
190,14,1,10,3,13,0,4,0


In [37]:
# Predict
pred = model.predict(gen_x.values)
score = np.sqrt(metrics.mean_squared_error(pred,gen_y.values))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 2.9161524030876436


In [14]:
df.to_csv("traffic_w_gan.csv")