In [10]:
import numpy as np
import pandas as pd
import dask.dataframe as dk
import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

file_path = "Processed_Data/Mapped_Dataset.csv"

df = dk.read_csv(file_path)

In [11]:
#Global var 
batch_size = 512
ratio_test_all = 0.15

from dask_ml.model_selection import train_test_split 
# chia train test ratio 0.8:0.2 & random 
train_df, test_df = train_test_split(df, test_size=ratio_test_all, random_state=42)

# # load từng batch
def dask_to_tf_dataset(dask_df, batch_size=128, num_classes=10): 
    def generator():
        for batch in dask_df.to_delayed():
            batch=batch.compute()  
            if batch.empty:
                continue

            X = batch.drop(columns='label').values.astype(np.float32)
            y = batch['label'].values
            y_onehot = to_categorical(y, num_classes=num_classes)  

            num_splits = max(1, len(X) // batch_size)  # Đảm bảo không chia nhỏ quá mức
            X_batches = np.array_split(X, num_splits)
            y_batches = np.array_split(y_onehot, num_splits)

            for X_batch, y_batch in zip(X_batches, y_batches):
                yield X_batch, y_batch
                
    output_signature = ( 
        tf.TensorSpec(shape=(None, 46), dtype=tf.float32), 
        tf.TensorSpec(shape=(None, 10), dtype=tf.int32),
    )
    
    return tf.data.Dataset.from_generator(generator, output_signature=output_signature).prefetch(tf.data.AUTOTUNE)

# from functools import partial
# def dask_generator(dask_df, batch_size=128, num_classes=10): 
#     for batch in dask_df.to_delayed():
#         batch = batch.compute()  
#         if batch.empty:
#             continue

#         X = batch.drop(columns='label').values.astype(np.float32)
#         y = batch['label'].values
#         y_onehot = to_categorical(y, num_classes=num_classes)  

#         num_splits = max(1, len(X) // batch_size)
#         X_batches = np.array_split(X, num_splits)
#         y_batches = np.array_split(y_onehot, num_splits)

#         for X_batch, y_batch in zip(X_batches, y_batches):
#             yield X_batch, y_batch 

# def dask_to_tf_dataset(dask_df, batch_size=128, num_classes=10): 
#     output_signature = ( 
#         tf.TensorSpec(shape=(None, dask_df.shape[1] - 1), dtype=tf.float32),  
#         tf.TensorSpec(shape=(None, num_classes), dtype=tf.float32),
#     )

#     return tf.data.Dataset.from_generator(
#         partial(dask_generator, dask_df, batch_size, num_classes), 
#         output_signature=output_signature
#     ).prefetch(tf.data.AUTOTUNE)



In [12]:
train_gen = dask_to_tf_dataset(train_df, 512, 10).repeat()
test_gen = dask_to_tf_dataset(test_df, 512, 10).repeat()

In [None]:

# shape
features, labels = next(iter(train_gen))
input_shape = (features.shape[1], 1)
output_shape = labels.shape[1]

print(f"Input Shape: {input_shape}")

from tensorflow import keras
# Định nghĩa mô hình CNN
# VGG, ...
# Conv2D, tabular, ...
# HE, tính tương thích của HE với CNN
# Tính chất data in, out; Học tăng cường

model = keras.Sequential([
    layers.Input(shape=input_shape),
    layers.Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"),
    layers.MaxPooling1D(pool_size=4),
    layers.Conv1D(filters=64, kernel_size=3,  padding="same",activation="relu"),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(output_shape, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# for batch in dataloader:
#     X_batch = batch[:, :-1]
#     y_batch = batch[:, -1]
#     y_onehot = to_categorical(y_batch, num_classes=10)
    
#     model.train_on_batch(X_batch, y_onehot, verbose=1)
model.fit(train_gen, epochs=10, steps_per_epoch=71000, verbose = 1)

# Lưu mô hình
model.save("cnn_model_2-0_batch512_test015.h5")

Input Shape: (46, 1)
Epoch 1/10
[1m 1891/71000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:21[0m 11ms/step - accuracy: 0.6849 - loss: 1.0081

# Load Model Để Test


In [None]:
from tensorflow.keras.models import load_model

# Load model từ file .h5
model = load_model("cnn_model_2-0_batch512_test015.h5")

# Test với dữ liệu đầu vào
import numpy as np
output = model.evaluate(test_gen, steps= 190000)
print(f'Loss: {output[1]} Acc: {output[1]}')



[1m190000/190000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4770s[0m 25ms/step - accuracy: 0.8165 - loss: 0.5172
Loss: 0.8166347742080688 Acc: 0.8166347742080688
