### Load dataset

In [1]:
import pandas as pd

In [2]:
# Path of the single pickle read by the next cell.
# NOTE: a later cell rebinds DATASET_DIRS to a list of per-configuration paths.
DATASET_DIRS = "./Datasets/dataset_verysmall_balanced.pkl"

In [3]:
# Load the dataset from the pickle at DATASET_DIRS.
# SECURITY: unpickling can execute arbitrary code; only load files you trust.
dataset = pd.read_pickle(DATASET_DIRS)

In [None]:
# One pickle path per (hp, df) combination; hp varies slowest, df fastest.
DATASET_DIRS = [
    "./Datasets/Dataset_verysmall_df10-100_hp1000-10000/dataset_100_df{}_hp{}.pkl".format(df_value, hp_value)
    for hp_value in range(1000, 10001, 100)
    for df_value in range(10, 101, 10)
]

In [None]:
# DATASET_DIRS is either one path or a list of paths; produce one frame either way.
# SECURITY: unpickling can execute arbitrary code; only load files you trust.
if not isinstance(DATASET_DIRS, list):
    loaded = pd.read_pickle(DATASET_DIRS)
else:
    frames = [pd.read_pickle(path) for path in DATASET_DIRS]
    # ignore_index=True renumbers the rows 0..n-1 across all loaded files.
    loaded = pd.concat(frames, ignore_index=True)

### Preprocess data

In [4]:
import numpy as np

In [5]:
# Number of monster HP slots in the fixed-width feature layout.
MAX_MONSTER_NUM = 1000
# Column names monster_hp_1 .. monster_hp_<MAX_MONSTER_NUM>.
MONSTER_HP_COLUMNS = ["monster_hp_{}".format(slot) for slot in range(1, MAX_MONSTER_NUM + 1)]
# Model inputs: the two damage columns first, then every HP slot.
FEATURES = ["focus_damage", "aoe_damage"] + MONSTER_HP_COLUMNS
# Regression target column (kept as a list so selections stay 2-D).
TARGET = ["attack_num"]

In [None]:
# Expand the variable-length `monster_hps` lists into MAX_MONSTER_NUM fixed
# columns: each row is sorted in descending order and right-padded with zeros.
hp_lists = loaded["monster_hps"]
temp_hps = np.zeros((len(hp_lists), MAX_MONSTER_NUM), dtype=int)
# Iterate positionally: a label lookup such as loaded["monster_hps"][row] only
# works when `loaded` has a clean 0..n-1 index, which a single pickle may not have.
for row, hps in enumerate(hp_lists):
    # Sorted and padding
    temp_hps[row, :len(hps)] = sorted(hps, reverse=True)

# Reuse loaded's index so the axis=1 concat pairs rows one-to-one instead of
# aligning on mismatched labels (which would introduce NaN rows).
monster_hps = pd.DataFrame(temp_hps, columns=MONSTER_HP_COLUMNS, index=loaded.index)
dataset = pd.concat([loaded, monster_hps], axis=1, ignore_index=False).drop(columns="monster_hps")
dataset.drop_duplicates(inplace=True)

In [None]:
# Save the processed dataset.
# NOTE(review): this writes dataset_verysmall.pkl, while the first cell loads
# dataset_verysmall_balanced.pkl; the balancing step is not shown in this notebook.
dataset.to_pickle("./Datasets/dataset_verysmall.pkl")

### Balance dataset

### Train test split

In [6]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [7]:
# Bucket the target into integer-edged bins so the split below can stratify
# on a regression target.
target_values = dataset[TARGET].to_numpy()
bins = np.linspace(target_values.min(), target_values.max(), 100, dtype=int)
Y_bin = np.digitize(target_values, bins)

In [10]:
# Shuffled, seeded split stratified on the binned target; the test size is
# left at scikit-learn's default.
train_set, test_set = train_test_split(
    dataset,
    stratify=Y_bin,
    shuffle=True,
    random_state=42,
)

In [11]:
# Extract plain NumPy feature / target arrays for each split.
X_train = train_set[FEATURES].to_numpy()
Y_train = train_set[TARGET].to_numpy()
X_test = test_set[FEATURES].to_numpy()
Y_test = test_set[TARGET].to_numpy()

### Normalization

In [19]:
import joblib
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Independent min-max scalers for features and target, both fitted on the
# training split only.
X_scaler = MinMaxScaler()
Y_scaler = MinMaxScaler()

X_train_scaled = X_scaler.fit_transform(X_train.astype("float32"))
Y_train_scaled = Y_scaler.fit_transform(Y_train.astype("float32"))

In [20]:
# Persist both fitted scalers so the same scaling can be reloaded later.
# The ./Checkpoints directory must already exist.
joblib.dump(Y_scaler, "./Checkpoints/Y_scaler.pkl")
joblib.dump(X_scaler, "./Checkpoints/X_scaler.pkl")

['./Checkpoints/X_scaler.pkl']

In [17]:
# Inspect the fitted target scaling. Per the scikit-learn MinMaxScaler docs,
# the transform is X_scaled = X * scale_ + min_.
print(Y_scaler.min_)
print(Y_scaler.scale_)

[-0.00031433]
[0.00010478]


### Model searching

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import SGD, Adam

In [None]:
def history_plot(history):
    """Plot the training loss and, when recorded, the validation loss per epoch.

    Args:
        history: Object exposing a ``history`` dict of per-epoch metric lists
            (the value returned by ``model.fit`` in this notebook). It must
            contain ``"loss"``; ``"val_loss"`` is optional.

    Returns:
        None. One stacked panel is drawn per available curve.

    Raises:
        KeyError: If ``"loss"`` is missing from ``history.history``.
    """
    metrics = history.history
    loss = metrics["loss"]
    epochs = list(range(len(loss)))

    curves = [("loss", loss)]
    # "val_loss" is absent when training ran without validation data; skip
    # that panel instead of failing with a KeyError.
    if "val_loss" in metrics:
        curves.append(("val_loss", metrics["val_loss"]))

    # Explicit figure/axes rather than the implicit pyplot state machine, so
    # the curves never land on a leftover figure from another cell.
    fig, axes = plt.subplots(len(curves), 1, squeeze=False)
    for ax, (title, values) in zip(axes[:, 0], curves):
        ax.set_title(title)
        ax.plot(epochs, values)
    # Stops the lower panel's title from overlapping the upper panel's x-axis.
    fig.tight_layout()
    
def prediction_distribution(pred):
    """Show a 100-bin histogram of the values in ``pred``.

    Args:
        pred: Values to histogram; the notebook passes ``model.predict`` output.
    """
    plt.hist(pred, bins=100)
    plt.show()

In [None]:
# Per-sample feature width (number of columns in X_train), wrapped in a list.
# NOTE(review): not referenced by any other visible cell.
input_shape = [X_train.shape[1]]

In [None]:
class SequenceDense(Model):
    """Two-layer dense regressor applied to the whole feature vector at once."""

    def __init__(self):
        super().__init__()
        # 16-unit ReLU hidden layer followed by a single linear output unit.
        self.dense_1 = Dense(units=16, activation="relu")
        self.dense_2 = Dense(units=1)

    def call(self, inputs):
        """Return the output-layer result for ``inputs`` after the hidden layer."""
        hidden = self.dense_1(inputs)
        return self.dense_2(hidden)

class NonSequenceDense(Model):
    """Dense regressor that encodes the monster HPs before adding the damages.

    The HP columns pass through a 16-unit ReLU layer; the two damage columns
    are then concatenated to that encoding and fed to a single linear output
    unit.
    """

    def __init__(self):
        super().__init__()
        self.dense_1 = Dense(16, activation="relu")
        self.dense_2 = Dense(1)

    def call(self, inputs):
        """Compute the prediction for a batch laid out in ``FEATURES`` order.

        Args:
            inputs: 2-D tensor with 1002 columns: the 2 damage columns first,
                then the 1000 monster HP columns.

        Returns:
            Tensor with one output column per input row.
        """
        # FEATURES lists focus_damage and aoe_damage first, then the HP slots,
        # so the split sizes must be [2, 1000]. The previous [1000, 2] split
        # sent the damages into the HP encoder and treated the last two HP
        # slots as damages.
        damages, hps = tf.split(inputs, [2, 1000], axis=1)
        output = self.dense_1(hps)
        output = tf.concat([output, damages], axis=1)
        output = self.dense_2(output)

        return output

In [None]:
# An imbalanced dataset is a likely cause, judging by the prediction distribution below.
# NOTE(review): `model1` is not defined in any visible cell, so this fails on a
# fresh kernel. Also confirm whether it should predict on scaled or unscaled features.
pred = model1.predict(X_train)
prediction_distribution(pred)

In [None]:
# Train the SequenceDense model on the min-max scaled training data with MAE loss.
model2 = SequenceDense()
model2.compile(loss="mae", optimizer=Adam(learning_rate=0.0001))

history = model2.fit(
    X_train_scaled,
    Y_train_scaled,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

history_plot(history)

In [None]:
# An imbalanced dataset is a likely cause, judging by the prediction distribution below.
pred = model2.predict(X_train_scaled)
prediction_distribution(pred)

In [None]:
# Save the trained model2 weights under ./Checkpoints.
# NOTE(review): newer Keras releases may require the path to end in
# ".weights.h5"; confirm against the installed version.
model2.save_weights("./Checkpoints/SequenceDenseBalanced")

In [None]:
# Train the NonSequenceDense model with the same optimiser, loss and schedule as model2.
model3 = NonSequenceDense()
model3.compile(loss="mae", optimizer=Adam(learning_rate=0.0001))

history = model3.fit(
    X_train_scaled,
    Y_train_scaled,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

history_plot(history)

In [None]:
# An imbalanced dataset is a likely cause, judging by the prediction distribution below.
# model3 was fitted on X_train_scaled, so predict on the same scaled features;
# raw X_train would be on a different scale from the training inputs.
pred = model3.predict(X_train_scaled)
prediction_distribution(pred)

### Possible cause of problem
1. Imbalanced dataset -> See prediction distribution -> Feed a balanced dataset
2. Size of dataset -> Generate more data
3. Scale of dataset -> Add normalization
4. Model's architecture -> Try other architectures
5. Size of model -> Increase the model's size

In [None]:
# An imbalanced dataset is a likely cause, judging by the prediction distribution below.
# NOTE(review): `model` is not defined in any visible cell, so this fails on a
# fresh kernel. Also confirm whether it should predict on scaled or unscaled features.
pred = model.predict(X_train)
prediction_distribution(pred)