In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import time
import pickle
import gzip
import random

In [2]:
!python -m pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [2]:
# Select the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device.type)
# Author's checklist, kept from the original cell (presumably the settings that
# should become configurable -- TODO confirm):
#   input_file_path
#   number of subset
#   test_time_step
#   epoch
#   patience
#   hyperpara
#   output_file

cuda


In [3]:
# Read every Excel shard "<input_file_path><i>.xlsx" and stack them row-wise
# into a single frame with a fresh 0..n-1 index.
input_file_path = './牌局/collection_onehot_handfeature_cleaned_'
n_input_files = 2  # shards 0 .. n_input_files-1 are loaded
shard_frames = []
for i in range(n_input_files):
    shard_path = input_file_path+str(i)+".xlsx"
    start = time.time()
    print("Fetching data, file name: ", shard_path)
    shard_frames.append(pd.read_excel(shard_path))
    print("執行時間：%f 秒" % (time.time() - start))
# Concatenate once at the end instead of growing the frame inside the loop,
# which would re-copy all previously loaded rows on every iteration.
raw_data = pd.concat(shard_frames, axis = 0, ignore_index = True)

Fetching data, file name:  ./牌局/collection_onehot_handfeature_cleaned_0.xlsx
執行時間：460.651228 秒
Fetching data, file name:  ./牌局/collection_onehot_handfeature_cleaned_1.xlsx
執行時間：497.282576 秒


In [4]:
def split_and_pad(mydata, test_time_step=1000):
    """Group rows into per-sample sequences and pad them to a common length.

    Rows are visited in frame order. A row is ignored when its 'index' value
    is '' or when its 'style' value is not "nat". Consecutive kept rows that
    share the same 'index' value form one sequence.

    For every kept row the positional columns 57..187 (131 values) become the
    feature vector, columns 189..226 (38 values) become the target vector and,
    while the test budget is open, columns 0..4 are stored as a decoded label.

    Args:
        mydata: DataFrame with 'index' and 'style' columns and at least 227
            positional columns.
        test_time_step: Row budget for the test portion. When a new sequence
            starts after at least this many rows have been recorded in
            ``decoded_label``, that sequence and all later ones are no longer
            counted as test data. Defaults to 1000 (the previously hard-coded
            value).

    Returns:
        Tuple ``(X, y, sequence_lengths, decoded_label, test_size)``:
        ``X`` / ``y`` are lists of sequences, each padded with rows of -1 up
        to the longest sequence; ``sequence_lengths`` holds the unpadded
        lengths; ``decoded_label`` has one entry per row of the leading test
        sequences; ``test_size`` is the number of leading test sequences.
    """
    feature_cols = slice(57, 188)
    target_cols = slice(189, 227)
    label_cols = slice(0, 5)
    feature_width = 131  # width of a padding feature row
    target_width = 38    # width of a padding target row

    start = time.time()
    print("spliting dataset")
    current_sample_id = ''
    row_data_X = []
    row_data_y = []
    X = []
    y = []
    decoded_label = []
    sequence_lengths = []
    test_size = 0
    cnt = 0      # rows recorded in decoded_label so far
    flag = True  # True while new sequences still belong to the test portion
    for index, row in mydata.iterrows():
        if row['index'] == '' or row['style'] != "nat":
            continue
        if not (row['index'] == current_sample_id):
            # A new sample starts: store the finished sequence, if any.
            if row_data_X != []:
                X.append(row_data_X)
                y.append(row_data_y)
                sequence_lengths.append(len(row_data_X))
            row_data_X = []
            row_data_y = []
            # The test budget is only checked at sequence boundaries, so a
            # test sequence is never cut in the middle.
            if cnt >= test_time_step:
                flag = False
            if flag is True:
                test_size = test_size+1
            current_sample_id = row['index']
        row_data_X.append(row.iloc[feature_cols].tolist())
        row_data_y.append(row.iloc[target_cols].tolist())
        if flag is True:
            decoded_label.append(row.iloc[label_cols].tolist())
            cnt = cnt+1
    # Store the sequence that was still open when the frame ended.
    if row_data_X != []:
        X.append(row_data_X)
        y.append(row_data_y)
        sequence_lengths.append(len(row_data_X))
    print("執行時間：%f 秒" % (time.time() - start))

    start = time.time()
    print("padding dataset")
    max_step = max(sequence_lengths, default=0)
    for i, length in enumerate(sequence_lengths):
        padding_length = max_step - length
        # Build a fresh list per padding row; repeating one list object would
        # make every padding row an alias of the same list.
        X[i].extend([-1] * feature_width for _ in range(padding_length))
        y[i].extend([-1] * target_width for _ in range(padding_length))
    print("執行時間：%f 秒" % (time.time() - start))
    return X, y, sequence_lengths, decoded_label, test_size

In [17]:
# Turn the raw frame into padded per-sample sequences; `sequence_lengths` keeps
# the unpadded lengths and `test_size` is the number of leading test sequences.
X, y, sequence_lengths, decoded_label, test_size = split_and_pad(raw_data)

spliting dataset
執行時間：29.612218 秒
padding dataset
執行時間：0.043907 秒


In [18]:
def _unzip_triples(triples):
    """Split a list of (X, y, length) triples into three tuples; empty input gives three empty tuples."""
    if not triples:
        return (), (), ()
    return tuple(zip(*triples))


SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same train/valid split

start = time.time()
print("train test spliting")
# The first `test_size` sequences are held out as the test set, in original order.
test_X = X[:test_size]
test_y = y[:test_size]
n_sample = len(X)
# The remaining sequences are split 80/20 into training and validation.
train_size = int((len(X) - test_size)*0.8)
valid_size = len(X) - test_size - train_size
combined = list(zip(X[test_size:], y[test_size:], sequence_lengths[test_size:]))
# Use a dedicated generator so the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(combined)
train_X, train_y, train_seq_len = _unzip_triples(combined[:train_size])
valid_X, valid_y, valid_seq_len = _unzip_triples(combined[train_size:])
print("total train_size: ", train_size, "; total valid_size: ", valid_size, "; total test_size: ", test_size)
print("執行時間：%f 秒" % (time.time() - start))

train test spliting
total train_size:  67872 ; total valid_size:  16968 ; total test_size:  348
執行時間：0.260351 秒


In [19]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        hidden_size: Number of hidden units per LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.
        input_size: Number of features per time step (default 131).
        output_size: Number of output scores per time step (default 38).
    """

    def __init__(self, hidden_size, num_layers, dropout, input_size=131, output_size=38):
        super().__init__()
        # nn.LSTM applies dropout only between layers; with a single layer it
        # has no effect and only triggers a warning, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                             num_layers=num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one score vector of size ``output_size`` for every time step of ``x``."""
        out, _ = self.lstm1(x)
        out = self.fc(out)
        return out


def _clone_state_dict(model):
    """Return a deep copy of ``model.state_dict()`` that later weight updates cannot alter."""
    import copy  # local import: the notebook's import cell does not provide it
    return copy.deepcopy(model.state_dict())


def _sequence_tensors(inputs, labels, seq_len, device):
    """Build the (features, class ids) tensors for the first ``seq_len`` steps of one sequence."""
    class_id = [int(np.argmax(labels[time_step])) for time_step in range(0, seq_len)]
    features = torch.tensor(inputs[:seq_len], dtype=torch.float, device=device)
    targets = torch.tensor(class_id, dtype=torch.long, device=device)
    return features, targets


def train_model(model, train_X, train_y, train_seq_len,
                valid_X, valid_y, valid_seq_len, criterion, optimizer, epochs = 100, patience = 6):
    """Train ``model`` one sequence at a time and track the best validation epoch.

    Every training sequence is cut to its unpadded length, its per-step target
    vectors are turned into class ids with argmax, and one optimizer step is
    taken per sequence. After each epoch the summed validation loss is
    computed in eval mode.

    Args:
        model: Module mapping a (seq_len, features) float tensor to per-step scores.
        train_X, train_y, train_seq_len: Padded feature sequences, padded
            target sequences and their unpadded lengths for training.
        valid_X, valid_y, valid_seq_len: Same triple for validation.
        criterion: Loss called as ``criterion(outputs, class_ids)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Maximum number of epochs.
        patience: Training stops once the validation loss has been above the
            lowest value seen so far for more than this many consecutive epochs.

    Returns:
        ``(model, best_model_state, train_losses, valid_losses, elapsed_seconds)``
        where ``best_model_state`` is a copy of the weights from the epoch with
        the lowest validation loss (the initial weights if no epoch improved)
        and the loss lists hold per-epoch averages over sequences.
    """
    total_time = time.time()
    # Use the device the model already lives on instead of forcing CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    train_losses = []
    valid_losses = []
    best_valid_loss = float('inf')
    last_valid_loss = float('inf')
    patience_cnt = 0
    # Defined up front so the return value exists even if no epoch improves.
    best_model_state = _clone_state_dict(model)
    # Avoid division by zero in the per-epoch averages for empty datasets.
    n_train = max(len(train_X), 1)
    n_valid = max(len(valid_X), 1)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        batch_cnt = 0
        start = time.time()
        for inputs, labels, seq_len in zip(train_X, train_y, train_seq_len):
            # NOTE(review): the original loop also tested
            # `(cur_bid > 2) & (cur_bid < max_bid)` to truncate a sequence, but
            # `max_bid` stayed 0, so the test could never succeed. The dead
            # branch is dropped here; if truncation is wanted, `max_bid` has to
            # be tracked and the caller's length tuple must not be mutated.
            inputs, labels = _sequence_tensors(inputs, labels, seq_len, device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += abs(loss.item())
            batch_cnt = batch_cnt+1
            if batch_cnt % 10000 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {batch_cnt}/{len(train_X)}')
                print(f'Loss: {running_loss/batch_cnt}')
                print("執行時間：%f 秒" % (time.time() - start))
                start = time.time()

        start = time.time()
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for inputs, labels, seq_len in zip(valid_X, valid_y, valid_seq_len):
                inputs, labels = _sequence_tensors(inputs, labels, seq_len, device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += abs(loss.item())
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # state_dict() returns references to the live parameters, so it
            # must be copied or later epochs would overwrite the "best" weights.
            best_model_state = _clone_state_dict(model)
        print(f'Epoch {epoch+1}/{epochs}')
        print(f'Training Loss: {running_loss/n_train}')
        print(f'Validation Loss: {valid_loss/n_valid}')
        print("執行時間：%f 秒" % (time.time() - start))
        train_losses.append(running_loss/n_train)
        valid_losses.append(valid_loss/n_valid)
        # `last_valid_loss` is the lowest validation loss seen so far.
        if (last_valid_loss < valid_loss):
            patience_cnt = patience_cnt+1
        else:
            patience_cnt = 0
            last_valid_loss = valid_loss
        if (patience_cnt > patience):
            break
    return model, best_model_state, train_losses, valid_losses, time.time()-total_time

In [23]:
def test_decode(model, best_model_state, hidden_size, num_layers, dropout, file_prefix):
    """Evaluate the best weights on the test set and export models and predictions.

    Reads the notebook-level names ``LSTMModel``, ``test_X``, ``test_y``,
    ``sequence_lengths`` and ``decoded_label``.

    Files written (all prefixed with ``file_prefix``):
        ``_best_model.pgz``  gzip-pickled CPU model loaded from ``best_model_state``
        ``_final_model.pgz`` gzip-pickled ``model`` as passed in
        ``_full.xlsx``       one row of raw scores per test time step
        ``_decoded.xlsx``    per time step: the decoded label and the index of
                             the highest score

    Args:
        model: Final model after training (only pickled here).
        best_model_state: State dict to load into a fresh ``LSTMModel``.
        hidden_size, num_layers, dropout: Constructor arguments of that model.
        file_prefix: Path prefix for every output file.

    Returns:
        ``(test_loss, test_acc)``: mean loss per test sequence and the share of
        time steps whose highest score matches the target class; both are NaN
        when there is nothing to evaluate.
    """
    print("testing dataset")
    start = time.time()
    test_loss = 0
    test_acc_true = 0
    test_acc_false = 0
    criterion = nn.CrossEntropyLoss()
    best_model = LSTMModel(hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
    best_model.load_state_dict(best_model_state)
    best_model.eval()
    # NOTE(review): pickling whole modules ties the files to this class
    # definition, and unpickling executes arbitrary code -- only load these
    # files from trusted sources.
    with gzip.GzipFile(file_prefix+'_best_model.pgz', 'w') as f:
        pickle.dump(best_model, f)
    with gzip.GzipFile(file_prefix+'_final_model.pgz', 'w') as f:
        pickle.dump(model, f)
    results = []
    batch_cnt = 0
    with torch.no_grad():
        # zip stops after the test sequences, so only the leading entries of
        # `sequence_lengths` are consumed.
        for inputs, labels, seq_len in zip(test_X, test_y, sequence_lengths):
            inputs = torch.tensor(inputs[:seq_len], dtype=torch.float)
            class_id = [int(np.argmax(labels[time_step])) for time_step in range(0, seq_len)]
            labels = torch.tensor(class_id, dtype=torch.long)
            predictions = best_model(inputs)
            loss = criterion(predictions, labels)
            test_loss += abs(loss.item())
            results.extend(predictions.tolist())
            # Use torch's own argmax instead of handing tensors to numpy.
            predicted_id = predictions.argmax(dim=1).tolist()
            hits = sum(1 for guess, truth in zip(predicted_id, class_id) if guess == truth)
            test_acc_true = test_acc_true+hits
            test_acc_false = test_acc_false+(len(class_id)-hits)
            batch_cnt = batch_cnt+1
    # Guard both averages so an empty test set yields NaN instead of raising.
    test_loss = test_loss/batch_cnt if batch_cnt else float('nan')
    n_steps = test_acc_true+test_acc_false
    test_acc = test_acc_true/n_steps if n_steps else float('nan')
    results_df = pd.DataFrame(results)
    excel_file = file_prefix+'_full.xlsx'
    results_df.to_excel(excel_file, index=False)
    print("執行時間：%f 秒" % (time.time() - start))
    print("decoding dataset")
    start = time.time()
    # For every time step pair its decoded label with the position of the
    # first highest score (max() keeps the earliest maximum).
    decoded_results = [
        [decoded_label[row_idx], max(range(len(scores)), key=scores.__getitem__)]
        for row_idx, scores in enumerate(results)
    ]
    decoded_results_df = pd.DataFrame(decoded_results)
    excel_file = file_prefix+'_decoded.xlsx'
    decoded_results_df.to_excel(excel_file, index=False)
    print("執行時間：%f 秒" % (time.time() - start))
    return test_loss, test_acc

In [24]:
# Random hyperparameter search: every trial samples a configuration, trains on
# a 10% subset, evaluates on the test set and appends one row to the CSV log.
n_sample = 1
batch_size = int(train_size*0.1)
# Kept in its own name: overwriting `valid_size` here would shrink the
# validation subset by another factor of ten every time this cell is re-run.
valid_subset_size = int(valid_size*0.1)
opt = ['adam', 'amsgrad']
dropout = (0.1, 0.5)
alpha = (-5, -1)
hidden_size = (6, 12)
num_layers = (2, 10)
max_logged_epochs = 100  # number of per-epoch loss columns in the CSV log

file_prefix = './093012'
column_names = ['optimizer', 'dropout', 'lr', 'hidden_size', 'num_layers', 'test_loss', 'test_acc', 'total_time']
additional_columns = [f"{i}.train_loss" for i in range(1, max_logged_epochs+1)] + [f"{i}.valid_loss" for i in range(1, max_logged_epochs+1)] + ['epoch']
column_names.extend(additional_columns)
print(f"with columns {column_names},")
para_df = pd.DataFrame(columns=column_names)
para_df.to_csv((file_prefix+"_hyperpara.csv"), index=False)
print(f"Empty CSV file {file_prefix}_hyperpara.csv has been created.")
for i in range(0, n_sample):
    # np.random.randint excludes its upper bound: the learning-rate exponent
    # spans alpha inclusively (because of the +1), whereas the hidden-size
    # exponent stops at hidden_size[1]-1 and num_layers at num_layers[1]-1.
    paras = [None]*5
    paras[0] = opt[np.random.randint(0, len(opt))]
    paras[1] = np.random.uniform(dropout[0], dropout[1])
    paras[2] = 10 ** (np.random.randint(alpha[0], alpha[1]+1))
    paras[3] = 2 ** (np.random.randint(hidden_size[0], hidden_size[1]))
    paras[4] = np.random.randint(num_layers[0], num_layers[1])
    # asign specific para here
    model = LSTMModel(hidden_size = paras[3], num_layers = paras[4], dropout = paras[1]).to(device)
    print("current model--", i, "th exp: ", "hidden_size = ", paras[3], "num_layers = ", paras[4], "dropout = ", paras[1], "optimizer = ", paras[0], "lr = ", paras[2])
    criterion = nn.CrossEntropyLoss().to(device)
    # Both choices are Adam; 'amsgrad' only switches on the AMSGrad variant.
    optimizer = torch.optim.Adam(model.parameters(), lr=paras[2], amsgrad = (paras[0] != "adam"))
    model, best_model_state, train_losses, valid_losses, total_time = train_model(
        model, train_X[:batch_size], train_y[:batch_size], train_seq_len[:batch_size],
        valid_X[:valid_subset_size], valid_y[:valid_subset_size], valid_seq_len[:valid_subset_size],
        criterion, optimizer, epochs = 5, patience = 3)
    test_loss, test_acc = test_decode(model, best_model_state, paras[3], paras[4], paras[1], file_prefix+"_"+str(i))
    row_data = {
        'optimizer': paras[0],
        'dropout': paras[1],
        'lr': paras[2],
        'hidden_size': paras[3],
        'num_layers': paras[4],
        'test_loss': test_loss,
        'test_acc' : test_acc,
        'total_time' : total_time
    }
    # Epochs that were never run are logged as the string "nan".
    for j in range(0, max_logged_epochs):
        row_data[f'{j+1}.train_loss'] = train_losses[j] if j < len(train_losses) else "nan"
        row_data[f'{j+1}.valid_loss'] = valid_losses[j] if j < len(valid_losses) else "nan"
    row_data['epoch'] = min(len(train_losses), max_logged_epochs)
    # Force the header's column order so the appended row always lines up
    # with the header written above, independent of dict insertion order.
    para_df = pd.DataFrame([row_data], columns=column_names)
    para_df.to_csv(file_prefix+"_hyperpara.csv", mode='a', header=False, index=False)

with columns ['optimizer', 'dropout', 'lr', 'hidden_size', 'num_layers', 'test_loss', 'test_acc', 'total_time', '1.train_loss', '2.train_loss', '3.train_loss', '4.train_loss', '5.train_loss', '6.train_loss', '7.train_loss', '8.train_loss', '9.train_loss', '10.train_loss', '11.train_loss', '12.train_loss', '13.train_loss', '14.train_loss', '15.train_loss', '16.train_loss', '17.train_loss', '18.train_loss', '19.train_loss', '20.train_loss', '21.train_loss', '22.train_loss', '23.train_loss', '24.train_loss', '25.train_loss', '26.train_loss', '27.train_loss', '28.train_loss', '29.train_loss', '30.train_loss', '31.train_loss', '32.train_loss', '33.train_loss', '34.train_loss', '35.train_loss', '36.train_loss', '37.train_loss', '38.train_loss', '39.train_loss', '40.train_loss', '41.train_loss', '42.train_loss', '43.train_loss', '44.train_loss', '45.train_loss', '46.train_loss', '47.train_loss', '48.train_loss', '49.train_loss', '50.train_loss', '51.train_loss', '52.train_loss', '53.train_los