In [1]:
import re
import pandas as pd
import math
import csv
from label_encoding import encode_label_onehot

In [2]:
def hand_decode(hand):
    """Turn a textual hand into a 52-element 0/1 list.

    Newlines, spaces and tabs are stripped first. Each regex match (a suit
    marker followed by its rank characters) fills the next block of 13
    slots, in the order the matches appear in the string; within a block
    the slots are ordered a, k, q, j, t, 9, 8, ..., 2.

    Args:
        hand: hand description string using lowercase suit/rank characters.

    Returns:
        A list of 52 ints (0 or 1).

    Raises:
        IndexError: if a fifth (or later) match contains a rank character.
    """
    rank_order = 'akqjt98765432'
    compact = hand.translate(str.maketrans('', '', '\n \t'))
    # NOTE(review): both character classes also accept ',' (and a space,
    # which can no longer occur here) -- confirm that is intended.
    suit_holdings = re.findall(r'[s, h, d, c]([\d, a, k, q, j, t]*)', compact)
    onehot_list = [0] * 52
    for suit_index, holding in enumerate(suit_holdings):
        base = 13 * suit_index
        for offset, rank in enumerate(rank_order):
            if rank in holding:
                onehot_list[base + offset] = 1
    return onehot_list

In [3]:
def abstract_feature(onehot_list):
    """Append summary features to a 52-slot card vector.

    The result has 62 entries:
      * 0..51  - the first 52 entries of ``onehot_list``, copied as-is;
      * 52..55 - high-card points per 13-slot block (slot 0 of a block
                 counts 4, slot 1 counts 3, slot 2 counts 2, slot 3 counts 1);
      * 56     - total high-card points;
      * 57..60 - number of set slots in each 13-slot block;
      * 61     - 1 when every block holds at least two cards, else 0.

    Args:
        onehot_list: sequence with at least 52 truthy/falsy entries.

    Returns:
        A new list of length 62.
    """
    suit_lengths = [0, 0, 0, 0]
    points = [0, 0, 0, 0, 0]  # four per-block totals followed by the grand total
    cards = []
    for position in range(52):
        held = onehot_list[position]
        cards.append(held)
        if not held:
            continue
        suit, rank = divmod(position, 13)
        suit_lengths[suit] += 1
        if rank < 4:
            value = 4 - rank
            points[suit] += value
            points[4] += value
    no_short_suit = 0 if any(length <= 1 for length in suit_lengths) else 1
    return cards + points + suit_lengths + [no_short_suit]

In [4]:
def clean_seq(originSequence, hand_onehot, dealer):
    """Trim a raw bidding string and classify the opening bid.

    The input is lower-cased and stripped of newlines, spaces and tabs, then
    cut at the first occurrence of three consecutive ``mb|p`` tokens. The
    first non-pass call is checked against the hand of the player who made
    it, using the 62-element vectors produced by ``abstract_feature``
    (index 56 = total HCP, 57..60 = the four suit lengths, 61 = the
    "no suit shorter than two cards" flag).

    Args:
        originSequence: raw bidding text.
        hand_onehot: four feature vectors, indexed by seat number.
        dealer: seat number of the dealer; the n-th call belongs to seat
            ``(dealer + n) % 4``.

    Returns:
        ``(style, bidding_sequence)`` where ``style`` is one of
        ``"invalid"`` (no three-pass ending found, or every extracted call
        is a pass), ``"art"`` (opening bid not backed by the hand),
        ``"nat_art"`` (opening looks natural but the sequence contains an
        ``|an|`` tag) or ``"nat"``. ``bidding_sequence`` is the trimmed
        text, or ``""`` when nothing matched.
    """
    normalized = (
        originSequence.lower().replace('\n', '').replace(' ', '').replace('\t', '')
    )
    match = re.search(r'^.*?mb\|p\|mb\|p\|mb\|p', normalized)
    if match is None:
        # No terminating pass-pass-pass: report it instead of raising
        # AttributeError on ``None.group()``.
        return "invalid", ""
    bidding_sequence = match.group()

    calls = re.findall(r'(p|d|r|\dc|\dd|\dh|\ds|\dn)', bidding_sequence)
    leading_passes = 0
    for call in calls:
        if call != 'p':
            break
        leading_passes += 1
    if leading_passes == len(calls):
        # Every extracted call is a pass, so there is no opening bid to
        # classify (indexing ``calls`` here would raise IndexError).
        return "invalid", bidding_sequence

    first_bid = calls[leading_passes]
    bidder_hand = hand_onehot[(dealer + leading_passes) % 4]

    # NOTE(review): 57..60 are filled in the order hand_decode met the suits.
    # The CSV header in this notebook labels them S_len, H_len, D_len, C_len,
    # yet 57 is used for 1c and 60 for 1s below -- confirm the suit order of
    # the hand strings.
    if first_bid == "1c":
        unsupported = bidder_hand[56] < 10 or bidder_hand[57] < 2
    elif first_bid == "1d":
        unsupported = bidder_hand[56] < 10 or bidder_hand[58] < 4
    elif first_bid == "1h":
        unsupported = bidder_hand[56] < 10 or bidder_hand[59] < 5
    elif first_bid == "1s":
        unsupported = bidder_hand[56] < 10 or bidder_hand[60] < 5
    elif first_bid == "1n":
        unsupported = bidder_hand[56] < 14 or bidder_hand[61] == 0
    elif first_bid == "2h":
        unsupported = bidder_hand[59] < 5
    elif first_bid == "2s":
        unsupported = bidder_hand[60] < 5
    elif first_bid == "2n":
        unsupported = bidder_hand[56] < 19 or bidder_hand[61] == 0
    else:
        # Other openings (and a leading double/redouble token) are not checked.
        unsupported = False
    if unsupported:
        return "art", bidding_sequence

    if re.search(r'\|an\|', bidding_sequence):
        return "nat_art", bidding_sequence
    return "nat", bidding_sequence

In [5]:
def preprocess(file, target, init: int):
    """Expand the deals of one source CSV into training rows appended to ``target``.

    For every usable deal and every seat, one output row is written per call
    made by that seat. A row holds, in order: dealer tag, player tag, the
    trimmed bidding sequence, the four-call window ending with the seat's
    own call (the sequence is left-padded with three ``'n'`` placeholders),
    the raw hand string, the 62 hand features, 4 vulnerability flags
    (self, both, none, opponents), the encodings of the four calls in the
    window, the running sample index and the style returned by ``clean_seq``.

    Args:
        file: path of the source CSV; it is read through the columns
            ``BiddingSequence``, ``DEALER``, ``VUL``, ``N``, ``E``, ``S``, ``W``.
        target: path of the CSV that rows are appended to.
        init: first sample index to use (``None``/0 start at 0).

    Returns:
        The next unused sample index. The index advances once per
        (deal, seat) pair, not once per written row.

    Raises:
        ValueError: if a row has an empty ``BiddingSequence``.
    """
    sample = int(init or 0)
    dealer_codes = {'3': 0, '4': 1, '1': 2, '2': 3}
    # (N, E, S, W) = (0, 1, 2, 3)
    seat_columns = ('N', 'E', 'S', 'W')
    seat_tags = ('N-', 'E-', 'S-', 'W-')

    # The source is opened first so a missing input never touches the target;
    # the target is opened once instead of once per written row.
    with open(file, newline='') as csvfile, \
            open(target, 'a', newline='') as target_csvfile:
        reader = csv.DictReader(csvfile)
        writer = csv.writer(target_csvfile)
        # NOTE(review): DictReader has already consumed the header line, so
        # this drops the first data row, exactly as before -- confirm the
        # source files really carry an extra leading row.
        next(reader, None)
        for row in reader:
            if row['BiddingSequence'] == '':
                raise ValueError("Empty bidding sequences.")

            dealer = dealer_codes.get(row['DEALER'], -1)
            hand_onehot = [
                abstract_feature(hand_decode(row[seat])) for seat in seat_columns
            ]
            style, bidding_sequence = clean_seq(
                row['BiddingSequence'], hand_onehot, dealer
            )
            if style == "invalid" or bidding_sequence == "":
                continue

            calls = re.findall(r'(p|d|r|\dc|\dd|\dh|\ds|\dn)', bidding_sequence)
            matches_list = ['n', 'n', 'n'] + list(calls)
            # Depends only on the call list, so it is encoded once per deal.
            encoded_bids = encode_label_onehot(matches_list)
            # An unrecognised DEALER code keeps the bare "-1" tag.
            dealer_tag = (seat_tags[dealer] if 0 <= dealer < 4 else '') + str(dealer)

            for player in range(0, 4):
                vul = [0] * 4
                vul_code = row['VUL']
                if vul_code == 'b':
                    vul[1] = 1
                elif vul_code == '0':
                    vul[2] = 1
                elif vul_code in ('n', 'e'):
                    # Even seats are N/S; 'n' marks N/S vulnerable, 'e' E/W.
                    own_side_vul = (player % 2 == 0) == (vul_code == 'n')
                    vul[0 if own_side_vul else 3] = 1

                player_tag = seat_tags[player] + str(player)
                hand_string = row[seat_columns[player]]
                hand = hand_onehot[player]

                # First window start for this seat; each later turn is 4 calls on.
                start = (player - dealer + 4) % 4
                while start + 3 < len(matches_list):
                    # Built by concatenation so the row is exactly as long as
                    # its parts. Fixed-position slice assignment with
                    # inclusive end indices grew the list and left trailing
                    # empty cells beyond the header width.
                    # NOTE(review): the header expects 39 values per encoded
                    # call -- confirm against encode_label_onehot.
                    row_data = [
                        dealer_tag,
                        player_tag,
                        bidding_sequence,
                        ''.join(matches_list[start:start + 4]),
                        hand_string,
                        *hand,
                        *vul,
                        *encoded_bids[start],
                        *encoded_bids[start + 1],
                        *encoded_bids[start + 2],
                        *encoded_bids[start + 3],
                        sample,
                        style,
                    ]
                    writer.writerow(row_data)
                    start += 4

                sample = sample + 1
    return sample


In [6]:
# Header of the generated CSV files: 5 descriptive columns, 52 card flags,
# 5 HCP values, 4 suit lengths, the balanced flag, 4 vulnerability flags,
# 3 x 39 encoded previous calls, 39 label columns, then index and style.
# The names are generated rather than typed out; the hand-written list had
# three mislabelled columns ('1st_12_ff', '1st71s_ff', '3rd21n_ff'), which
# are now '1st_2d_ff', '1st_7s_ff' and '3rd_2n_ff'.
# NOTE(review): 'dlayer_df' looks like a typo for 'player_df'; it is kept
# unchanged in case downstream code selects the column by name.
bid_names = ['none', 'p', 'd', 'r'] + [
    f"{level}{strain}" for level in range(1, 8) for strain in 'cdhsn'
]
column_names = (
    ['dealer_df', 'dlayer_df', 'total_seq_df', 'current_seq_df', 'hand_df']
    + [f"{suit}{rank}_ff" for suit in 'SHDC' for rank in 'AKQJT98765432']
    + [f"{suit}_HCP_ff" for suit in 'SHDC']
    + ['total_HCP_ff']
    + [f"{suit}_len_ff" for suit in 'SHDC']
    + ['balanced_ff', 'vul_self_ff', 'vul_both_ff', 'vul_none_ff', 'vul_opp_ff']
    + [f"{turn}_{bid}_ff" for turn in ('1st', '2nd', '3rd') for bid in bid_names]
    + [f"{bid}_lf" for bid in bid_names]
    + ['index', 'style']
)
assert len(column_names) == 229, "unexpected header width"
print(f"with columns {column_names},")

# Ten output files, each fed from 1000 consecutively numbered source files.
for i in range(0, 10):
    df = pd.DataFrame(columns=column_names)
    file_path = './牌局/collection_onehot_handfeature_cleaned_'
    output_file_path = (file_path+str(i)+".csv")
    # Writing the header-only frame (re)creates the file, so re-running the
    # cell starts each output from scratch before rows are appended.
    df.to_csv(output_file_path, index=False)
    print(f"Empty CSV file {output_file_path} has been created.")
    sample = 0
    for file_id in range(1+i*1000, 1001+i*1000):
        file_name = f"./牌局/{file_id}.csv"
        try:
            sample = preprocess(file_name, output_file_path, sample)
            print(file_name)
        except Exception as e:
            # Best effort: log the failing file number and move on to the next.
            with open('error_log_preprocess_handfeature_cleaned_0823.txt', 'a') as f:
                f.write(str(file_id))
                f.write(f": An error occurred during preprocessing: {e}\n")
            print(f"An error occurred: {e}")
            continue

with columns ['dealer_df', 'dlayer_df', 'total_seq_df', 'current_seq_df', 'hand_df', 'SA_ff', 'SK_ff', 'SQ_ff', 'SJ_ff', 'ST_ff', 'S9_ff', 'S8_ff', 'S7_ff', 'S6_ff', 'S5_ff', 'S4_ff', 'S3_ff', 'S2_ff', 'HA_ff', 'HK_ff', 'HQ_ff', 'HJ_ff', 'HT_ff', 'H9_ff', 'H8_ff', 'H7_ff', 'H6_ff', 'H5_ff', 'H4_ff', 'H3_ff', 'H2_ff', 'DA_ff', 'DK_ff', 'DQ_ff', 'DJ_ff', 'DT_ff', 'D9_ff', 'D8_ff', 'D7_ff', 'D6_ff', 'D5_ff', 'D4_ff', 'D3_ff', 'D2_ff', 'CA_ff', 'CK_ff', 'CQ_ff', 'CJ_ff', 'CT_ff', 'C9_ff', 'C8_ff', 'C7_ff', 'C6_ff', 'C5_ff', 'C4_ff', 'C3_ff', 'C2_ff', 'S_HCP_ff', 'H_HCP_ff', 'D_HCP_ff', 'C_HCP_ff', 'total_HCP_ff', 'S_len_ff', 'H_len_ff', 'D_len_ff', 'C_len_ff', 'balanced_ff', 'vul_self_ff', 'vul_both_ff', 'vul_none_ff', 'vul_opp_ff', '1st_none_ff', '1st_p_ff', '1st_d_ff', '1st_r_ff', '1st_1c_ff', '1st_1d_ff', '1st_1h_ff', '1st_1s_ff', '1st_1n_ff', '1st_2c_ff', '1st_12_ff', '1st_2h_ff', '1st_2s_ff', '1st_2n_ff', '1st_3c_ff', '1st_3d_ff', '1st_3h_ff', '1st_3s_ff', '1st_3n_ff', '1st_4c_ff', '1