## Construct attributes programmatically

#### Simple conjunctions/disjunctions

* Conjunctions and disjunctions are generated from all combinations of two binary atoms – $\{(p \land q), (p\land r), (q \land r), \dots\}$

  * 51 (binary atoms) choose 2 = 1275 additional atoms

* Combinations of literals with negations can also be considered – $\{(p \land q), (\lnot p \land q), (p \land\lnot q), (\lnot p \land\lnot q), \dots\}$.

  * 1275 * 4 = 5100 additional atoms

#### Filtering

* $\textbf{is1}(c_1) \land \textbf{is2}(c_1)$ is a contradiction – card 1 will never be both ranks 1 and 2
* $\textbf{isR}(c_1) \land \textbf{isY}(c_1)$ is a contradiction – card 1 will never be both red and yellow
* Discard attributes which provide little information (e.g. tautologies and contradictions) – [mutual information filtering](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif)

In [53]:
import pandas as pd

train = pd.read_csv('train.csv', sep='\t', true_values = ['T'], false_values= ['F'])
test  = pd.read_csv('test.csv',  sep='\t', true_values = ['T'], false_values= ['F'])

In [54]:
train.select_dtypes(include='bool').columns.shape[0]

165

In [4]:
from itertools import combinations

def combinationCols(df, level=2):
    return list(map(list, combinations(df.select_dtypes(include='bool').columns, 2)))

def filterBinaryCols(df):
    return df.loc[:, (df == True).any()].loc[:, (df == False).any()]

def conjunctionCols(df):
    return pd.DataFrame({'&'.join(cols) : df[cols].all(axis=1) for cols in combinationCols(df)})

def disjunctionCols(df):
    return pd.DataFrame({'|'.join(cols) : df[cols].any(axis=1) for cols in combinationCols(df)})

def conjunctionNegCols(df):
    comb = combinationCols(df)
    return   (pd.DataFrame({f'{p}&{q}'   : df[p] & df[q]   for p, q in comb})) \
        .join(pd.DataFrame({f'~{p}&{q}'  : ~df[p] & df[q]  for p, q in comb})) \
        .join(pd.DataFrame({f'{p}&~{q}'  : df[p] & ~df[q]  for p, q in comb})) \
        .join(pd.DataFrame({f'~{p}&~{q}' : ~df[p] & ~df[q] for p, q in comb}))

def disjunctionNegCols(df):
    comb = combinationCols(df)
    return   (pd.DataFrame({f'{p}|{q}'   : df[p] | df[q]   for p, q in comb})) \
        .join(pd.DataFrame({f'~{p}|{q}'  : ~df[p] | df[q]  for p, q in comb})) \
        .join(pd.DataFrame({f'{p}|~{q}'  : df[p] | ~df[q]  for p, q in comb})) \
        .join(pd.DataFrame({f'~{p}|~{q}' : ~df[p] | ~df[q] for p, q in comb}))

## Construct attributes from inferences

#### Human conventions

* Chop card – the oldest unclued card

#### Informative inferences

* Existing `ps...` (plausible) attributes only incorporate information from previously received clues, but agents can examine the discard pile and their partner's hand and consider the maximum copies of each card to further eliminate possibilities

* Agents can sometimes infer whether playing or discarding a card is rewarding or dangerous, if its rank is known and the firework stacks are considered – such rules involve many atomic attributes and are challenging for rule learners to produce

In [55]:
def chopCols(df):
    ch = pd.DataFrame({'Chop_c5'  : (df[['Col_c5',  'Rnk_c5' ]] == 'X').all(axis=1),
                       'Chop_c4'  : (df[['Col_c5',  'Rnk_c5' ]] != 'X').any(axis=1) & (df[['Col_c4',  'Rnk_c4' ]] == 'X').all(axis=1),
                       'Chop_c3'  : (df[['Col_c5',  'Rnk_c5' ]] != 'X').any(axis=1) & (df[['Col_c4',  'Rnk_c4' ]] != 'X').any(axis=1) & (df[['Col_c3',  'Rnk_c3' ]] == 'X').all(axis=1),
                       'Chop_c2'  : (df[['Col_c5',  'Rnk_c5' ]] != 'X').any(axis=1) & (df[['Col_c4',  'Rnk_c4' ]] != 'X').any(axis=1) & (df[['Col_c3',  'Rnk_c3' ]] != 'X').any(axis=1) & (df[['Col_c2',  'Rnk_c2' ]] == 'X').all(axis=1),
                       'Chop_c1'  : (df[['Col_c5',  'Rnk_c5' ]] != 'X').any(axis=1) & (df[['Col_c4',  'Rnk_c4' ]] != 'X').any(axis=1) & (df[['Col_c3',  'Rnk_c3' ]] != 'X').any(axis=1) & (df[['Col_c2',  'Rnk_c2' ]] != 'X').any(axis=1) & (df[['Col_c1',  'Rnk_c1' ]] == 'X').all(axis=1),
                       'KChop_c5' : (df[['KCol_c5', 'KRnk_c5']] == 'X').all(axis=1),
                       'KChop_c4' : (df[['KCol_c5', 'KRnk_c5']] != 'X').any(axis=1) & (df[['KCol_c4', 'KRnk_c4']] == 'X').all(axis=1),
                       'KChop_c3' : (df[['KCol_c5', 'KRnk_c5']] != 'X').any(axis=1) & (df[['KCol_c4', 'KRnk_c4']] != 'X').any(axis=1) & (df[['KCol_c3', 'KRnk_c3']] == 'X').all(axis=1),
                       'KChop_c2' : (df[['KCol_c5', 'KRnk_c5']] != 'X').any(axis=1) & (df[['KCol_c4', 'KRnk_c4']] != 'X').any(axis=1) & (df[['KCol_c3', 'KRnk_c3']] != 'X').any(axis=1) & (df[['KCol_c2', 'KRnk_c2']] == 'X').all(axis=1),
                       'KChop_c1' : (df[['KCol_c5', 'KRnk_c5']] != 'X').any(axis=1) & (df[['KCol_c4', 'KRnk_c4']] != 'X').any(axis=1) & (df[['KCol_c3', 'KRnk_c3']] != 'X').any(axis=1) & (df[['KCol_c2', 'KRnk_c2']] != 'X').any(axis=1) & (df[['KCol_c1', 'KRnk_c1']] == 'X').all(axis=1)})
    return ch.join(pd.DataFrame({f'is{x}_chop' : ch['KChop_c1'] & df[f'is{x}_c1'] |
                                                 ch['KChop_c2'] & df[f'is{x}_c2'] |
                                                 ch['KChop_c3'] & df[f'is{x}_c3'] |
                                                 ch['KChop_c4'] & df[f'is{x}_c4'] |
                                                 ch['KChop_c5'] & df[f'is{x}_c5'] for x in ('R','Y','G','W','B','1','2','3','4','5')}))

def chopAction(df):
    df.loc[(df['class'] == 'COLOR_R') & df['isR_chop'] |
           (df['class'] == 'COLOR_Y') & df['isY_chop'] |
           (df['class'] == 'COLOR_G') & df['isG_chop'] |
           (df['class'] == 'COLOR_W') & df['isW_chop'] |
           (df['class'] == 'COLOR_B') & df['isB_chop'], 'class'] = 'COLOR_CHOP'
    df.loc[(df['class'] == 'RANK_1') & df['is1_chop'] |
           (df['class'] == 'RANK_2') & df['is2_chop'] |
           (df['class'] == 'RANK_3') & df['is3_chop'] |
           (df['class'] == 'RANK_4') & df['is4_chop'] |
           (df['class'] == 'RANK_5') & df['is5_chop'], 'class'] = 'RANK_CHOP'
    df.loc[(df['class'] == 'PLAY_1') & df['Chop_c1'] |
           (df['class'] == 'PLAY_2') & df['Chop_c2'] |
           (df['class'] == 'PLAY_3') & df['Chop_c3'] |
           (df['class'] == 'PLAY_4') & df['Chop_c4'] |
           (df['class'] == 'PLAY_5') & df['Chop_c5'], 'class'] = 'PLAY_CHOP' # ignore?
    df.loc[(df['class'] == 'DISCARD_1') & df['Chop_c1'] |
           (df['class'] == 'DISCARD_2') & df['Chop_c2'] |
           (df['class'] == 'DISCARD_3') & df['Chop_c3'] |
           (df['class'] == 'DISCARD_4') & df['Chop_c4'] |
           (df['class'] == 'DISCARD_5') & df['Chop_c5'], 'class'] = 'DISCARD_CHOP'
    return df

def includeChop(df):
    return chopAction(appendCols(df, chopCols(df)))

In [59]:
import numpy as np

def deduceCols(df):
    cards = df.columns[df.columns.str.startswith('ds_')].str[3:]
    colours = cards.str[0].unique()
    ranks = cards.str[1].unique()
    hand = df.columns[df.columns.str.startswith('psR_')].str[4:] # 'c1', 'c2', ...
    # seen_R1 = ds_R1 + int(isR_c1 & is1_c1)
    #                 + int(isR_c2 & is1_c2) ...
    seen = pd.DataFrame({f'seen_{c}{r}' : df[f'ds_{c}{r}'] +
                         pd.DataFrame(zip(*[
                             (df[f'is{c}_{card}'] & df[f'is{r}_{card}']).astype(int)
                             for card in hand])).sum(axis=1)
                         for c, r in cards}) # deduce visible copies
    # card 1 can only be rank 1 if, for some plausible colour of card 1,
    # not all rank 1 cards of that colour has been seen (exists C. psC_c1 & seen_C1 < 3)
    # dps1_c1 = ps1_c1 & ( psR_c1 & seen_R1 < 3
    #                    | psY_c1 & seen_Y1 < 3)
    # dpsR_c1 = psR_c1 & ( ps1_c1 & seen_R1 < 3
    #                    | ps2_c1 & seen_R2 < 3 ...)
    copies = {'1': 3, '2': 2, '3': 2, '4': 2, '5': 1}
    dpsC = pd.DataFrame({f'dps{c}_{card}' : df[f'ps{c}_{card}'] &
                         pd.DataFrame(zip(*[
                             df[f'ps{r}_{card}'] & (seen[f'seen_{c}{r}'] < copies[r])
                             for r in ranks])).any(axis=1)
                         for card in hand
                         for c in colours}) # deduce plausible colours
    dpsR = pd.DataFrame({f'dps{r}_{card}' : df[f'ps{r}_{card}'] &
                         pd.DataFrame(zip(*[
                             df[f'ps{c}_{card}'] & (seen[f'seen_{c}{r}'] < copies[r])
                             for c in colours])).any(axis=1)
                         for card in hand
                         for r in ranks}) # deduce plausible ranks
    # dK1_c1 = ps1_c1 & ~(ps2_c1 | ps3_c1 | ps4_c1 | ps5_c1)
    # dRnk_c1 = 1*dK1_c1 + 2*dK2_c1 + 3*dK3_c1 + 4*dK4_c1 + 5*dK5_c1 
    dRnk = pd.DataFrame({f'dRnk_{card}' : pd.DataFrame(zip(*[
                             (dpsR[f'dps{r}_{card}'] &
                             ~pd.DataFrame(zip(*[
                                 dpsR[f'dps{r1}_{card}']
                                 for r1 in ranks if r1 != r])).any(axis=1)).map({True: int(r), False: np.nan})
                             for r in ranks])).fillna(method='bfill', axis=1).iloc[:, 0].astype('Int64')
                         for card in hand}) # deduce exact ranks
    dCol = pd.DataFrame({f'dCol_{card}' : pd.DataFrame(zip(*[
                             (dpsC[f'dps{c}_{card}'] &
                             ~pd.DataFrame(zip(*[
                                 dpsC[f'dps{c1}_{card}']
                                 for c1 in colours if c1 != c])).any(axis=1)).map({True: c, False: np.nan})
                             for c in colours])).fillna(method='bfill', axis=1).iloc[:, 0].fillna('X') # deduce exact colours
                         for card in hand}) # deduce exact colours
    # dRnk  = dRnk.join(pd.DataFrame({f'dRnk?_{card}' : dRnk[f'dRnk_{card}'].isna() for card in hand}))
    # a card is definitely safe to play if for all plausible colours C, card rank is fw_C+1
    sfP  = pd.DataFrame({f'sfP_{card}' : pd.DataFrame(zip(*[
                             ~dpsC[f'dps{c}_{card}'] | (dRnk[f'dRnk_{card}'] == df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely unsafe to play if for all plausible colours C, card rank is not fw_C+1 (lose life)
    usfP = pd.DataFrame({f'usfP_{card}' : pd.DataFrame(zip(*[
                             ~dpsC[f'dps{c}_{card}'] | (dRnk[f'dRnk_{card}'] != df[f'fw_{c}']+1) & dRnk[f'dRnk_{card}'].notna()
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely safe to discard if for all plausible colours C, card rank is < fw_C+1
    sfD  = pd.DataFrame({f'sfD_{card}' : pd.DataFrame(zip(*[
                             ~dpsC[f'dps{c}_{card}'] | (dRnk[f'dRnk_{card}'] < df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely unsafe to discard if for all plausible colours C, card rank is >= fw_C+1
    usfD = pd.DataFrame({f'usfD_{card}' : pd.DataFrame(zip(*[
                             ~dpsC[f'dps{c}_{card}'] | (dRnk[f'dRnk_{card}'] >= df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    dRnk = dRnk.astype(object).fillna('X').astype(str) # all NaN inequalities complete
    
    dKpsC = pd.DataFrame({f'dKps{c}_{card}' : df[f'Kps{c}_{card}'] &
                         pd.DataFrame(zip(*[
                             df[f'Kps{r}_{card}'] & (df[f'ds_{c}{r}'] < copies[r])
                             for r in ranks])).any(axis=1)
                         for card in hand
                         for c in colours}) # deduce K plausible colours
    dKpsR = pd.DataFrame({f'dKps{r}_{card}' : df[f'Kps{r}_{card}'] &
                         pd.DataFrame(zip(*[
                             df[f'Kps{c}_{card}'] & (df[f'ds_{c}{r}'] < copies[r])
                             for c in colours])).any(axis=1)
                         for card in hand
                         for r in ranks}) # deduce K plausible ranks
    dKRnk = pd.DataFrame({f'dKRnk_{card}' : pd.DataFrame(zip(*[
                             (dKpsR[f'dKps{r}_{card}'] &
                             ~pd.DataFrame(zip(*[
                                 dKpsR[f'dKps{r1}_{card}']
                                 for r1 in ranks if r1 != r])).any(axis=1)).map({True: int(r), False: np.nan})
                             for r in ranks])).fillna(method='bfill', axis=1).iloc[:, 0].astype('Int64')
                         for card in hand}) # deduce exact ranks
    dKCol = pd.DataFrame({f'dKCol_{card}' : pd.DataFrame(zip(*[
                             (dKpsC[f'dKps{c}_{card}'] &
                             ~pd.DataFrame(zip(*[
                                 dKpsC[f'dKps{c1}_{card}']
                                 for c1 in colours if c1 != c])).any(axis=1)).map({True: c, False: np.nan})
                             for c in colours])).fillna(method='bfill', axis=1).iloc[:, 0].fillna('X') # deduce exact colours
                         for card in hand}) # deduce exact colours
    # a card is definitely safe to play if for all plausible colours C, card rank is fw_C+1
    KsfP  = pd.DataFrame({f'KsfP_{card}' : pd.DataFrame(zip(*[
                             ~dKpsC[f'dKps{c}_{card}'] | (dKRnk[f'dKRnk_{card}'] == df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely unsafe to play if for all plausible colours C, card rank is not fw_C+1 (lose life)
    KusfP = pd.DataFrame({f'KusfP_{card}' : pd.DataFrame(zip(*[
                             ~dKpsC[f'dKps{c}_{card}'] | (dKRnk[f'dKRnk_{card}'] != df[f'fw_{c}']+1) & dKRnk[f'dKRnk_{card}'].notna()
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely safe to discard if for all plausible colours C, card rank is < fw_C+1
    KsfD  = pd.DataFrame({f'KsfD_{card}' : pd.DataFrame(zip(*[
                             ~dKpsC[f'dKps{c}_{card}'] | (dKRnk[f'dKRnk_{card}'] < df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    # a card is definitely unsafe to discard if for all plausible colours C, card rank is >= fw_C+1
    KusfD = pd.DataFrame({f'KusfD_{card}' : pd.DataFrame(zip(*[
                             ~dKpsC[f'dKps{c}_{card}'] | (dKRnk[f'dKRnk_{card}'] >= df[f'fw_{c}']+1)
                             for c in colours])).all(axis=1)
                         for card in hand})
    dKRnk = dKRnk.astype(object).fillna('X').astype(str) # all NaN inequalities complete
    
    df = dpsC.join(dpsR).join(dCol).join(dRnk).join(sfP).join(usfP).join(sfD).join(usfD)
    df = df.join(dKpsC).join(dKpsR).join(dKCol).join(dKRnk).join(KsfP).join(KusfP).join(KsfD).join(KusfD)
    
    # remove 'd' prefix
    return df.rename(columns={col : col[1:] for col in df.columns[df.columns.str.startswith('d')]})

def appendDeduceCols(df):
    # remove old KR/KY cols
    return appendCols(df.drop(columns=df.columns[df.columns.str.match('K._')]), deduceCols(df))

def dropBelief(df):
    # remove K* cols
    return df.drop(columns=df.columns[df.columns.str.match('K.')])

## Transform datasets for BioHEL

* Datasets exported to the BioHEL-compatibile ARFF file format

In [20]:
import csv
import numpy as np

def exportDataset(df, filename, chunk_size=500):
    type_map = {
        np.dtype(bool): '{0,1}',
        np.dtype(np.float64): 'NUMERIC',
        np.dtype(np.int64): 'NUMERIC',
        pd.Int64Dtype(): 'NUMERIC'
    }
    with open('%s.txt' % filename, 'w') as f:
        f.write('@relation %s\n' % filename)
        # write attribute data
        for attribute in df.columns:
            dtype = df[attribute].dtype
            if dtype in type_map:
                atype = type_map[dtype]
            else:
                atype = '{%s}' % ','.join(sorted(df[attribute].unique()))
            f.write('@attribute %s %s\n' % (attribute, atype))
        # write dataset in chunks
        f.write('@data\n')
        chunks = [df[i:i+chunk_size] for i in range(0, df.shape[0], chunk_size)]
        for chunk in chunks:
            (chunk*1).to_csv(f, header=False, index=False, mode='a', na_rep='?')

def appendCols(df, cols):
    old_cols = df.columns.tolist()
    add_cols = cols.columns.difference(df.columns).tolist()
    print('overwriting columns:', cols.columns.intersection(df.columns).tolist())
    df = df.reindex(columns=old_cols[:-1] + add_cols + old_cols[-1:])
    df.loc[cols.index, cols.columns] = cols
    return df

In [57]:
%cd /notebooks/BioHEL-demo/BioHEL-cuda/Datasets

/notebooks/BioHEL-demo/BioHEL-cuda/Datasets


In [None]:
# Belief
exportDataset(test,  'belief_test')
exportDataset(train, 'belief_train')

In [9]:
# Base
exportDataset(dropBelief(dropCritical(test)),  'test')
exportDataset(dropBelief(dropCritical(train)), 'train')

In [19]:
# Belief + deductions
exportDataset(appendDeduceCols(test),  'belief_deduce_test')
exportDataset(appendDeduceCols(train), 'belief_deduce_train')

overwriting columns: ['psR_c1', 'psY_c1', 'psG_c1', 'psW_c1', 'psB_c1', 'psR_c2', 'psY_c2', 'psG_c2', 'psW_c2', 'psB_c2', 'psR_c3', 'psY_c3', 'psG_c3', 'psW_c3', 'psB_c3', 'psR_c4', 'psY_c4', 'psG_c4', 'psW_c4', 'psB_c4', 'psR_c5', 'psY_c5', 'psG_c5', 'psW_c5', 'psB_c5', 'ps1_c1', 'ps2_c1', 'ps3_c1', 'ps4_c1', 'ps5_c1', 'ps1_c2', 'ps2_c2', 'ps3_c2', 'ps4_c2', 'ps5_c2', 'ps1_c3', 'ps2_c3', 'ps3_c3', 'ps4_c3', 'ps5_c3', 'ps1_c4', 'ps2_c4', 'ps3_c4', 'ps4_c4', 'ps5_c4', 'ps1_c5', 'ps2_c5', 'ps3_c5', 'ps4_c5', 'ps5_c5', 'Col_c1', 'Col_c2', 'Col_c3', 'Col_c4', 'Col_c5', 'Rnk_c1', 'Rnk_c2', 'Rnk_c3', 'Rnk_c4', 'Rnk_c5', 'KpsR_c1', 'KpsY_c1', 'KpsG_c1', 'KpsW_c1', 'KpsB_c1', 'KpsR_c2', 'KpsY_c2', 'KpsG_c2', 'KpsW_c2', 'KpsB_c2', 'KpsR_c3', 'KpsY_c3', 'KpsG_c3', 'KpsW_c3', 'KpsB_c3', 'KpsR_c4', 'KpsY_c4', 'KpsG_c4', 'KpsW_c4', 'KpsB_c4', 'KpsR_c5', 'KpsY_c5', 'KpsG_c5', 'KpsW_c5', 'KpsB_c5', 'Kps1_c1', 'Kps2_c1', 'Kps3_c1', 'Kps4_c1', 'Kps5_c1', 'Kps1_c2', 'Kps2_c2', 'Kps3_c2', 'Kps4_c2', 'Kps

In [11]:
# Base + deductions
exportDataset(appendDeduceCols(dropBelief(dropCritical(test))),  'deduce_test')
exportDataset(appendDeduceCols(dropBelief(dropCritical(train))), 'deduce_train')

overwriting columns: ['psR_c1', 'psY_c1', 'psG_c1', 'psW_c1', 'psB_c1', 'psR_c2', 'psY_c2', 'psG_c2', 'psW_c2', 'psB_c2', 'psR_c3', 'psY_c3', 'psG_c3', 'psW_c3', 'psB_c3', 'psR_c4', 'psY_c4', 'psG_c4', 'psW_c4', 'psB_c4', 'psR_c5', 'psY_c5', 'psG_c5', 'psW_c5', 'psB_c5', 'ps1_c1', 'ps2_c1', 'ps3_c1', 'ps4_c1', 'ps5_c1', 'ps1_c2', 'ps2_c2', 'ps3_c2', 'ps4_c2', 'ps5_c2', 'ps1_c3', 'ps2_c3', 'ps3_c3', 'ps4_c3', 'ps5_c3', 'ps1_c4', 'ps2_c4', 'ps3_c4', 'ps4_c4', 'ps5_c4', 'ps1_c5', 'ps2_c5', 'ps3_c5', 'ps4_c5', 'ps5_c5', 'Col_c1', 'Col_c2', 'Col_c3', 'Col_c4', 'Col_c5', 'Rnk_c1', 'Rnk_c2', 'Rnk_c3', 'Rnk_c4', 'Rnk_c5']
overwriting columns: ['psR_c1', 'psY_c1', 'psG_c1', 'psW_c1', 'psB_c1', 'psR_c2', 'psY_c2', 'psG_c2', 'psW_c2', 'psB_c2', 'psR_c3', 'psY_c3', 'psG_c3', 'psW_c3', 'psB_c3', 'psR_c4', 'psY_c4', 'psG_c4', 'psW_c4', 'psB_c4', 'psR_c5', 'psY_c5', 'psG_c5', 'psW_c5', 'psB_c5', 'ps1_c1', 'ps2_c1', 'ps3_c1', 'ps4_c1', 'ps5_c1', 'ps1_c2', 'ps2_c2', 'ps3_c2', 'ps4_c2', 'ps5_c2', 'ps1_c

In [61]:
# Belief + chop
exportDataset(includeChop(test),  'belief_chop_test')
exportDataset(includeChop(train), 'belief_chop_train')

overwriting columns: []
overwriting columns: []


In [60]:
# Belief + chop + deductions
exportDataset(appendDeduceCols(includeChop(test)),  'belief_deduce_chop_test')
exportDataset(appendDeduceCols(includeChop(train)), 'belief_deduce_chop_train')

overwriting columns: []
overwriting columns: ['psR_c1', 'psY_c1', 'psG_c1', 'psW_c1', 'psB_c1', 'psR_c2', 'psY_c2', 'psG_c2', 'psW_c2', 'psB_c2', 'psR_c3', 'psY_c3', 'psG_c3', 'psW_c3', 'psB_c3', 'psR_c4', 'psY_c4', 'psG_c4', 'psW_c4', 'psB_c4', 'psR_c5', 'psY_c5', 'psG_c5', 'psW_c5', 'psB_c5', 'ps1_c1', 'ps2_c1', 'ps3_c1', 'ps4_c1', 'ps5_c1', 'ps1_c2', 'ps2_c2', 'ps3_c2', 'ps4_c2', 'ps5_c2', 'ps1_c3', 'ps2_c3', 'ps3_c3', 'ps4_c3', 'ps5_c3', 'ps1_c4', 'ps2_c4', 'ps3_c4', 'ps4_c4', 'ps5_c4', 'ps1_c5', 'ps2_c5', 'ps3_c5', 'ps4_c5', 'ps5_c5', 'Col_c1', 'Col_c2', 'Col_c3', 'Col_c4', 'Col_c5', 'Rnk_c1', 'Rnk_c2', 'Rnk_c3', 'Rnk_c4', 'Rnk_c5', 'KpsR_c1', 'KpsY_c1', 'KpsG_c1', 'KpsW_c1', 'KpsB_c1', 'KpsR_c2', 'KpsY_c2', 'KpsG_c2', 'KpsW_c2', 'KpsB_c2', 'KpsR_c3', 'KpsY_c3', 'KpsG_c3', 'KpsW_c3', 'KpsB_c3', 'KpsR_c4', 'KpsY_c4', 'KpsG_c4', 'KpsW_c4', 'KpsB_c4', 'KpsR_c5', 'KpsY_c5', 'KpsG_c5', 'KpsW_c5', 'KpsB_c5', 'Kps1_c1', 'Kps2_c1', 'Kps3_c1', 'Kps4_c1', 'Kps5_c1', 'Kps1_c2', 'Kps2_c2', 'K

## Export datasets after filtering

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

def selectBest(df, k=55, preserve=0):
    print('selecting')
    selector = SelectKBest(mutual_info_classif, k=k)
    print('fitting')
    selector.fit(df.iloc[:, preserve:-1], df.iloc[:, -1])
    print('done fitting')
    cols = selector.get_support(indices=True)
    return df.iloc[:, :preserve].join(df.iloc[:, cols]).join(df.iloc[:, -1])

In [None]:
# train_new = selectBest(appendCols(train, disjunctionCols(train)))
train_new = selectBest(appendCols(train, disjunctionCols(train)), 45, 55)

selecting
fitting
done fitting


In [None]:
print(len(train_new.columns))
print([c for c in train_new.columns])

101
['isR_c1', 'isY_c1', 'is1_c1', 'is2_c1', 'is3_c1', 'is4_c1', 'is5_c1', 'isR_c2', 'isY_c2', 'is1_c2', 'is2_c2', 'is3_c2', 'is4_c2', 'is5_c2', 'psR_c1', 'psY_c1', 'ps1_c1', 'ps2_c1', 'ps3_c1', 'ps4_c1', 'ps5_c1', 'psR_c2', 'psY_c2', 'ps1_c2', 'ps2_c2', 'ps3_c2', 'ps4_c2', 'ps5_c2', 'KR_c1', 'KY_c1', 'K1_c1', 'K2_c1', 'K3_c1', 'K4_c1', 'K5_c1', 'KR_c2', 'KY_c2', 'K1_c2', 'K2_c2', 'K3_c2', 'K4_c2', 'K5_c2', 'KKC_c1', 'KKR_c1', 'KKC_c2', 'KKR_c2', 'fw_R', 'fw_Y', 'COLOR', 'RANK', 'DISCARD', 'PLAY', 'NONE', 'life', 'info', 'isY_c1|is1_c1', 'isY_c1|ps5_c1', 'isY_c1|NONE', 'is3_c1|ps2_c1', 'is4_c1|ps1_c1', 'isY_c2|K4_c1', 'is1_c2|is4_c2', 'is2_c2|is3_c2', 'is3_c2|RANK', 'ps1_c2|RANK', 'ps2_c2|KR_c1', 'ps2_c2|KY_c1', 'ps2_c2|K1_c1', 'ps2_c2|K5_c1', 'ps2_c2|KR_c2', 'ps2_c2|K4_c2', 'ps2_c2|KKC_c1', 'ps2_c2|KKR_c1', 'ps2_c2|KKC_c2', 'ps2_c2|KKR_c2', 'ps2_c2|COLOR', 'ps2_c2|RANK', 'ps2_c2|DISCARD', 'ps2_c2|PLAY', 'ps2_c2|NONE', 'ps3_c2|ps4_c2', 'ps3_c2|ps5_c2', 'ps3_c2|KR_c1', 'ps3_c2|K3_c1', '

In [None]:
test_new  = appendCols(test,  disjunctionCols(test))[train_new.columns]

In [None]:
exportDataset(test_new,  'small_disj_mi_100_test')
exportDataset(train_new, 'small_disj_mi_100_train')