In [8]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import sys
import re
import pickle
import itertools
import json


from rulekit.classification import RuleClassifier
from rulekit.params import Measures

from rulekit.survival import SurvivalRules

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_recall_curve, roc_curve, auc
from sklearn.metrics import make_scorer, roc_auc_score

# Make the project's local helper modules under ../src importable; this has to
# run before the `utils` import below.
sys.path.append('./../src/')
# NOTE(review): the star import hides which names come from `utils`.
# `get_metrics`, called in the training loop, is not defined in the visible
# cells, so it presumably comes from here — consider an explicit import.
from utils import *

# Instantiate a throwaway classifier and discard it.
# NOTE(review): presumably this triggers RuleKit's one-time backend start-up
# so that it is not counted in the per-seed training timings — confirm against
# the installed rulekit version.
_ = RuleClassifier()

In [5]:
# Read the feature table (first CSV column becomes the row index) and discard
# its 'index' column.
data_df = pd.read_csv('./../Data/1000_features_survival_3classes.csv', index_col=0)
data_df = data_df.drop(columns=['index'])

# Display the number of samples per value of the target column 'y'.
data_df['y'].value_counts()

y
0    921
1    246
2     42
Name: count, dtype: int64

# Best configs

In [2]:
# Settings handed to every RuleClassifier built in the training loop:
#   minsupp_new -> the classifier's `minsupp_new` argument
#   measure     -> used for induction, pruning and voting alike
minsupp_new = 3.0
measure = Measures.RSS

In [12]:
# Dataset
nclasses = 3  # the target column 'y' takes the values 0, 1 and 2

# Integer-encode every object-typed column. One encoder instance is re-fitted
# per column, so after the loop `le` only holds the mapping of the last column.
le = LabelEncoder()
object_columns = [c for c in data_df.columns if data_df[c].dtype == 'object']
for col in object_columns:
    data_df[col] = le.fit_transform(data_df[col])

# Fill the remaining missing values with the mean of their column.
# NOTE(review): the means are taken over the whole table, i.e. before the
# train/val/test split done in the training loop — confirm this is intended.
data_df = data_df.fillna(data_df.mean())

# Target is 'y'; the features are all columns except 'event', 'time' and 'y'.
y = data_df['y']
X = data_df.drop(columns=['event', 'time', 'y'])


# Settings for the repeated splits: held-out fraction and one seed per run
test_size = 0.3
seeds = [999, 7, 42, 1995, 1303, 2405, 1996, 200, 0, 777]

In [4]:
# Per-seed results table: one list per column, one slot per seed. The slots
# start as None and are filled in by the training loop.
n_runs = len(seeds)

# Record the configuration that is actually in effect instead of hard-coded
# copies, so the table cannot drift from the config cell.
# Measures.RSS -> 'rss' (falls back to str() if the object has no `.name`).
measure_label = getattr(measure, 'name', str(measure)).lower()

# Metrics returned by get_metrics; each is stored once per data split.
metric_names = ['accuracy', 'MCC', 'f1', 'auroc', 'auprc']
split_names = ['train', 'val', 'test']
# Rule-set statistics stored once per run.
stat_names = ['nrules',
              'rules_count',
              'conditions_per_rule',
              'induced_conditions_per_rule',
              'avg_rule_coverage',
              'avg_rule_precision',
              'avg_rule_quality',
              'pvalue']

result = {'seed': seeds,
          'measure': [measure_label]*n_runs,
          'minsupp_new': [minsupp_new]*n_runs,
          'time': [None]*n_runs}
# Generating the columns from the name lists keeps the key order of the
# original literal and rules out duplicated keys (the literal listed
# 'f1_train' twice).
for split_name in split_names:
    for metric_name in metric_names:
        result[metric_name + '_' + split_name] = [None]*n_runs
for stat_name in stat_names:
    result[stat_name] = [None]*n_runs

def _reorder_proba(proba, class_to_column, n_classes):
    """Return ``proba`` with its columns rearranged so that column k is class k.

    The original notebook notes that RuleKit may order the classes differently
    from the label values, so the probability columns have to be put back into
    label order before they are compared with the one-hot targets.

    Args:
        proba: 2-D array returned by ``classifier.predict_proba``.
        class_to_column: maps each class label to the column that holds it.
        n_classes: number of classes; labels are taken to be 0..n_classes-1.

    Returns:
        Array with one column per class, in label order.
    """
    return proba[:, [class_to_column[label] for label in range(n_classes)]]


def _record_split_metrics(result, run_idx, split, banner, y_true, y_pred, y_proba, y_onehot):
    """Compute, print and store the metrics of one data split.

    Args:
        result: results table; ``result[<metric>_<split>][run_idx]`` is written.
        run_idx: position of the current seed in the table.
        split: column suffix, one of 'train', 'val', 'test'.
        banner: header line printed before the metrics.
        y_true, y_pred, y_proba, y_onehot: passed to ``get_metrics`` unchanged.
    """
    print(banner)
    accuracy, MCC, f1, auroc, auprc = get_metrics(y_true, y_pred, y_proba, y_onehot)
    print("accuracy: ", accuracy)
    print("MCC: ", MCC)
    print("f1: ", f1)
    print("auroc: ", auroc)
    print("auprc: ", auprc)
    result['accuracy_' + split][run_idx] = accuracy
    result['MCC_' + split][run_idx] = MCC
    result['f1_' + split][run_idx] = f1
    result['auroc_' + split][run_idx] = auroc
    result['auprc_' + split][run_idx] = auprc


def _rule_attributes(rule_text):
    """Return the attribute name of every condition in a rule's IF-part.

    Only the leading "IF " is stripped. Removing every "IF " occurrence would
    also eat the tail of attribute names that end in "IF" (e.g. "LIF = ...").
    """
    premise = rule_text.split(" THEN ")[0]
    if premise.startswith("IF "):
        premise = premise[len("IF "):]
    return [condition.split(" = ")[0] for condition in premise.split(' AND ')]


for idx, seed in enumerate(seeds):
    # Hold out `test_size` of the data, then halve the held-out part into a
    # validation and a test set; both splits are stratified on the label.
    X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=seed)

    classifier = RuleClassifier(induction_measure=measure,
                                pruning_measure=measure,
                                voting_measure=measure,
                                minsupp_new=minsupp_new)

    # Time the fit only.
    start = time.time()
    classifier.fit(X_train, y_train)
    end = time.time()
    elapsed_time = end-start
    print("Time for training: ", elapsed_time)
    # Stored as a plain number, like every other column of the table
    # (it used to be wrapped in a one-element list).
    result['time'][idx] = elapsed_time
    ruleset = classifier.model

    # class label -> column of predict_proba that belongs to it
    mapper_dict = {class_: position for position, class_ in enumerate(classifier.label_unique_values)}

    # Predictions and label-ordered class probabilities for each split
    y_train_pred = classifier.predict(X_train)
    y_train_proba = classifier.predict_proba(X_train)
    y_train_proba_corrected = _reorder_proba(y_train_proba, mapper_dict, nclasses)

    y_val_pred = classifier.predict(X_val)
    y_val_proba = classifier.predict_proba(X_val)
    y_val_proba_corrected = _reorder_proba(y_val_proba, mapper_dict, nclasses)

    y_test_pred = classifier.predict(X_test)
    y_test_proba = classifier.predict_proba(X_test)
    y_test_proba_corrected = _reorder_proba(y_test_proba, mapper_dict, nclasses)

    # One-hot targets; the binarizer is fitted on the training labels only.
    label_binarizer = LabelBinarizer().fit(y_train)
    y_onehot_train = label_binarizer.transform(y_train)
    y_onehot_val = label_binarizer.transform(y_val)
    y_onehot_test = label_binarizer.transform(y_test)

    ################### RESULTS
    _record_split_metrics(result, idx, 'train',
                          "*********************** TRAIN ***********************",
                          y_train, y_train_pred, y_train_proba_corrected, y_onehot_train)
    _record_split_metrics(result, idx, 'val',
                          "*********************** VALID ***********************",
                          y_val, y_val_pred, y_val_proba_corrected, y_onehot_val)
    _record_split_metrics(result, idx, 'test',
                          "*********************** TEST ***********************",
                          y_test, y_test_pred, y_test_proba_corrected, y_onehot_test)

    print("Number of rules: ", len(ruleset.rules))
    result['nrules'][idx] = len(ruleset.rules)

    # Copy the rule-set statistics exposed by the fitted model.
    tmp_dict = vars(ruleset.stats)
    for stat_key in ('rules_count',
                     'conditions_per_rule',
                     'induced_conditions_per_rule',
                     'avg_rule_coverage',
                     'avg_rule_precision',
                     'avg_rule_quality',
                     'pvalue'):
        result[stat_key][idx] = tmp_dict[stat_key]

    # Save the rules of this run: rule text, the attribute used by each
    # condition, and the number of conditions.
    rules_df = pd.DataFrame({'rules':[str(rule) for rule in ruleset.rules]})
    rules_df['conditions'] = [_rule_attributes(rule) for rule in rules_df['rules']]
    # Count the parsed conditions rather than 'AND' substrings, which would
    # also match inside attribute names (e.g. "HAND1").
    rules_df['nconditions'] = [len(conditions) for conditions in rules_df['conditions']]
    rules_df.to_csv('./../results/RuleKit/csvs/classes3/rules_class_seed'+str(seed)+'.csv')



Time for training:  115.15857529640198
*********************** TRAIN ***********************
accuracy:  0.9988179669030733
MCC:  0.9968688567089448
f1:  [0.9992242  0.99710145 1.        ]
auroc:  [0.99999229 0.99999137 1.        ]
auprc:  [0.9999975981587497, 0.9999662956386361, 1.0]
*********************** VALID ***********************
accuracy:  0.7569060773480663
MCC:  0.24544095898566085
f1:  [0.85324232 0.38095238 0.        ]
auroc:  [0.63506909 0.65915916 0.54428571]
auprc:  [0.8577759258677471, 0.34936521755617367, 0.0799886252843679]
*********************** TEST ***********************
accuracy:  0.7582417582417582
MCC:  0.25326513001785
f1:  [0.85423729 0.38709677 0.        ]
auroc:  [0.6240942  0.65964585 0.52897959]
auprc:  [0.8518193388199491, 0.3475407657849425, 0.05219780219780221]
Number of rules:  63
Time for training:  108.46986484527588
*********************** TRAIN ***********************
accuracy:  0.9988179669030733
MCC:  0.9968688567089448
f1:  [0.9992242  0.99710

# Results for rules

In [7]:

# Seeds whose saved rule tables are summarised here.
seeds = [999, 7, 42, 1995, 1303, 2405, 1996, 200, 0, 777]

# Load the per-seed rule tables and stack them, tagging every rule with the
# seed that produced it. The path is the one the training loop writes to
# (this cell used to read from the parent directory './../results/RuleKit/').
per_seed_rules = []
for seed in seeds:
    rules_df_ = pd.read_csv('./../results/RuleKit/csvs/classes3/rules_class_seed'+str(seed)+'.csv',index_col=0)
    rules_df_['seed'] = seed
    per_seed_rules.append(rules_df_)
rules_df = pd.concat(per_seed_rules)

# Get rules min max conditions etc
print("Min, max rule counts", rules_df.value_counts('seed').min(), rules_df.value_counts('seed').max())
print("Min, max, median conditions counts", rules_df['nconditions'].min(), rules_df['nconditions'].max(), rules_df['nconditions'].median())

# Get rules class distribution
# The class is the content of the last "{...}" of the rule text
# ("... THEN y = {2}" -> '2'). Matching the braces, rather than taking the
# second-to-last character, also works for labels longer than one character.
rules_df['class'] = rules_df["rules"].str.extract(r'\{([^{}]*)\}\s*$', expand=False)
# Name the count columns explicitly so the result does not depend on the
# pandas version's default name for value_counts nor on merge suffixes.
df1 = rules_df.value_counts(['seed','class']).reset_index(name='count_x')
df2 = rules_df.value_counts(['seed']).reset_index(name='count_y')
rule_dist_df = df1.merge(df2, on='seed')
# Share of each seed's rules that predict a given class (a fraction in [0, 1]).
rule_dist_df['percent'] = rule_dist_df['count_x']/rule_dist_df['count_y']
for class_label in ('0', '1', '2'):
    print("% of class " + class_label + ": ", rule_dist_df[rule_dist_df['class']==class_label]['percent'].median())

Min, max rule counts 60 83
Min, max, median conditions counts 8 58 21.0
% of class 0:  0.7405329593267882
% of class 1:  0.20144927536231885
% of class 2:  0.05334281650071124
