In [53]:
import re
import os
import sys
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from tools.rnn import RecurrentNeuralNetwork

from sklearn.metrics import recall_score, precision_score, f1_score

In [49]:
def get_dataset(filepath):
    def get_pair(df):
        ys = np.array(df['Target'])
        df = df.drop('Target', axis=1)
        xs = np.array(df)
        return xs, ys
    tr_df, te_df = pd.read_pickle(filepath)
    tr_xs, tr_ys = get_pair(tr_df)
    te_xs, te_ys = get_pair(te_df)
    return tr_xs, tr_ys, te_xs, te_ys

def get_scores_map(model, opt, term):
    dir_path = os.path.join('data', '%s', 'time%d') % (opt, term)
    sample_n = sum(1 for v in os.listdir(dir_path) if re.compile(r'data\d{3}').match(v))
    scores_map = {'recall': [], 'precision': [], 'f1-score': []}
    for i in range(sample_n):
        filepath = os.path.join(dir_path, 'data%03d') % (i+1)
        tr_xs, tr_y, te_xs, te_y = get_dataset(filepath)
        model.fit(tr_xs, tr_y)
        preds = model.predict(te_xs)
        if all(preds == 0):
            scores_map['precision'].append(0)
            scores_map['f1-score'].append(0)
        else:
            scores_map['precision'].append(precision_score(te_y, preds))
            scores_map['f1-score'].append(f1_score(te_y, preds))
        scores_map['recall'].append(recall_score(te_y, preds))
        sys.stdout.write('\r%s' % filepath)
    print()
    return scores_map

def get_lr():
    model = LogisticRegression(
        solver='newton-cg',
        random_state=0
    )
    return model

def get_gb():
    model = GradientBoostingClassifier(
        n_iter_no_change=10,
        warm_start=True,
        random_state=0
    )
    return model

In [None]:
print(stats.ttest_rel(result1[:, 1], result2[:, 1]))

In [50]:
for opt in ['base', 'ours', 'text']:
    for term in [1, 2, 3, 4]:
        model = get_gb()
        scores_map = get_scores_map(model, opt, term)
        print({k: "%0.04f" % np.mean(scores_map[k]) for k in scores_map})
        print({k: "%0.04f" % np.std(scores_map[k]) for k in scores_map})
    print()


data\base\time1\data050
{'recall': '0.9667', 'precision': '0.9090', 'f1-score': '0.9271'}
{'recall': '0.1000', 'precision': '0.1299', 'f1-score': '0.0855'}
data\base\time2\data050
{'recall': '0.9733', 'precision': '1.0000', 'f1-score': '0.9840'}
{'recall': '0.0904', 'precision': '0.0000', 'f1-score': '0.0543'}
data\base\time3\data050
{'recall': '1.0000', 'precision': '0.9767', 'f1-score': '0.9853'}
{'recall': '0.0000', 'precision': '0.0943', 'f1-score': '0.0601'}
data\base\time4\data050
{'recall': '0.8200', 'precision': '0.9333', 'f1-score': '0.8507'}
{'recall': '0.2786', 'precision': '0.2108', 'f1-score': '0.2281'}

data\ours\time1\data050
{'recall': '0.9733', 'precision': '0.9950', 'f1-score': '0.9811'}
{'recall': '0.0904', 'precision': '0.0350', 'f1-score': '0.0570'}
data\ours\time2\data050
{'recall': '0.9733', 'precision': '1.0000', 'f1-score': '0.9840'}
{'recall': '0.0904', 'precision': '0.0000', 'f1-score': '0.0543'}
data\ours\time3\data050
{'recall': '0.9900', 'precision': '0.99

In [52]:
for opt in ['base', 'ours', 'text']:
    for term in [1, 2, 3, 4]:
        model = get_lr()
        scores_map = get_scores_map(model, opt, term)
        print({k: "%0.04f" % np.mean(scores_map[k]) for k in scores_map})
        print({k: "%0.04f" % np.std(scores_map[k]) for k in scores_map})
    print()


data\base\time1\data050
{'recall': '0.8600', 'precision': '0.5557', 'f1-score': '0.6526'}
{'recall': '0.2010', 'precision': '0.1904', 'f1-score': '0.1583'}
data\base\time2\data050
{'recall': '0.8667', 'precision': '0.8147', 'f1-score': '0.8197'}
{'recall': '0.1764', 'precision': '0.1795', 'f1-score': '0.1361'}
data\base\time3\data050
{'recall': '0.8800', 'precision': '0.9167', 'f1-score': '0.8800'}
{'recall': '0.2135', 'precision': '0.1863', 'f1-score': '0.1822'}
data\base\time4\data050
{'recall': '1.0000', 'precision': '0.8733', 'f1-score': '0.9227'}
{'recall': '0.0000', 'precision': '0.1718', 'f1-score': '0.1061'}

data\ours\time1\data050
{'recall': '0.8400', 'precision': '0.8205', 'f1-score': '0.8045'}
{'recall': '0.1914', 'precision': '0.2026', 'f1-score': '0.1512'}
data\ours\time2\data050
{'recall': '0.8667', 'precision': '0.9353', 'f1-score': '0.8852'}
{'recall': '0.1764', 'precision': '0.1241', 'f1-score': '0.1249'}
data\ours\time3\data050
{'recall': '0.8700', 'precision': '0.86