In [2]:
import re
import os
import sys
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from tools.rnn import RecurrentNeuralNetwork

from sklearn.metrics import recall_score, precision_score, f1_score

In [3]:
def get_dataset(filepath):
    """Load one pickled (train, test) sample and split it into features and labels.

    The pickle at `filepath` is expected to unpack into exactly two
    DataFrames, each containing a 'Target' column.

    NOTE: `pd.read_pickle` can execute arbitrary code while loading; only
    point this at files produced by a trusted pipeline.

    Args:
        filepath: Path of the pickled pair of DataFrames.

    Returns:
        Tuple `(tr_xs, tr_ys, te_xs, te_ys)` of numpy arrays, where the `xs`
        hold every column except 'Target' and the `ys` hold the 'Target'
        column.
    """
    def split_xy(frame):
        # 'Target' is the label column; everything else is a feature.
        labels = np.array(frame['Target'])
        features = np.array(frame.drop(columns='Target'))
        return features, labels

    train_frame, test_frame = pd.read_pickle(filepath)
    tr_xs, tr_ys = split_xy(train_frame)
    te_xs, te_ys = split_xy(test_frame)
    return tr_xs, tr_ys, te_xs, te_ys

def get_scores_map(model, opt, term):
    """Fit `model` on every sample under data/<opt>/time<term> and score it.

    Samples are files named 'data001', 'data002', ... inside the directory;
    they are assumed to be numbered consecutively starting at 1. For each
    sample the model is fit on the training split and evaluated on the test
    split.

    The same `model` instance is refit for every sample, so an estimator that
    keeps state between `fit` calls will carry it from one sample to the next.

    Args:
        model: Object exposing `fit(xs, ys)` and `predict(xs)`.
        opt: Name of the sub-directory of 'data' to read from.
        term: Integer used to build the 'time<term>' directory name.

    Returns:
        Dict with keys 'recall', 'precision' and 'f1-score', each mapping to
        a list holding one score per sample, in sample order.
    """
    dir_path = os.path.join('data', str(opt), 'time%d' % term)
    # fullmatch: a prefix match would also count stray entries such as
    # 'data001.bak', inflating the sample count and causing a missing-file
    # error further down. '{3,}' keeps 'data1000' and beyond valid.
    sample_pattern = re.compile(r'data\d{3,}')
    sample_n = sum(1 for v in os.listdir(dir_path) if sample_pattern.fullmatch(v))
    scores_map = {'recall': [], 'precision': [], 'f1-score': []}
    for i in range(sample_n):
        # Format only the file name so a '%' inside dir_path cannot break it.
        filepath = os.path.join(dir_path, 'data%03d' % (i + 1))
        tr_xs, tr_ys, te_xs, te_ys = get_dataset(filepath)
        model.fit(tr_xs, tr_ys)
        preds = model.predict(te_xs)
        # np.all works for any prediction shape; the builtin all() raises on
        # arrays with more than one dimension.
        if np.all(np.asarray(preds) == 0):
            # No positive predictions: record 0 directly instead of calling
            # precision_score / f1_score on this degenerate case.
            scores_map['precision'].append(0)
            scores_map['f1-score'].append(0)
        else:
            scores_map['precision'].append(precision_score(te_ys, preds))
            scores_map['f1-score'].append(f1_score(te_ys, preds))
        scores_map['recall'].append(recall_score(te_ys, preds))
        # '\r' rewrites the same output line to act as a progress indicator.
        sys.stdout.write('\r%s' % filepath)
        sys.stdout.flush()
    return scores_map

def get_lr(opt, term):
    """Build the logistic-regression model.

    `opt` and `term` are not used here; they are accepted so that every
    model factory can be called with the same arguments.

    Returns:
        Tuple `(model, 'lr')`: an unfitted LogisticRegression and its short name.
    """
    classifier = LogisticRegression(solver='newton-cg', random_state=0)
    return classifier, 'lr'

def get_gb(opt, term):
    """Build the gradient-boosting model.

    `opt` and `term` are not used here; they are accepted so that every
    model factory can be called with the same arguments.

    Returns:
        Tuple `(model, 'gb')`: an unfitted GradientBoostingClassifier with
        early stopping (`n_iter_no_change=10`) and its short name.
    """
    model = GradientBoostingClassifier(
        n_iter_no_change=10,
        # The returned estimator is refit once per data sample by
        # get_scores_map. With warm_start=True every refit would keep the
        # trees learned on the previous sample (and add none once
        # n_estimators is reached), so scores would not be independent.
        warm_start=False,
        random_state=0
    )
    return model, 'gb'

def get_nn(opt, term):
    """Build the recurrent-neural-network model.

    Unlike the other model factories, both `opt` and `term` are forwarded
    to the model constructor.

    Returns:
        Tuple `(model, 'nn')`: a RecurrentNeuralNetwork and its short name.
    """
    return RecurrentNeuralNetwork(opt, term), 'nn'

In [1]:
# Evaluate every (model factory, opt, term) combination and keep the
# per-sample scores keyed by (model name, opt, term).
total_score_map = {}
for get_model in (get_lr, get_gb, get_nn):
    print(get_model)
    for opt in ('base', 'ours', 'text'):
        print(opt)
        for term in range(1, 5):
            model, name = get_model(opt, term)
            scores_map = get_scores_map(model, opt, term)
            # The leading '\r' overwrites the progress line written by
            # get_scores_map; each metric is shown as mean(std).
            print("\r", {
                metric: "%0.04f(%0.04f)" % (np.mean(values), np.std(values))
                for metric, values in scores_map.items()
            })
            total_score_map[(name, opt, term)] = scores_map
            


NameError: name 'get_lr' is not defined

In [None]:
# Persist all collected scores. pd.to_pickle takes the object first and the
# destination path second.
pd.to_pickle(total_score_map, os.path.join('data', 'total_score_map'))