In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
from numpy.lib.npyio import load
from sklearn.linear_model import LogisticRegression
import os
import numpy as np
from sklearn import metrics
import json

In [3]:
def readJson(fname):
    """Read a JSON-lines file and return the successfully parsed records.

    Every line of ``fname`` is parsed independently with ``json.loads``.
    Lines that are not valid JSON are skipped and their 1-based line number
    is reported on stdout as ``error <line_num>``.

    Only JSON decoding failures are tolerated; other exceptions (for example
    ``KeyboardInterrupt`` while loading a large file) propagate to the caller
    instead of being silently swallowed.

    Args:
        fname: path of the UTF-8 encoded JSON-lines file.

    Returns:
        list: parsed objects, in file order, one per valid line.
    """
    data = []
    with open(fname, encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort loading: report the bad line and keep going.
                print("error", line_num)
    return data

In [4]:
def load_train_feature_label(feature_label_file, hidden_dim=256):
    """Load training features and labels from a JSON-lines file.

    Each line must be a JSON object whose ``'f_hidden'`` entry is a flat list
    of ``hidden_dim`` feature values followed by one label value.

    Args:
        feature_label_file: path of the JSON-lines training file.
        hidden_dim: number of feature values per record; every record must
            therefore contain ``hidden_dim + 1`` values. Defaults to 256.

    Returns:
        tuple: ``(x, y)`` where ``x`` is an ndarray of shape
        ``(n_records, hidden_dim)`` and ``y`` is an ndarray of shape
        ``(n_records,)`` holding the last value of each record.

    Raises:
        AssertionError: if a record does not contain ``hidden_dim + 1`` values.
    """
    x = []
    y = []

    with open(feature_label_file, "r", encoding="utf-8") as f:
        for line_num, raw_line in enumerate(f, start=1):
            f_hidden = json.loads(raw_line)['f_hidden']
            assert len(f_hidden) == hidden_dim + 1, (
                "line {}: expected {} values in 'f_hidden', got {}".format(
                    line_num, hidden_dim + 1, len(f_hidden)
                )
            )
            # The label is stored as the trailing element of the vector.
            x.append(f_hidden[:-1])
            y.append(f_hidden[-1])

    x = np.array(x)
    y = np.array(y)
    print("x size", x.shape)
    print("y size", y.shape)

    return x, y

In [5]:
# Training split: each line is a JSON record whose 'f_hidden' list holds the
# feature values followed by the label (see load_train_feature_label).
train_input_file = "../../result/feature_purify/lr0.0005_06_22/feature_lambda_0.0/origin_train/train_f_hidden.json"
train_x, train_y = load_train_feature_label(train_input_file)

x size (24010, 256)
y size (24010,)


In [14]:
def load_test_feature_label(feature_label_file):
    """Load per-pair candidate features and ground truth from a test file.

    The file is read with ``readJson`` (one JSON record per line, invalid
    lines skipped). Each record is expected to provide:

    * ``"feature"``: a 2-D list whose rows are ``[feature_id, *embedding]``;
    * ``"topk"``: the number of features in the top-selected sentences, used
      later as the number of features to select for this pair;
    * ``"gt"``: the feature ids found in the ground-truth sentences.

    Args:
        feature_label_file: path of the JSON-lines test file.

    Returns:
        tuple: four parallel lists with one entry per user-item pair,
        ``(x_ids, x, topk_y, y)``:

        * ``x_ids`` - 1-D ndarrays of candidate feature ids;
        * ``x`` - 2-D ndarrays of the matching hidden embeddings;
        * ``topk_y`` - the ``"topk"`` value of each record;
        * ``y`` - the ``"gt"`` feature-id list of each record.
    """
    x_ids = []
    x = []
    topk_y = []
    y = []

    for record in readJson(feature_label_file):
        # Column 0 carries the feature id, the remaining columns the embedding.
        # Both are sliced from one array, so the ids share the embeddings' dtype.
        id_and_embed = np.array(record["feature"])
        x_ids.append(id_and_embed[:, 0])
        x.append(id_and_embed[:, 1:])
        topk_y.append(record["topk"])
        y.append(record["gt"])

    return x_ids, x, topk_y, y

In [15]:
# Test split: per user-item pair, candidate feature ids + embeddings, the
# top-k budget, and the ground-truth feature ids (see load_test_feature_label).
test_input_file = "../../result/feature_purify/lr0.0005_06_22/feature_lambda_0.0/origin_test/test_f_hidden.json"
test_x_ids, test_x, test_topk_num, test_y = load_test_feature_label(test_input_file)

In [16]:
# Sanity check: number of user-item pairs loaded from the test file.
len(test_x_ids)

80

In [21]:
# Sanity check: number of candidate features for the first pair.
len(test_x_ids[0])

294

In [30]:
# Sanity check: per-pair ids must be an ndarray so they can be fancy-indexed
# with the top-k indices during evaluation.
type(test_x_ids[0])

numpy.ndarray

In [18]:
# Sanity check: one embedding matrix per user-item pair.
len(test_x)

80

In [20]:
# Sanity check: (candidate features, embedding width) for the first pair.
test_x[0].shape

(294, 256)

In [42]:
def train_model(x, y, max_iter):
    """Fit a logistic-regression classifier on the given features and labels.

    Args:
        x: training feature matrix, handed to ``LogisticRegression.fit``.
        y: training labels, one per row of ``x``.
        max_iter: iteration cap passed to the solver.

    Returns:
        The fitted ``LogisticRegression`` estimator (``random_state=0``).
    """
    classifier = LogisticRegression(random_state=0, max_iter=max_iter)
    # ``fit`` returns the estimator itself, so fitting in place is equivalent
    # to chaining the call onto the constructor.
    classifier.fit(x, y)
    return classifier

In [35]:
def iterate_eval_model(model, x_ids, x, y, test_topk_num):
    """Evaluate top-k feature selection per user-item pair and average it.

    For every pair the model scores each candidate feature embedding, the
    ``k`` candidates with the highest probability of class 1 are selected,
    and their ids are compared with the ground-truth ids.

    Degenerate pairs are scored instead of crashing:

    * a top-k budget of 0 (or less) selects nothing, giving precision 0;
    * a budget larger than the number of candidates is clamped to it;
    * an empty ground-truth list gives recall 0.

    Args:
        model: fitted classifier exposing ``predict_proba`` whose column 1 is
            the probability of the positive label.
        x_ids: list of 1-D ndarrays with candidate feature ids, one per pair.
        x: list of 2-D ndarrays with candidate embeddings, one per pair.
        y: list of ground-truth feature-id collections, one per pair.
        test_topk_num: list of per-pair top-k budgets.

    Returns:
        tuple: ``(avg_precision, avg_recall, avg_f1)`` averaged over pairs.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for i, x_i in enumerate(x):
        y_i = y[i]          # gt feature ids for this user-item pair
        xid_i = x_ids[i]    # candidate feature ids for this user-item pair

        # Never ask for more predictions than there are candidates. A budget
        # of 0 must select nothing: slicing with ``[-0:]`` would return every
        # candidate instead.
        k = min(test_topk_num[i], len(x_i))

        if k > 0:
            pos_proba = model.predict_proba(x_i)[:, 1]
            # Unordered indices of the k largest probabilities ...
            top_idx = np.argpartition(pos_proba, -k)[-k:]
            # ... then ordered by descending probability.
            ranked_idx = top_idx[np.argsort(pos_proba[top_idx])][::-1]
            topk_preds_i = xid_i[ranked_idx]
        else:
            topk_preds_i = []

        TP = set(topk_preds_i).intersection(set(y_i))

        precision = len(TP) / k if k > 0 else 0.0
        recall = len(TP) / len(y_i) if len(y_i) > 0 else 0.0

        if (precision + recall) != 0.0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return avg_precision, avg_recall, avg_f1

In [43]:
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit, i.e. max_iter=1000 did not converge. Consider scaling
# the features or raising max_iter before trusting the metrics below.
lr_model = train_model(train_x, train_y, max_iter=1000)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [36]:
# Per-pair top-k precision / recall / F1, averaged over all test pairs.
precision, recall, f1 = iterate_eval_model(lr_model, test_x_ids, test_x, test_y, test_topk_num)

In [40]:
# Report the averaged metrics with four significant digits.
print("Precision: {:.4} \t Recall: {:.4} \t F1: {:.4}".format(
    precision, recall, f1
))

Precision: 0.2731 	 Recall: 0.1888 	 F1: 0.2103
