In [1]:
import numpy as np
import pandas as pd

In [39]:
def merge_input_data(csv_path):
    """Load per-image features and pair every image with every other image.

    The CSV at ``csv_path`` is read with its first column as the index and
    must contain an ``img_id`` column. The table is cross-joined with itself
    through a temporary constant ``key`` column; columns coming from the left
    copy receive the suffix ``_A`` and those from the right copy ``_B``.
    Rows that pair an image with itself (equal ``img_id``) are removed.

    Args:
        csv_path: path (or anything ``pd.read_csv`` accepts) of the feature CSV.

    Returns:
        pandas.DataFrame: one row per ordered pair of input rows whose
        ``img_id`` values differ, holding the ``_A`` and ``_B`` columns.
    """
    single_images = pd.read_csv(csv_path, index_col=0)

    # Every row carries the same key, so merging on it yields the full
    # cartesian product of the table with itself.
    keyed = single_images.assign(key=1)
    all_pairs = keyed.merge(keyed, on="key", suffixes=["_A", "_B"])
    all_pairs = all_pairs.drop("key", axis=1)

    # Discard the diagonal: an image compared with itself.
    distinct_images = all_pairs["img_id_A"] != all_pairs["img_id_B"]
    return all_pairs[distinct_images]


In [40]:
# Build the table of all ordered pairs of distinct images from the per-image feature CSV.
img_features = merge_input_data("./HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv")

In [41]:
def read_labeled_data(csv_paths):
    """Read several CSV files and stack their rows into one DataFrame.

    Args:
        csv_paths: iterable of paths (or anything ``pd.read_csv`` accepts).

    Returns:
        pandas.DataFrame: the rows of all files, in the order the paths were
        given. Each file keeps its own row index, so index values may repeat
        across files.

    Raises:
        IndexError: if ``csv_paths`` is empty.
    """
    frames = [pd.read_csv(path) for path in csv_paths]
    if not frames:
        # Same exception type the former ``csv_paths[0]`` lookup raised.
        raise IndexError("csv_paths must contain at least one path")

    # A single concat over all frames avoids re-copying the accumulated rows
    # on every iteration, which concatenating inside a loop would do.
    return pd.concat(frames)

In [42]:
# Label files to combine: one CSV of "different" pairs and one of "same" pairs (per the file names).
csv_paths = ["./HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv", "./HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv"]
# Stack the rows of both files into a single labeled-pairs table.
labeled_data = read_labeled_data(csv_paths)

In [45]:
def filter_writer_pairs(img_features, labeled_data):
    """Keep only the image pairs that appear in ``labeled_data`` and attach its columns.

    Args:
        img_features: DataFrame of image pairs with string columns
            ``img_id_A`` and ``img_id_B`` plus the per-image feature columns.
        labeled_data: DataFrame with ``img_id_A`` and ``img_id_B`` columns and
            any further columns to attach to the matching pairs.

    Returns:
        pandas.DataFrame: inner join of the two inputs on
        (``img_id_A``, ``img_id_B``), with an extra ``writer_A`` column
        holding the first four characters of ``img_id_A`` (presumably the
        writer identifier -- confirm against the dataset's naming scheme).
    """
    # Pre-filter on the concatenated "A + B" id string so that only rows whose
    # combined id occurs among the labeled pairs reach the merge.
    candidate_keys = img_features["img_id_A"] + img_features["img_id_B"]
    labeled_keys = labeled_data["img_id_A"] + labeled_data["img_id_B"]
    candidates = img_features[candidate_keys.isin(labeled_keys)]

    # Exact match on both id columns; this also pulls in the remaining
    # columns of ``labeled_data``.
    labeled_pairs = candidates.merge(labeled_data, on=["img_id_A", "img_id_B"])

    labeled_pairs["writer_A"] = [img_id[:4] for img_id in labeled_pairs["img_id_A"]]
    return labeled_pairs

In [47]:
# Restrict the all-pairs table to the labeled pairs and attach the label columns and writer_A.
feature_set = filter_writer_pairs(img_features, labeled_data)

In [48]:
def train_val_test_split(feature_set, train_frac=0.8, val_frac=0.1, seed=None):
    """Split ``feature_set`` into training, validation and test frames by writer.

    The distinct values of the ``writer_A`` column are shuffled and cut into
    three groups. Every row goes to the frame of the group its ``writer_A``
    value belongs to, so no ``writer_A`` value occurs in more than one frame.
    The ``writer_A`` column is dropped from the returned frames;
    ``feature_set`` itself is not modified.

    Args:
        feature_set: DataFrame with a ``writer_A`` column.
        train_frac: fraction of the distinct writers used for training.
        val_frac: fraction of the distinct writers used for validation; all
            remaining writers form the test set.
        seed: optional seed for a private random generator, giving a
            reproducible split. With ``None`` (the default) NumPy's global
            random state is used, exactly as before this parameter existed.

    Returns:
        tuple: ``(training_set, validation_set, test_set)`` DataFrames.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )

    writers = feature_set["writer_A"].unique()
    if seed is None:
        writers = np.random.permutation(writers)
    else:
        # Private generator: reproducible without touching the global state.
        writers = np.random.RandomState(seed).permutation(writers)

    n_writers = writers.shape[0]
    train_end = int(train_frac * n_writers)
    val_end = train_end + int(val_frac * n_writers)

    def _rows_for(writer_group):
        # Rows whose writer is in the group, without the helper column.
        selected = feature_set.loc[feature_set["writer_A"].isin(writer_group)]
        return selected.drop("writer_A", axis=1)

    training_set = _rows_for(writers[:train_end])
    validation_set = _rows_for(writers[train_end:val_end])
    test_set = _rows_for(writers[val_end:])

    return training_set, validation_set, test_set

In [49]:
# Split by writer_A into train/validation/test; the shuffle uses NumPy's global
# random state, so the split changes between runs unless that state is seeded.
training_set, validation_set, test_set = train_val_test_split(feature_set)

In [51]:
def get_features_and_labels(dataset):
    """Extract the feature matrix and the label vector from ``dataset``.

    Args:
        dataset: DataFrame containing the columns ``f1_A`` ... ``f9_A``,
            ``f1_B`` ... ``f9_B`` and ``target``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a NumPy array with one row per row of
        ``dataset`` and 18 columns ordered ``f1_A..f9_A, f1_B..f9_B``, and
        ``y`` is a NumPy array of the ``target`` values.
    """
    feature_columns = ["f%d_%s" % (idx, label) for label in ["A", "B"] for idx in range(1, 10)]

    # Read from the ``dataset`` argument. The body used to read the
    # notebook-global ``training_set``, so every call returned training data
    # regardless of the frame passed in.
    return np.array(dataset.loc[:, feature_columns]), np.array(dataset["target"])


In [52]:
# Request the NumPy feature matrix (X_*) and label vector (y_*) for each of the three splits.
X_train, y_train = get_features_and_labels(training_set)
X_val, y_val = get_features_and_labels(validation_set)
X_test, y_test = get_features_and_labels(test_set)
# i = 0
# num = X_train.shape[0] // 100

# loss = 0
# while i < X_train.shape[0]:
#     x_tr = X_train[i:i+num]
#     y_tr = y_train[i:i+num]
#     W = np.ones((x_tr.shape[1], 1))
#     prd = np.dot(x_tr, W)
#     h_x = 1 / (1 + np.exp(-prd))

# #     loss += 
#     print(np.sum(-np.log(h_x) * y_tr - np.log(1 - h_x) * (1 - y_tr)))
#     i += num

# print(loss)
# # from scipy.sparse import csr_matrix

# # feature_columns = [ ("f%d_%s" % (idx, label)) for label in ["A", "B"] for idx in range(1, 10) ]

# # X_train, y_train = csr_matrix(training_set.loc[:, feature_columns]), csr_matrix(training_set["target"])


# # W = csr_matrix(np.ones((X_train.shape[1], 1)))

# # prd = np.dot(X_train, W)

# # h_x = 1 / (2 + np.expm1(-prd))


# # # np.dot(X_train)
# # loss = np.sum(-y_train * np.log(h_x) - (1 - y_train) * np.log(1 - h_x))
# # print(loss)