In [1]:
import pandas as pd


In [6]:
# Root folder of the MIND "small" data on the local machine; edit this one
# constant to run the notebook elsewhere.
MIND_SMALL_DIR = r"C:\Users\Rui\Documents\Explainable_AI\ExplainableNRS\dataset\MIND\small"
# The behaviors files are read with header=None, so the column names are supplied here.
BEHAVIOR_COLUMNS = ["impression_id", "user_id", "time", "history", "candidate"]

news_df = pd.read_csv(MIND_SMALL_DIR + r"\news.csv")
test_behavior = pd.read_csv(
    MIND_SMALL_DIR + r"\test\behaviors.tsv", sep="\t", header=None, names=BEHAVIOR_COLUMNS)
valid_behavior = pd.read_csv(
    MIND_SMALL_DIR + r"\valid\behaviors.tsv", sep="\t", header=None, names=BEHAVIOR_COLUMNS)
# The number printed is the size of the *test* split, so the label says so
# (it previously read "valid behavior" while reporting len(test_behavior)).
print("Length of test behavior: ", len(test_behavior))

Length of valid behavior:  36576


In [102]:
# Display the column names of the loaded news table.
news_df.columns

Index(['news_id', 'category', 'subvert', 'title', 'abstract', 'url', 'entity',
       'ab_entity', 'body'],
      dtype='object')

In [3]:
# Lookup table: news_id -> category, built from the news table.
news_topic = {
    news_id: category
    for news_id, category in zip(news_df["news_id"].tolist(), news_df["category"].tolist())
}

In [39]:
def topic_matching(history, candidate, news_topic_mapper):
    """Return 1 if any history item shares a topic with a clicked candidate, else 0.

    Args:
        history: Whitespace-separated news ids. May be blank (for example the
            " " placeholder that ``group_user`` substitutes for missing values).
        candidate: Whitespace-separated ``<news_id>-<label>`` tokens. A token
            counts as clicked when ``int(label)`` is non-zero. May be blank.
        news_topic_mapper: Mapping from news id to topic.

    Returns:
        1 when at least one history id found in ``news_topic_mapper`` has a
        topic that also belongs to a clicked candidate, otherwise 0.

    Raises:
        KeyError: If a clicked candidate id is missing from ``news_topic_mapper``.
        IndexError: If a non-empty candidate token contains no "-".
    """
    clicked_topic = set()
    # split() without an argument drops the empty tokens a blank string would
    # produce; previously those empty tokens caused an IndexError on the label lookup.
    for token in candidate.split():
        parts = token.split("-")
        if int(parts[1]):
            clicked_topic.add(news_topic_mapper[parts[0]])

    for news_id in history.split():
        # Unknown history ids are skipped outright instead of being mapped to ""
        # (which could spuriously match a clicked item whose topic is "").
        if news_id in news_topic_mapper and news_topic_mapper[news_id] in clicked_topic:
            return 1
    return 0

# cold-user, short candidate list, topic-matching
def group_user(behavior_df, news_topic_mapper):
    """Return a copy of ``behavior_df`` with per-impression grouping columns added.

    Missing values are replaced by a single space first. Three columns are then
    added:

    * ``history_count``: number of space-separated items in ``history``
      (0 when the cell is exactly " ").
    * ``candidate_count``: the same count for ``candidate``.
    * ``topic_matching``: result of ``topic_matching`` for the row.

    Args:
        behavior_df: Frame with ``history`` and ``candidate`` columns.
        news_topic_mapper: Mapping passed through to ``topic_matching``.

    Returns:
        The filled frame carrying the three extra columns.
    """
    def _count_items(text):
        # A lone space is the fill value for a missing cell, i.e. no items.
        return 0 if text == " " else len(text.split(" "))

    def _row_topic_match(row):
        return topic_matching(row["history"], row["candidate"], news_topic_mapper)

    filled = behavior_df.fillna(" ")
    filled["history_count"] = filled["history"].apply(_count_items)
    filled["candidate_count"] = filled["candidate"].apply(_count_items)
    filled["topic_matching"] = filled.apply(_row_topic_match, axis=1)
    return filled


In [105]:
from itertools import product
def sample_group_user(behavior_df, sample_num, **kwargs):
    """Sample impressions from one user/candidate group.

    Args:
        behavior_df: Frame with ``topic_matching``, ``history_count`` and
            ``candidate_count`` columns.
        sample_num: Total number of rows requested.
        **kwargs:
            topic_matching: Required value of the ``topic_matching`` column
                (default 1).
            history_count: Iterable of accepted history lengths
                (default ``range(1, 11)``).
            candidate_count: Iterable of accepted candidate-list lengths
                (default ``range(2, 12)``).
            sample_method: ``"random"`` (default) draws ``sample_num`` rows from
                all rows whose counts are in the accepted values. Any other
                value draws ``int(sample_num / number_of_cells)`` rows from
                every (history length, candidate length) cell.
            random_state: Forwarded to ``DataFrame.sample`` for reproducible
                draws (default ``None``, i.e. unseeded).

    Returns:
        DataFrame holding the sampled rows.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a pool holds
            fewer rows than requested.
    """
    behavior_df = behavior_df[behavior_df.topic_matching == kwargs.get("topic_matching", 1)]
    history_count = list(kwargs.get("history_count", range(1, 11)))
    candidate_count = list(kwargs.get("candidate_count", range(2, 12)))
    sample_method = kwargs.get("sample_method", "random")
    random_state = kwargs.get("random_state")

    if sample_method == "random":
        # Membership test rather than a min..max span, so a non-contiguous
        # selection is honoured the same way the per-cell branch below does.
        in_group = (behavior_df.history_count.isin(history_count)
                    & behavior_df.candidate_count.isin(candidate_count))
        return behavior_df[in_group].sample(sample_num, random_state=random_state)

    num = int(sample_num / (len(history_count) * len(candidate_count)))
    sampled_parts = []
    for (h, c) in product(history_count, candidate_count):
        cell_df = behavior_df[(behavior_df.history_count == h) & (behavior_df.candidate_count == c)]
        print(h, c, len(cell_df))
        sampled_parts.append(cell_df.sample(num, random_state=random_state))
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()


In [153]:
import numpy as np
from collections import defaultdict
def load_news_info(news_ids):
    """Collect title, category, subvert and abstract for each news id.

    Rows are looked up in the module-level ``news_df``; when several rows share
    an id, the first one is used.

    Args:
        news_ids: Iterable of news ids, processed in order.

    Returns:
        ``defaultdict(list)`` with the keys ``news_id``, ``title``,
        ``category``, ``subvert`` and ``abstract``. Each value is a list aligned
        with ``news_ids``. All five keys are present even for empty input.

    Raises:
        IndexError: If an id has no row in ``news_df``.
    """
    fields = ("title", "category", "subvert", "abstract")
    news_info = defaultdict(list)
    # Touch every key up front so empty input still yields all five columns.
    for key in ("news_id",) + fields:
        news_info[key]
    for news_id in news_ids:
        news_info["news_id"].append(news_id)
        # Filter the table once per id instead of once per field.
        matches = news_df[news_df["news_id"] == news_id]
        for field in fields:
            news_info[field].append(matches[field].values[0])
    return news_info

def get_history_candidate_info(line, shuffle=False):
    """Look up news details for one impression's history and candidates.

    Args:
        line: Row-like object with ``history`` (whitespace-separated news ids)
            and ``candidate`` (whitespace-separated ``<news_id>-<label>`` tokens).
        shuffle: When true, the candidate tokens are shuffled in place with
            ``np.random.shuffle`` before anything else is derived from them.

    Returns:
        A ``(history_news, candidate_news, cand_label)`` tuple. The first two
        are dicts of the ``load_news_info`` result with keys prefixed by
        ``history_`` / ``candidate_``. ``cand_label`` lists the 0-based
        positions, in the (possibly shuffled) candidate order, whose label is
        non-zero.
    """
    history_news = {}
    for field, values in load_news_info(line["history"].split()).items():
        history_news[f"history_{field}"] = values

    impressions = line["candidate"].split()
    if shuffle:
        np.random.shuffle(impressions)

    # Split every token once; element 0 is the news id, element 1 the click label.
    pairs = [impression.split("-") for impression in impressions]

    candidate_news = {}
    for field, values in load_news_info([pair[0] for pair in pairs]).items():
        candidate_news[f"candidate_{field}"] = values

    cand_label = [position for position, pair in enumerate(pairs) if int(pair[1])]
    return history_news, candidate_news, cand_label


In [160]:
import os
# Number of rows requested per group (passed as the sample size further down).
num_per_group = 100
# Output folder for the per-group CSV files; created if it does not exist yet.
test_group_root = "test_group/"
os.makedirs(test_group_root, exist_ok=True)
# Eight groups: {cold, non-cold} user x {short, long} candidate list x
# {matching, non-matching} topic. Each value holds the keyword arguments
# later unpacked into sample_group_user.
test_groups1 = {
    f"{user}_{length}_candidate_{topic}_topic": {
        "history_count": history_range,
        "candidate_count": candidate_range,
        "topic_matching": flag,
    }
    for length, candidate_range in (("short", range(5, 16)), ("long", range(16, 26)))
    for topic, flag in (("match", 1), ("not_match", 0))
    for user, history_range in (("cold_user", range(1, 6)), ("non_cold_user", range(6, 26)))
}

In [166]:
# Four coarser groups: {cold, non-cold} user x {matching, non-matching} topic,
# each covering candidate-list lengths 5 to 25.
test_groups2 = {
    f"{user}-{topic}_topic": {
        "history_count": history_range,
        "candidate_count": range(5, 26),
        "topic_matching": flag,
    }
    for topic, flag in (("match", 1), ("not_match", 0))
    for user, history_range in (("cold_user", range(1, 6)), ("non_cold_user", range(6, 26)))
}

In [162]:
# Column layout of every exported CSV: the prompt-ready texts first, then the
# raw history fields, then the raw candidate fields.
sample_cols = [
    "impression_id", "history", "candidate", "label",
    "history_news_id", "history_title", "history_category", "history_subvert", "history_abstract",
    "candidate_news_id", "candidate_title", "candidate_category", "candidate_subvert", "candidate_abstract",
]
# Within one group, a candidate position may hold a clicked item at most this many times.
max_num = 10
# Upper bound on reshuffles per row. Without it the loop below never ends once
# every position available to a row has already reached max_num.
max_shuffle_attempts = 1000

# sample_group_user needs the columns that group_user adds. Compute them here
# when they are missing so this cell also works on a fresh kernel.
if {"history_count", "candidate_count", "topic_matching"}.issubset(test_behavior.columns):
    grouped_behavior = test_behavior
else:
    grouped_behavior = group_user(test_behavior, news_topic)

for group_name, kwargs in test_groups1.items():
    print(group_name)
    group_df = sample_group_user(grouped_behavior, num_per_group, **kwargs)
    # Shortest candidate lists first.
    group_df = group_df.fillna(" ").sort_values(by=["candidate_count"])
    sample_rows = []
    cur_num = defaultdict(int)  # candidate position -> times it held a clicked item
    for _, row in group_df.iterrows():
        group_history, group_candidates, group_labels = get_history_candidate_info(row)
        attempts = 0
        # Reshuffle the candidates until no clicked item sits on a saturated position.
        while any(cur_num[label] >= max_num for label in group_labels):
            if attempts >= max_shuffle_attempts:
                print(f"warning: impression {row['impression_id']} kept after "
                      f"{attempts} reshuffles although a position exceeds max_num")
                break
            group_history, group_candidates, group_labels = get_history_candidate_info(row, shuffle=True)
            attempts += 1
        for label in group_labels:
            cur_num[label] += 1

        history_input = "\n".join(f"H{no+1}: {title}" for no, title in enumerate(group_history["history_title"]))
        candidate_input = "\n".join(f"C{no+1}: {title}" for no, title in enumerate(group_candidates["candidate_title"]))
        sample = [row["impression_id"], history_input, candidate_input, ",".join([f"C{index+1}" for index in group_labels])]
        # One newline-joined cell per raw field; non-string values (e.g. NaN) become "".
        for news_fields in (group_history, group_candidates):
            sample.extend("\n".join(v if isinstance(v, str) else "" for v in values) for values in news_fields.values())
        sample_rows.append(sample)
    sample_group = pd.DataFrame(sample_rows, columns=sample_cols)
    sample_group.to_csv(test_group_root + group_name + ".csv", index=False)

cold_user_short_candidate_match_topic
non_cold_user_short_candidate_match_topic
cold_user_short_candidate_not_match_topic
non_cold_user_short_candidate_not_match_topic
cold_user_long_candidate_match_topic
non_cold_user_long_candidate_match_topic
cold_user_long_candidate_not_match_topic
non_cold_user_long_candidate_not_match_topic


In [168]:
# NOTE(review): the sampling cell writes its CSVs to test_group_root
# ("test_group/"), while this cell reads them from test_group/variant1/.
# Presumably the files were moved by hand in between; confirm.
variant1_dir = "test_group/variant1/"
variant2_dir = "test_group/variant2/"
# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(variant2_dir, exist_ok=True)

for group_name in test_groups2:
    # "cold_user-match_topic" -> ["cold_user", "match_topic"]
    names = group_name.split("-")
    # Re-save the short/long files under dash-separated names ...
    short_df = pd.read_csv(variant1_dir + f"{names[0]}_short_candidate_{names[1]}" + ".csv")
    short_df.to_csv(variant1_dir + f"{names[0]}-short_candidate-{names[1]}" + ".csv", index=False)
    long_df = pd.read_csv(variant1_dir + f"{names[0]}_long_candidate_{names[1]}" + ".csv")
    long_df.to_csv(variant1_dir + f"{names[0]}-long_candidate-{names[1]}" + ".csv", index=False)
    # ... and draw num_per_group rows from their union for variant 2.
    all_df = pd.concat([short_df, long_df])
    all_df.sample(num_per_group).to_csv(variant2_dir + f"{names[0]}-{names[1]}" + ".csv", index=False)
            

In [40]:
# Adds history_count, candidate_count and topic_matching to test_behavior.
# The sampling cells earlier in this file filter on these columns, so this
# cell has to be executed before them (its execution count, 40, predates theirs).
test_behavior = group_user(test_behavior, news_topic)

In [41]:
# How many impressions have / do not have a topic match.
test_behavior["topic_matching"].value_counts()

topic_matching
1    28871
0     7705
Name: count, dtype: int64

In [46]:
# Distribution of history length among impressions with a topic match.
test_behavior.loc[test_behavior["topic_matching"] == 1, "history_count"].value_counts()

history_count
6      883
9      815
5      810
7      804
8      785
      ... 
285      1
286      1
214      1
434      1
363      1
Name: count, Length: 279, dtype: int64

In [42]:
# Distribution of history length among impressions without a topic match.
test_behavior.loc[test_behavior["topic_matching"] == 0, "history_count"].value_counts()

history_count
0      1117
3       663
4       623
2       546
5       519
       ... 
92        1
96        1
119       1
108       1
76        1
Name: count, Length: 109, dtype: int64

In [17]:
# Distribution of history length over all test impressions.
test_behavior["history_count"].value_counts()

history_count
4      1354
6      1333
5      1329
3      1301
7      1196
       ... 
285       1
286       1
214       1
434       1
363       1
Name: count, Length: 280, dtype: int64

In [32]:
# Distribution of candidate-list length over all test impressions.
test_behavior["candidate_count"].value_counts()

candidate_count
7      2038
2      1929
11     1299
10     1217
5      1125
       ... 
257       1
285       1
293       1
199       1
249       1
Name: count, Length: 283, dtype: int64

In [18]:
# Share of impressions whose history is empty (history_count == 0).
fresh_user_ratio = int((test_behavior["history_count"] == 0).sum()) / len(test_behavior)
print("Fresh user: ", fresh_user_ratio)

Fresh user:  0.03053915135608049


In [20]:
# Share of impressions with at most cold_num history items
# (this includes the empty-history rows).
cold_num = 5
cold_user_ratio = int((test_behavior["history_count"] <= cold_num).sum()) / len(test_behavior)
print("Cold user: ", cold_user_ratio)

Cold user:  0.17806758530183728


In [21]:
# Share of impressions with at most 10 candidates.
short_candidate_ratio = int((test_behavior["candidate_count"] <= 10).sum()) / len(test_behavior)
print("Short candidate: ", short_candidate_ratio)

Short candidate:  0.25229658792650916


In [43]:
# Load a saved result table; the cells below read its history, candidate and MRR columns.
gpt_4_1000_result = pd.read_csv("generated_data/old/sampled_1000_gpt-4.csv")

In [44]:
def stat_df(df):
    """Add ``history_count`` and ``candidate_count`` columns to ``df``.

    Each count is the number of newline-separated entries in the ``history`` /
    ``candidate`` text of the row. Cells that are not strings (for example the
    NaN that ``read_csv`` produces for an empty field) or that contain only
    whitespace count as 0.

    Args:
        df: Frame with ``history`` and ``candidate`` columns.

    Returns:
        The same ``df`` object; the two columns are added in place.
    """
    def _count_lines(text):
        # Guard against NaN / blank cells, which used to raise AttributeError
        # (NaN has no .split) or be counted as one entry.
        if not isinstance(text, str) or not text.strip():
            return 0
        return len(text.split("\n"))

    df["history_count"] = df["history"].apply(_count_lines)
    df["candidate_count"] = df["candidate"].apply(_count_lines)
    return df
# Add history_count / candidate_count to the GPT-4 result table
# (stat_df writes the columns into the frame it is given and returns it).
gpt_4_1000_result = stat_df(gpt_4_1000_result)

In [77]:
def result_stat(df, history_num, short_candidate_num):
    """Print segment sizes and return the mean MRR of each segment.

    Rows are "cold" when ``history_count <= history_num`` and "non-cold" when
    it is greater. Candidate lists are "short" when
    ``candidate_count <= short_candidate_num`` and "long" when greater.

    Args:
        df: Frame with ``history_count``, ``candidate_count`` and ``MRR`` columns.
        history_num: Largest history length still counted as a cold user.
        short_candidate_num: Largest candidate count still counted as short.

    Returns:
        Dict mapping each segment name to its mean ``MRR`` rounded to three
        decimals.
    """
    is_cold = df.history_count <= history_num
    is_warm = df.history_count > history_num
    is_short = df.candidate_count <= short_candidate_num
    is_long = df.candidate_count > short_candidate_num

    size_report = (
        ("Number of Cold user and short candidate:", is_cold & is_short),
        ("Number of Cold user and long candidate:", is_cold & is_long),
        ("Number of Non-cold user and short candidate:", is_warm & is_short),
        ("Number of Non-cold user and long candidate:", is_warm & is_long),
    )
    for caption, mask in size_report:
        print(caption, int(mask.sum()))
    # Fraction (not count) of rows with a short candidate list.
    print("#Short candidate: ", int(is_short.sum()) / len(df))

    segments = (
        ("cold_user", is_cold),
        ("non_cold_user", is_warm),
        ("short_candidate", is_short),
        ("long_candidate", is_long),
        ("cold_user_short_candidate", is_cold & is_short),
        ("cold_user_long_candidate", is_cold & is_long),
        ("non_cold_user_short_candidate", is_warm & is_short),
        ("non_cold_user_long_candidate", is_warm & is_long),
    )
    mrr = {}
    for segment_name, mask in segments:
        mrr[segment_name] = df.loc[mask, "MRR"].mean().round(3)
    return mrr

In [78]:
# Thresholds passed to result_stat: candidate lists with at most
# max_candidate_num entries are "short", and users with at most
# max_history_num history entries are "cold".
max_candidate_num = 10
max_history_num = 5
print("GPT-4 1000 sampled result: ", result_stat(gpt_4_1000_result, max_history_num, max_candidate_num))

Number of Cold user and short candidate: 50
Number of Cold user and long candidate: 112
Number of Non-cold user and short candidate: 203
Number of Non-cold user and long candidate: 635
#Short candidate:  0.253
GPT-4 1000 sampled result:  {'cold_user': 0.289, 'non_cold_user': 0.312, 'short_candidate': 0.56, 'long_candidate': 0.223, 'cold_user_short_candidate': 0.493, 'cold_user_long_candidate': 0.198, 'non_cold_user_short_candidate': 0.577, 'non_cold_user_long_candidate': 0.227}


In [80]:
# Template-4 GPT-3.5 run: load it, expose its mean_mrr column under the name
# MRR that result_stat reads, then print the per-segment summary.
gpt_3_5_1000_t4_result = pd.read_csv(
    "generated_data/template-4_gpt-3.5-turbo_order_temperature-0.csv"
).rename(columns={"mean_mrr": "MRR"})
print(
    "GPT-3.5 1000 sampled result: ",
    result_stat(stat_df(gpt_3_5_1000_t4_result), max_history_num, max_candidate_num),
)

Number of Cold user and short candidate: 50
Number of Cold user and long candidate: 112
Number of Non-cold user and short candidate: 203
Number of Non-cold user and long candidate: 635
#Short candidate:  0.253
GPT-3.5 1000 sampled result:  {'cold_user': 0.281, 'non_cold_user': 0.262, 'short_candidate': 0.521, 'long_candidate': 0.178, 'cold_user_short_candidate': 0.523, 'cold_user_long_candidate': 0.173, 'non_cold_user_short_candidate': 0.521, 'non_cold_user_long_candidate': 0.179}


In [81]:
# Earlier GPT-3.5 run (file under generated_data/old): same treatment as
# above, with mean_mrr renamed to MRR before the per-segment summary.
gpt_3_5_1000_result = pd.read_csv(
    "generated_data/old/sampled_1000_gpt-3.5-turbo.csv"
).rename(columns={"mean_mrr": "MRR"})
print(
    "GPT-3.5 1000 sampled result: ",
    result_stat(stat_df(gpt_3_5_1000_result), max_history_num, max_candidate_num),
)

Number of Cold user and short candidate: 50
Number of Cold user and long candidate: 112
Number of Non-cold user and short candidate: 203
Number of Non-cold user and long candidate: 635
#Short candidate:  0.253
GPT-3.5 1000 sampled result:  {'cold_user': 0.229, 'non_cold_user': 0.224, 'short_candidate': 0.408, 'long_candidate': 0.169, 'cold_user_short_candidate': 0.467, 'cold_user_long_candidate': 0.144, 'non_cold_user_short_candidate': 0.397, 'non_cold_user_long_candidate': 0.173}
