In [None]:
# Path of the YAML experiment configuration that drives this notebook.
CONFIG_PATH = "configs/small_sample_sub_minority.yml"
from utils import get_config, join_sets, load_model, print_log, save_model, load_model_dict
# Load the configuration; "_curation" is passed as the second positional argument of get_config
# (presumably a suffix/variant selector -- confirm against utils.get_config).
config = get_config(CONFIG_PATH, "_curation", print_config = False)
# Minimum number of train votes a user needs inside a subreddit to count as an "active" user there.
active_user_votes_thres = config["active_user_votes_thres"]
# Batch size handed to the vote-prediction helper; note that a later cell rebinds `batch_size` to 1024.
batch_size = config["batch_size"]

In [None]:
from collections import Counter, defaultdict
import os
import random
import shutil
import numpy as np
import pandas as pd
from superdebug import debug
import torch
from process_data import get_model_input
from model import get_best_model
from matplotlib import pyplot as plt
from venn import venn, pseudovenn
import time
import re


##### Preprocess

Load model and data

In [None]:
# Load the prepared train/test data together with feature metadata.
# NOTE(review): the exact schema of each returned value is defined in process_data.get_model_input -- confirm there.
target, original_feature_map, categorical_features, string_features, train_data, test_data, test_data_info, train_submission_upvote_df, num_all_users = get_model_input(config)
# Bundle forwarded to evaluate_model(..., extra_input=...) when predicting votes.
extra_input = (categorical_features, string_features, target)
model, token_embedding = get_best_model(config, categorical_features, string_features, original_feature_map)
# Inference only: switch the model to evaluation mode.
model.eval()
# User ids 0..num_all_users inclusive (presumably users are integer-encoded -- verify against process_data).
all_users = list(range(num_all_users + 1))

[1;33m------------------[0m[1;31m 2022-08-08 07:15:01 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/process_data.py:349 get_model_input[0m[1;33m[0m
[1;33m[0m[1;36mLoading prepared data...[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:15:01 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:15:04 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['original_token_num'], at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/model.py:248 get_tokenizer[0m[1;33m[0m
[1;33m0 / 1.[0m [1;33m [0m[1;36moriginal_token_num[0m[1;33m num val: 30522[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:15:04 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:15:05 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['latest_token_num'], at 

Some weights of the model checkpoint at prajjwal1/bert-mini were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1;33m------------------[0m[1;31m 2022-08-08 07:15:06 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:110 load_model[0m[1;33m[0m
[1;33m[0m[1;36mLoading best model...[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:15:06 [0m[1;33m------------------[0m


Collect submissions and active users in different subreddits

In [None]:
def get_subreddits_submissions(train_data:pd.DataFrame, test_data:pd.DataFrame, user_votes_thres = 0):
    """Index votes, active users and unique submissions by subreddit.

    Args:
        train_data: vote rows with at least SUBREDDIT, USERNAME and SUBMISSION_ID columns.
        test_data: vote rows with at least SUBREDDIT and SUBMISSION_ID columns.
        user_votes_thres: minimum number of train votes inside a subreddit for a user
            to be kept as an active user of that subreddit.

    Returns:
        A 5-tuple:
        - subreddit_votes_counter: Counter of train votes per subreddit.
        - subreddit_active_users: subreddit -> set of users with >= user_votes_thres train votes.
        - subreddit_train_submissions: subreddit -> {submission_id: first train row seen}.
        - subreddit_test_submissions: subreddit -> {submission_id: first test row seen}.
        - all_submissions: submission_id -> row, filled from train rows first and then test rows
          (a later registration overwrites an earlier one for the same id).
    """
    subreddit_votes_counter = Counter()
    subreddit_active_users = defaultdict(Counter)
    subreddit_train_submissions = defaultdict(dict)
    subreddit_test_submissions = defaultdict(dict)
    all_submissions = {}

    def _register_first_occurrence(per_subreddit, record):
        # Keep only the first row seen for each (subreddit, submission) pair.
        seen_in_subreddit = per_subreddit[record["SUBREDDIT"]]
        submission_id = record["SUBMISSION_ID"]
        if submission_id in seen_in_subreddit:
            return
        seen_in_subreddit[submission_id] = record
        all_submissions[submission_id] = record

    for _, vote_row in train_data.iterrows():
        subreddit = vote_row["SUBREDDIT"]
        subreddit_votes_counter[subreddit] += 1
        subreddit_active_users[subreddit][vote_row["USERNAME"]] += 1
        _register_first_occurrence(subreddit_train_submissions, vote_row)

    # Replace each per-user vote Counter by the set of users that reach the threshold.
    for subreddit, per_user_counts in list(subreddit_active_users.items()):
        subreddit_active_users[subreddit] = {
            user for user, n_votes in per_user_counts.items() if n_votes >= user_votes_thres
        }

    for _, vote_row in test_data.iterrows():
        _register_first_occurrence(subreddit_test_submissions, vote_row)

    return subreddit_votes_counter, subreddit_active_users, subreddit_train_submissions, subreddit_test_submissions, all_submissions

# subreddit_votes_counter, subreddit_active_users and subreddit_train_submissions are derived from train_data;
# subreddit_test_submissions is derived from test_data; all_submissions merges both.
subreddit_votes_counter, subreddit_active_users, subreddit_train_submissions, subreddit_test_submissions, all_submissions = get_subreddits_submissions(train_data, test_data, user_votes_thres = active_user_votes_thres)

Record existing votes

So that we can use them to substitute the predicted votes

In [None]:
def record_existing_votes(train_data:pd.DataFrame):
    """Summarise the observed votes in `train_data`.

    Args:
        train_data: frame with USERNAME, SUBMISSION_ID and VOTE columns.

    Returns:
        A 4-tuple:
        - existing_votes: "<username>-<submission_id>" -> vote value (last row wins).
        - existing_user_votes: Counter of number of votes per user.
        - existing_user_updown_votes: user -> Counter keyed by vote value.
        - existing_submission_votes: submission_id -> Counter keyed by vote value.
    """
    existing_votes = {}
    existing_user_votes = Counter()
    existing_user_updown_votes = defaultdict(Counter)
    existing_submission_votes = defaultdict(Counter)

    vote_triples = zip(
        train_data["USERNAME"].to_list(),
        train_data["SUBMISSION_ID"].to_list(),
        train_data["VOTE"].to_list(),
    )
    for voter, submission_id, vote_value in vote_triples:
        existing_votes[f'{voter}-{submission_id}'] = vote_value
        existing_user_updown_votes[voter][vote_value] += 1
        existing_user_votes[voter] += 1
        existing_submission_votes[submission_id][vote_value] += 1

    return existing_votes, existing_user_votes, existing_user_updown_votes, existing_submission_votes
# Ground-truth votes from the training split; existing_votes is keyed by "<username>-<submission_id>".
existing_votes, existing_user_votes, existing_user_updown_votes, existing_submission_votes = record_existing_votes(train_data)


#### Predict votes for all the users on all submissions

Define required functions

In [None]:
from typing import Union
# from tqdm import tqdm
from tqdm.notebook import tqdm
from train import evaluate_model
from model import get_tokenizer

def convert_group_users_subreddit_submissions_data(group_users:Union[set,list], unique_submissions:dict):
    """Build the (user x submission) cross product as model input rows.

    Every submission row is repeated once per user in `group_users`, with the
    USERNAME column overwritten by that user.

    Args:
        group_users: iterable of user ids.
        unique_submissions: either a mapping submission_id -> row (any dict subclass),
            or an already-built DataFrame with one row per submission.

    Returns:
        A DataFrame with len(group_users) * len(unique_submissions) rows. When
        `group_users` is empty an empty frame with the same columns (plus USERNAME)
        is returned instead of letting pd.concat fail on an empty list.
    """
    # isinstance also accepts dict subclasses (OrderedDict, defaultdict, ...).
    if isinstance(unique_submissions, dict):
        unique_submissions = pd.DataFrame(list(unique_submissions.values()))

    per_user_frames = []
    for user in tqdm(group_users):
        submissions = unique_submissions.copy(deep=True)
        # It doesn't matter whether the user itself is in UPVOTED_USERS / DOWNVOTED_USERS:
        # predicted votes are later substituted with real votes where available.
        submissions["USERNAME"] = [user] * len(submissions)
        per_user_frames.append(submissions)

    if not per_user_frames:
        empty = unique_submissions.iloc[0:0].copy()
        empty["USERNAME"] = []
        return empty

    return pd.concat(per_user_frames, axis=0)
def predict_group_users_submissions_votes(model, group_users_submissions_data, batch_size):
    """Run the model over every row of `group_users_submissions_data` and return its predictions.

    Args:
        model: model passed through to evaluate_model.
        group_users_submissions_data: rows to score (one per user/submission pair).
        batch_size: prediction batch size; falls back to config["batch_size"] when None.

    Returns:
        Whatever evaluate_model returns for ret="prediction"
        (an ndarray of shape (num_rows, 1) according to the original author's note).
    """
    # Honour the caller's batch size; previously this argument was silently ignored.
    effective_batch_size = config["batch_size"] if batch_size is None else batch_size
    return evaluate_model(config, model, data=group_users_submissions_data, weights = None, batch_size=effective_batch_size, sample_voted_users=False, extra_input = extra_input, ret = "prediction")
# Sentinel: stays None unless the "predict_all_submissions" cell fills it in.
# Later cells branch on `is None` / `is not None` to choose how user representations are built.
pred_all_user_submission_vote_score_matrix = None

# model.device = "cuda:0"
def get_group_user_submission_vote_score_matrix(predicted_group_users_submissions_votes:np.ndarray, group_users, group_users_submissions_data:pd.DataFrame, existing_votes, existing_pred_user_submission_vote_score_matrix = None):
    """Turn per-row predictions into user x submission score and vote matrices.

    Args:
        predicted_group_users_submissions_votes: array indexed as [row_i, 0], aligned with the
            rows of `group_users_submissions_data`. Not read when an existing score matrix is given.
        group_users: collection of integer user ids; max(group_users) + 1 rows are allocated.
        group_users_submissions_data: frame with SUBMISSION_ID and USERNAME columns.
        existing_votes: mapping "<username>-<submission_id>" -> observed vote; observed votes
            override predicted ones in the vote matrix and in the per-submission tallies.
        existing_pred_user_submission_vote_score_matrix: optional DataFrame (rows = users,
            columns = submission ids) of previously computed scores to reuse as-is.

    Returns:
        A 4-tuple:
        - each_submission_votes: submission_id -> [n_downvotes, n_upvotes, upvote_rate].
        - each_user_confidence: username -> mean |score - 0.5| over that user's rows.
        - pred_user_submission_vote_score_matrix: DataFrame of scores (-1 where not filled).
        - pred_user_submission_vote_matrix: DataFrame of 0/1 votes (-1 where not in data).
    """
    reuse_existing_scores = existing_pred_user_submission_vote_score_matrix is not None
    # Computed unconditionally: the vote matrix below needs it even when the scores are reused.
    all_sub_ids = group_users_submissions_data["SUBMISSION_ID"].unique()
    num_user_rows = max(group_users) + 1

    if reuse_existing_scores:
        pred_user_submission_vote_score_matrix = existing_pred_user_submission_vote_score_matrix
    else:
        pred_user_submission_vote_score_matrix = pd.DataFrame(- np.ones([num_user_rows, len(all_sub_ids)], dtype = float), columns=all_sub_ids)
    # use ground truth vote if available, -1 for not in data
    pred_user_submission_vote_matrix = pd.DataFrame(- np.ones([num_user_rows, len(all_sub_ids)], dtype = int), columns=all_sub_ids)

    each_submission_votes = {}
    each_user_confidence = defaultdict(list)
    submission_ids = group_users_submissions_data["SUBMISSION_ID"].to_numpy()
    usernames = group_users_submissions_data["USERNAME"].to_numpy()
    for row_i in tqdm(range(len(group_users_submissions_data))):
        submission_id = submission_ids[row_i]
        username = usernames[row_i]
        # `.at[row, col]` addresses a single cell. Plain `df[row, col]` would treat the tuple
        # as a column label (KeyError on read, a brand-new column on write).
        if reuse_existing_scores:
            vote_score = existing_pred_user_submission_vote_score_matrix.at[username, submission_id]
        else:
            vote_score = predicted_group_users_submissions_votes[row_i, 0]
            pred_user_submission_vote_score_matrix.at[username, submission_id] = vote_score

        vote_key = f'{username}-{submission_id}'
        if vote_key not in existing_votes:
            vote = int(vote_score >= 0.5)
        else: # use existing votes if available
            vote = int(existing_votes[vote_key] >= 0.5)

        if submission_id not in each_submission_votes:
            each_submission_votes[submission_id] = [0, 0]
        each_user_confidence[username].append(abs(vote_score - 0.5))
        pred_user_submission_vote_matrix.at[username, submission_id] = vote
        each_submission_votes[submission_id][vote] += 1
    assert (pred_user_submission_vote_matrix.to_numpy() != -1).sum() > 0

    # analyze user confidence
    for username in each_user_confidence:
        each_user_confidence[username] = float(np.mean(each_user_confidence[username]))

    # calculate %upvotes for each submission (every listed submission has at least one vote)
    for submission_id in each_submission_votes:
        n_down, n_up = each_submission_votes[submission_id]
        each_submission_votes[submission_id].append(n_up / (n_down + n_up)) # %upvotes
    return each_submission_votes, each_user_confidence, pred_user_submission_vote_score_matrix, pred_user_submission_vote_matrix
def get_group_users_preferred_submissions(each_submission_votes:dict, thres = 0.9):
    """Rank submissions by their last tally entry and pick those at or above `thres`.

    Args:
        each_submission_votes: submission_id -> sequence whose last element is the ranking
            metric (e.g. the upvote rate).
        thres: minimum metric value for a submission to count as preferred.

    Returns:
        (group_preferred_submissions, group_submissions_ranking): the set of preferred
        submission ids, and all submission ids sorted by metric in descending order.
    """
    def _metric(submission_id):
        return each_submission_votes[submission_id][-1]

    # sorted(..., reverse=True) is stable, so ties keep their original relative order.
    group_submissions_ranking = sorted(each_submission_votes.keys(), key=_metric, reverse=True)
    group_preferred_submissions = {
        submission_id for submission_id in group_submissions_ranking if _metric(submission_id) >= thres
    }
    return group_preferred_submissions, group_submissions_ranking

def get_group_users_real_vote(group_users:Union[set,list], unique_submissions:dict, existing_votes, metric = "upvote_rate"):
    """Tally only the observed (ground-truth) votes of `group_users` on `unique_submissions`.

    Args:
        group_users: collection of integer user ids; max(group_users) + 1 rows are allocated.
        unique_submissions: mapping whose keys are the submission ids to consider.
        existing_votes: mapping "<username>-<submission_id>" -> observed vote.
        metric: "upvote_rate" (upvotes / total, or -1 when nobody voted) or
            "#upvote-#downvote" (upvotes minus downvotes).

    Returns:
        A 4-tuple:
        - each_submission_votes: submission_id -> [n_downvotes, n_upvotes, metric_value].
        - each_user_confidence: user -> [0.5] (constant placeholder, kept as a list as before).
        - pred_user_submission_vote_score_matrix: DataFrame holding the 0/1 vote as a float score
          (-1 where no vote exists).
        - pred_user_submission_vote_matrix: DataFrame of 0/1 votes (-1 where no vote exists).

    Raises:
        ValueError: if `metric` is not one of the supported names (previously this surfaced as
            an unbound or stale `metric_res`).
    """
    all_sub_ids = list(unique_submissions.keys())
    num_user_rows = max(group_users) + 1
    pred_user_submission_vote_score_matrix = pd.DataFrame(- np.ones([num_user_rows, len(all_sub_ids)], dtype = float), columns=all_sub_ids)
    pred_user_submission_vote_matrix = pd.DataFrame(- np.ones([num_user_rows, len(all_sub_ids)], dtype = int), columns=all_sub_ids)
    each_submission_votes = {submission_id: [0, 0] for submission_id in unique_submissions}

    for username in group_users:
        for submission_id in unique_submissions:
            vote_key = f'{username}-{submission_id}'
            if vote_key in existing_votes:
                vote = int(existing_votes[vote_key] >= 0.5)
                pred_user_submission_vote_matrix.at[username, submission_id] = vote
                # `.at` writes one cell; `df[username, submission_id] = ...` would add a tuple-named column.
                pred_user_submission_vote_score_matrix.at[username, submission_id] = vote
                each_submission_votes[submission_id][vote] += 1

    each_user_confidence = {user: [0.5] for user in group_users}

    # calculate the ranking metric for each submission
    for submission_id, tally in each_submission_votes.items():
        n_down, n_up = tally
        if metric == "upvote_rate":
            total = n_down + n_up
            metric_res = -1 if total == 0 else n_up / total
        elif metric == "#upvote-#downvote":
            metric_res = n_up - n_down
        else:
            raise ValueError(f"Unknown metric: {metric!r}")
        tally.append(metric_res)

    return each_submission_votes, each_user_confidence, pred_user_submission_vote_score_matrix, pred_user_submission_vote_matrix

Convert data to model input, then run model to make predictions, and obtain prediction score matrix and vote matrix. Note that we use actual votes to replace predicted votes when available.

_!!! This process can be time-consuming and may need more than 200 GB of memory for a medium-sized dataset. It is not necessary if we use user_embedding to cluster active users._

In [None]:
if config["user_grouping_method"] == "predict_all_submissions":
    # Convert data to model input: one row per (user, submission) pair.
    # TODO: not all users
    all_users_submissions_data = convert_group_users_subreddit_submissions_data(all_users, all_submissions)

    # run model to make predictions
    model.to(model.device)
    predicted_all_users_submissions_votes = predict_group_users_submissions_votes(model, all_users_submissions_data, batch_size)
    debug(predicted_all_users_submissions_votes=predicted_all_users_submissions_votes)

    # Persist the (expensive) predictions. The context manager guarantees the file is flushed and
    # closed, and the directory is created up front so a missing folder cannot discard the results.
    import pickle
    os.makedirs("output", exist_ok=True)
    with open("output/predicted_all_users_submissions_votes.pt", "wb") as predictions_file:
        pickle.dump(predicted_all_users_submissions_votes, predictions_file)
    # To reuse a previous run instead of predicting again (only unpickle files you produced yourself):
    # with open("output/predicted_all_users_submissions_votes.pt", "rb") as predictions_file:
    #     predicted_all_users_submissions_votes = pickle.load(predictions_file)

    # Obtain prediction score matrix and vote matrix. We use actual votes to replace predicted votes when available
    all_submission_votes, all_users_confidence, pred_all_user_submission_vote_score_matrix, pred_all_user_submission_vote_matrix = get_group_user_submission_vote_score_matrix(predicted_all_users_submissions_votes, all_users, all_users_submissions_data, existing_votes)

    # all_users_preferred_submissions, all_preferred_submissions_ranking = get_group_users_preferred_submissions(all_submission_votes, thres = config["upvote_downvote_ratio_thres"])

Calculate Pearson correlation between users

In [None]:
if config["user_grouping_method"] == "predict_all_submissions":
    # Report which submission columns of the vote matrix still hold the -1 "not in data" marker.
    debug((pred_all_user_submission_vote_matrix==-1).any())
    debug(pred_all_user_submission_vote_score_matrix=pred_all_user_submission_vote_score_matrix, pred_all_user_submission_vote_matrix=pred_all_user_submission_vote_matrix)
    # np.corrcoef treats each row as one variable by default, and rows of the score matrix are users,
    # so this yields a user-by-user correlation matrix. The original author's run gave shape (697, 697).
    # NOTE(review): rows with constant scores have zero variance and would produce NaN entries -- confirm this is acceptable.
    vote_score_pearson_corr = np.corrcoef(pred_all_user_submission_vote_score_matrix) # (697, 697)
    debug(vote_score_pearson_corr=vote_score_pearson_corr)

#### Perform curation on a subreddit

Select a subreddit from the most popular subreddits

In [None]:
# Offer the 20 subreddits with the most train votes and let the user pick one interactively.
common_subreddits_counts = subreddit_votes_counter.most_common(20)
prompt = []
for subreddit_id, vote_counts in common_subreddits_counts:
    # When SUBREDDIT has an entry in original_feature_map, show the mapped original name next to the id.
    subreddit_name_str = (original_feature_map['SUBREDDIT'][subreddit_id] + ', ') if 'SUBREDDIT' in original_feature_map else ''
    prompt.append(f"Subreddit {subreddit_id}: {subreddit_name_str}{vote_counts} votes")
prompt = "\n".join(prompt)
# input() returns a string; it is only converted to an int when subreddits are id-encoded.
a_subreddit = input(f"{prompt}\nSelect a subreddit: ")
if 'SUBREDDIT' in original_feature_map:
    a_subreddit = int(a_subreddit)
    subreddit_name_str = (f" ({original_feature_map['SUBREDDIT'][a_subreddit]})")
else:
    subreddit_name_str =  ''
print_log(config["log_path"], f"Selected subreddit: {a_subreddit}{subreddit_name_str}")
# Users with at least `active_user_votes_thres` train votes in the chosen subreddit.
a_subreddit_active_users:set = subreddit_active_users[a_subreddit]
print_log(config["log_path"], f"In train data, subreddit {a_subreddit} have {len(a_subreddit_active_users)} active users (who votes >= {active_user_votes_thres} times), {subreddit_votes_counter[a_subreddit]} votes and {len(subreddit_train_submissions[a_subreddit])} unique submissions. In test data, subreddit {a_subreddit} have {len(subreddit_test_submissions[a_subreddit])} unique submissions.") 

[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mSelected subreddit: r/funny[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mIn train data, subreddit r/funny have 47 active users (who votes >= 5 times), 4722 votes and 1539 unique submissions. In test data, subreddit r/funny have 604 unique submissions.[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m


In [None]:
# Strategy used to form user groups. The helpers below test it with substring checks
# ("manual", "neural", "votes", "single_user_as_group", "all_user_as_group", "none").
user_grouping_method = config["user_grouping_method"]
# user_grouping_method =  "none" # "manual" #TODO: change this
# Optional hand-picked groups (group name -> set of user ids); consumed by get_user_groups when the method contains "manual".
manual_user_groups = config["manual_user_groups"]
# manual_user_groups = {"Conservative": {66, 39, 10, 44, 16, 60}, "Democratic":{0, 65, 64, 37, 49, 52, 20, 22, 23, 26, 29}}
debug(user_grouping_method=user_grouping_method)


[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['user_grouping_method'], at [0m[1;32m<ipython-input-10-c7ded1d0119f>:5 <module>[0m[1;33m[0m
[1;33m0 / 3.[0m [1;33m [0m[1;36muser_grouping_method[0m[1;33m str len 5: votes[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:03 [0m[1;33m------------------[0m


In [None]:
import sklearn.decomposition

def get_bool_vec(selected_ids, vec_size):
    """Return a boolean tensor of length `vec_size` that is True at every index in `selected_ids`."""
    membership_mask = torch.zeros(vec_size, dtype = torch.bool)
    for position in selected_ids:
        membership_mask[position] = True
    return membership_mask

def get_user_reps(selected_users, all_user_embedding, train_data:pd.DataFrame = None, selected_submissions = None, user_grouping_method = "neural"):
    """Build one representation vector per selected user.

    Args:
        selected_users: collection of integer user ids to keep.
        all_user_embedding: 2-D tensor indexed by user id on the first axis; only its first
            dimension is used when the method contains "votes".
        train_data: vote rows (USERNAME, SUBMISSION_ID, VOTE); required for the "votes" method.
        selected_submissions: mapping whose keys are submission ids; required for the "votes" method.
        user_grouping_method: contains "neural" -> slice rows of `all_user_embedding`;
            contains "votes" -> +1/-1/0 vote vectors reduced with PCA (95% explained variance);
            anything else -> no representation.

    Returns:
        (selected_users_reps, selected_user_i_user_map) where the map goes from the row position
        in `selected_users_reps` to the original user id (ascending user id order), and
        `selected_users_reps` is None when the method matches neither "neural" nor "votes".
    """
    assert all_user_embedding is not None
    num_all_users = all_user_embedding.shape[0]
    selected_users_bool_vec = get_bool_vec(selected_users, num_all_users)

    # Row position inside the compacted representation -> original user id.
    selected_user_i_user_map = dict(enumerate(
        user for user, in_subreddit in enumerate(selected_users_bool_vec) if in_subreddit
    ))

    if "neural" in user_grouping_method:
        return all_user_embedding[selected_users_bool_vec, :], selected_user_i_user_map
    if "votes" not in user_grouping_method:
        return None, selected_user_i_user_map

    assert train_data is not None and selected_submissions is not None
    submission_column = {sub: sub_i for sub_i, sub in enumerate(selected_submissions.keys())}
    users_reps = torch.zeros([num_all_users, len(selected_submissions)])
    for _, vote_row in train_data.iterrows():
        voter = vote_row["USERNAME"]
        submission_id = vote_row["SUBMISSION_ID"]
        if voter in selected_users and submission_id in selected_submissions:
            # +1 for an upvote, -1 for anything else; untouched cells stay 0 (no vote).
            users_reps[voter, submission_column[submission_id]] = 1 if vote_row["VOTE"] == 1 else -1

    selected_users_reps = users_reps[selected_users_bool_vec, :]
    # Squared entries sum to the number of votes cast; every selected user must have at least one.
    users_vote_sum = selected_users_reps.pow(2).sum(axis = -1, keepdim= True)
    assert (users_vote_sum != 0).all()
    debug(selected_users_reps_before_PCA=selected_users_reps.shape)
    pca_solver = sklearn.decomposition.PCA(n_components=0.95)
    selected_users_reps = pca_solver.fit_transform(selected_users_reps)
    debug(selected_users_reps_after_PCA = selected_users_reps.shape)

    return selected_users_reps, selected_user_i_user_map


##### Obtain representations for active users

User representations are used to cluster users into groups when `user_grouping_method` contains "`neural`" or "`votes`".

Either use this one... (clustering using vote prediction score on submissions in this subreddit, make sure `pred_all_user_submission_vote_score_matrix` is available)

In [None]:
if pred_all_user_submission_vote_score_matrix is not None:
    # Represent each active user by the predicted vote scores on this subreddit's test submissions.
    debug(pred_all_user_submission_vote_score_matrix=pred_all_user_submission_vote_score_matrix)
    subreddit_submissions_bool_vec = get_bool_vec(subreddit_test_submissions[a_subreddit].keys(), pred_all_user_submission_vote_score_matrix.shape[1])
    # NOTE(review): the `[:, mask]` indexing below and get_bool_vec's positional writes assume the score
    # matrix is a tensor/ndarray whose column positions equal the submission ids. The matrix built by
    # get_group_user_submission_vote_score_matrix is a pandas DataFrame labelled by submission id --
    # confirm and convert (e.g. select columns by label, then wrap in torch.tensor) before relying on this branch.
    # get_user_reps takes `user_grouping_method`; the former `method=` keyword raised a TypeError.
    a_subreddit_active_users_reps, a_subreddit_active_user_i_user_map = get_user_reps(a_subreddit_active_users, all_user_embedding=pred_all_user_submission_vote_score_matrix[:, subreddit_submissions_bool_vec], train_data=train_data, selected_submissions = subreddit_train_submissions[a_subreddit], user_grouping_method = user_grouping_method)

Or this one... (cluster using user_embedding or sparse actual votes)

In [None]:
if pred_all_user_submission_vote_score_matrix is None:
    # Each user id i is looked up under the tokenizer token "USERNAME_<i>".
    all_username_tokens = [f"USERNAME_{user_i}" for user_i in all_users]
    all_username_token_ids = torch.tensor(model.tokenizer.convert_tokens_to_ids(all_username_tokens))
    # Put the ids and the model on the same device before the embedding lookup.
    all_username_token_ids = all_username_token_ids.to(model.device); model = model.to(model.device)
    with torch.no_grad():
        # Read the users' rows straight out of the language-model encoder's word-embedding table (no forward pass).
        user_embedding = model.lm_encoder.embeddings.word_embeddings(all_username_token_ids)
    # debug(all_username_tokens=all_username_tokens, all_username_token_ids=all_username_token_ids, user_embedding=user_embedding)
    # Depending on user_grouping_method, get_user_reps either slices user_embedding ("neural") or only
    # uses its first dimension as the number of users and builds vote vectors from train_data ("votes").
    a_subreddit_active_users_reps, a_subreddit_active_user_i_user_map = get_user_reps(a_subreddit_active_users, all_user_embedding=user_embedding, train_data=train_data, selected_submissions = subreddit_train_submissions[a_subreddit], user_grouping_method = user_grouping_method)
    debug(a_subreddit_active_users_reps=a_subreddit_active_users_reps)

[1;33m------------------[0m[1;31m 2022-08-08 07:16:22 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['selected_users_reps_before_PCA'], at [0m[1;32m<ipython-input-11-ebd605a469f0>:36 get_user_reps[0m[1;33m[0m
[1;33m0 / 4.[0m [1;33m [0m[1;36mselected_users_reps_before_PCA[0m[1;33m torch.Size with val:  torch.Size([47, 1539])[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:22 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:22 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['selected_users_reps_after_PCA'], at [0m[1;32m<ipython-input-11-ebd605a469f0>:39 get_user_reps[0m[1;33m[0m
[1;33m0 / 5.[0m [1;33m [0m[1;36mselected_users_reps_after_PCA[0m[1;33m tuple size: 2[0m[1;33m (..)[0m
[1;33m    [0m[1;36mitem 0: [0m[1;33m num val: 47[0m
[1;33m    [0m[1;36mitem 1: [0m[1;33m num val: 41[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:22

##### Cluster active users into multiple groups

In [None]:
def get_user_groups(selected_users_reps, selected_user_i_user_map:dict, user_grouping_method = "neural", existing_user_votes=None, manual_user_groups=None, train_data=None):
    """Partition users into groups according to `user_grouping_method`.

    The method string is matched by substring, so several strategies can be combined:
    - "manual": use `manual_user_groups` if given, otherwise interactively ask (20 random
      candidates drawn from `train_data`) which users to include, one single-user group each.
    - "neural" / "votes": K-means over `selected_users_reps` with roughly 5 users per cluster.
    - "single_user_as_group": one group per user, restricted to the 10 least and 10 most
      active users (by `existing_user_votes`) when there are more than 20 users.
    - "all_user_as_group": one extra group holding all (remaining) users, ordered by activity.
    - "none": one extra empty group.

    Args:
        selected_users_reps: array-like of shape (num_users, dim); only read for "neural"/"votes".
        selected_user_i_user_map: row position in `selected_users_reps` -> user id.
        user_grouping_method: strategy string, see above.
        existing_user_votes: mapping user -> number of votes; required for the
            "single_user_as_group" and "all_user_as_group" strategies.
        manual_user_groups: optional mapping group name -> collection of users.
        train_data: frame with USERNAME, VOTE and SUBMISSION_TEXT; used by the interactive
            manual strategy only.

    Returns:
        (users_in_groups, group_centers): a defaultdict(set) mapping group key -> users, and the
        K-means cluster centers (None unless clustering ran).
    """
    group_centers = None
    users_in_groups = defaultdict(set)

    if "manual" in user_grouping_method:
        if manual_user_groups is not None:
            users_in_groups.update(manual_user_groups)
        else:
            # Collect every user's submission texts per vote value so a human can judge them.
            user_preferences = defaultdict(dict)
            usernames = train_data["USERNAME"].to_list()
            votes = train_data["VOTE"].to_list()
            submissions_text = train_data["SUBMISSION_TEXT"].to_list()
            for username, vote, submission_text in zip(usernames, votes, submissions_text):
                user_preferences[username].setdefault(vote, []).append(submission_text)
            candidate_users = list(user_preferences.keys())
            for _ in range(20):
                possible_user = random.choice(candidate_users)
                if len(input(f"Include user {possible_user}? \n{user_preferences[possible_user]}")) > 0:
                    # Store a set (not the bare user id) so every group value is a collection of users.
                    users_in_groups[len(users_in_groups)] = {possible_user}

    if "neural" in user_grouping_method or "votes" in user_grouping_method:
        # TODO: change how many users in a group. At least one cluster is required by KMeans,
        # which matters when fewer than 5 users are selected.
        n_groups = max(1, int(len(selected_user_i_user_map) / 5))
        debug(num_selected_users = len(selected_user_i_user_map), n_groups=n_groups) # n_groups: 118
        debug("Begin grouping...")
        from sklearn.cluster import KMeans
        grouping = KMeans(n_clusters = n_groups, random_state = 42, verbose = 0).fit(selected_users_reps)
        group_centers = grouping.cluster_centers_
        # Alternatives tried by the original author (not active):
        #   AgglomerativeClustering(linkage = "complete").fit(selected_users_reps)
        #   SpectralClustering(n_groups, random_state = 42, verbose = 0).fit(selected_users_reps)
        #   GaussianMixture(n_groups, random_state = 42, verbose = 0).fit_predict(selected_users_reps)
        labels = grouping.labels_ # grouping.labels_: [584 350 948 ... 813 938 152]
        # Original usernames are only available when USERNAME has an entry in the feature map;
        # fall back to the encoded id otherwise.
        username_lookup = original_feature_map["USERNAME"] if "USERNAME" in original_feature_map else None
        usernames_in_groups = defaultdict(set)
        for user_i, group_x in enumerate(labels):
            user = selected_user_i_user_map[user_i]
            users_in_groups[group_x].add(user)
            usernames_in_groups[group_x].add(username_lookup[user] if username_lookup is not None else user)
        debug(group_user_num=str({group_x: len(users_in_groups[group_x]) for group_x in users_in_groups}))
        debug(usernames_in_groups=str(usernames_in_groups))

    if  "single_user_as_group" in user_grouping_method or  "all_user_as_group" in user_grouping_method:
        assert existing_user_votes is not None
        # Least active users first.
        ranked_users = list(selected_user_i_user_map.values())
        ranked_users.sort(key=lambda x:existing_user_votes[x])
        if  "single_user_as_group" in user_grouping_method:
            # Keep the 10 least and 10 most active users. With 20 or fewer users the two slices
            # would overlap and create duplicate groups, so everyone is kept instead.
            if len(ranked_users) > 20:
                ranked_users = ranked_users[:10] + ranked_users[-10:]
            for i, user in enumerate(ranked_users):
                users_in_groups[i] = {user}
        if  "all_user_as_group" in user_grouping_method:
            users_in_groups[len(users_in_groups)] = ranked_users

    if "none" in user_grouping_method:
        users_in_groups[len(users_in_groups)] = []

    return users_in_groups, group_centers

# scikit-learn needs host memory: move tensor representations off the accelerator first.
# `torch.tensor` is a factory function and `torch.Tensor` is the class, so the former
# `type(...) == torch.tensor` comparison was always False and the move never happened.
if isinstance(a_subreddit_active_users_reps, torch.Tensor): a_subreddit_active_users_reps = a_subreddit_active_users_reps.detach().cpu()
users_in_groups, group_centers = get_user_groups(a_subreddit_active_users_reps, a_subreddit_active_user_i_user_map, user_grouping_method=user_grouping_method, existing_user_votes=existing_user_votes, manual_user_groups=manual_user_groups, train_data = train_data)

[1;33m------------------[0m[1;31m 2022-08-08 07:16:23 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 2 vars: ['num_selected_users', 'n_groups'], at [0m[1;32m<ipython-input-14-b734051e9167>:27 get_user_groups[0m[1;33m[0m
[1;33m0 / 7.[0m [1;33m [0m[1;36mnum_selected_users[0m[1;33m num val: 47[0m
[1;33m1 / 8.[0m [1;33m [0m[1;36mn_groups[0m[1;33m num val: 9[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:23 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:23 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m<ipython-input-14-b734051e9167>:28 get_user_groups[0m[1;33m[0m
[1;33m[0m[1;36mBegin grouping...[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:23 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:16:23 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['group_user_num'],

##### Predict preferred submissions of each group

Select the submissions to curate: either the test submissions in this subreddit, or a custom submission entered by hand.

In [None]:
# Choose the pool of submissions to curate: submission_id -> submission row.
if config["submission_source"] == "test_data":
    submissions_before_curation:dict = subreddit_test_submissions[a_subreddit]
elif config["submission_source"] == "custom":
    # Build a single hand-written submission with a random, non-colliding id.
    custom_submission_id = "custom_" + str(random.randint(1000000000000000, 9999999999999999))
    custom_username = 49562 # TODO:
    custom_title = input("Input post title: ") # 'Employee of the year' 'This is so funny!!!'
    custom_content = input("Input post content: ")
    custom_submission = pd.Series({
        'SUBMISSION_ID': custom_submission_id,
        'SUBREDDIT': a_subreddit,
        # time.ctime() with the "HH:MM:SS " part stripped, e.g. "Mon Aug  8 2022".
        'CREATED_TIME': re.sub("[0-9][0-9]:[0-9][0-9]:[0-9][0-9] ", "", time.ctime(time.time())),
        'USERNAME': 9271, # TODO:
        'VOTE': 1.0,
        'TITLE': custom_title,
        'AUTHOR': custom_username,
        '#_COMMENTS': 0,
        'NSFW': 'false',
        'SCORE': 0,
        'UPVOTED_%': 0.5,
        'LINK': '',
        # Title and body are joined with " [SEP] "; a post without body is just its title.
        'SUBMISSION_TEXT': (custom_title + " [SEP] " + custom_content) if custom_content != "" else custom_title,
        'UPVOTED_USERS': [], # TODO:
        'DOWNVOTED_USERS': []}
    )
    submissions_before_curation = {custom_submission_id: custom_submission}
    print(submissions_before_curation)
else:
    # Fail loudly: otherwise `submissions_before_curation` would be undefined, or silently
    # keep the value of a previous run of this cell.
    raise ValueError(f"Unsupported submission_source: {config['submission_source']!r} (expected 'test_data' or 'custom')")

{'custom_7913210495806781': SUBMISSION_ID      custom_7913210495806781
SUBREDDIT                          r/funny
CREATED_TIME               Mon Aug  8 2022
USERNAME                              9271
VOTE                                     1
TITLE                                   hi
AUTHOR                               49562
#_COMMENTS                               0
NSFW                                 false
SCORE                                    0
UPVOTED_%                              0.5
LINK                                      
SUBMISSION_TEXT                         hi
UPVOTED_USERS                           []
DOWNVOTED_USERS                         []
dtype: object}


Print original submissions

In [None]:
# submission_text_map = test_data[["SUBMISSION_ID", "SUBMISSION_TEXT"]].drop_duplicates("SUBMISSION_ID").set_index("SUBMISSION_ID").to_dict()['SUBMISSION_TEXT']

def get_submissions_text(submission_ids):
    """Return the SUBMISSION_TEXT of each id, looked up in the notebook-level `submissions_before_curation`."""
    # An earlier variant mapped ids through original_feature_map["SUBMISSION_ID"] and a
    # test_data-derived submission_text_map; the current pool is used directly instead.
    return [
        submissions_before_curation[submission_id]["SUBMISSION_TEXT"]
        for submission_id in submission_ids
    ]

# Iterating the dict yields its keys (submission ids), so this prints the text of every submission in the pool.
print(get_submissions_text(submissions_before_curation))

['hi']


Set parameters and prepare

In [None]:
# Per-group prediction results, keyed by group; starts empty
# (presumably passed to predict_groups_preferences as its `pred_group_votes_info` cache -- confirm at the call site).
pred_group_votes_info = {}
# Rebinds the notebook-level batch_size that was originally read from config in the first cell.
batch_size = 1024
# thres = config["upvote_downvote_ratio_thres"]
# Decision threshold used instead of the config value above.
thres = 0.5 # TODO:

Predict and show their relationship using venn.

In [None]:

def predict_groups_preferences(users_in_groups, unique_submissions:dict, thres = 0.5, group_centers=None, user_grouping_method = "rep", existing_votes = None, existing_user_updown_votes = None, pred_group_votes_info = None):
    """Predict which submissions each user group prefers and how the groups overlap.

    For every group that passes the size filter, votes of its users on
    ``unique_submissions`` are predicted (or read from the cache / from real
    votes), the preferred submissions are selected with ``thres``, and a Venn
    diagram of the groups processed so far is drawn and saved.

    Side effects: the directory ``config["preferred_submissions_venn_figure_dir"]``
    is deleted and recreated, progress is written through ``print_log``, and
    ``pred_group_votes_info`` is updated in place with new predictions.

    Args:
        users_in_groups: mapping from group key to the collection of users in
            that group. Group keys are also used to index rows of the returned
            matrix, so they are expected to be valid row indices
            (NOTE(review): presumably ints in ``range(len(users_in_groups))`` -- confirm).
        unique_submissions: dict keyed by submission id.
        thres: threshold forwarded to ``get_group_users_preferred_submissions``.
        group_centers: optional mapping from group key to a center vector; only
            used to log when a center has a positive dot product with one
            already seen.
        user_grouping_method: string searched for the markers
            ``"single_user_as_group"``, ``"none"`` and ``"all_user_as_group"``,
            which relax the group-size filter.
        existing_votes: forwarded unchanged to the vote-matrix helpers.
        existing_user_updown_votes: subscripted by user when
            ``"single_user_as_group"`` is in ``user_grouping_method``; it must
            be provided in that mode.
        pred_group_votes_info: optional dict used as a cache of per-group
            prediction results. ``None`` means "start with an empty cache".

    Returns:
        A 3-tuple ``(groups_preferred_submissions, groups_preferred_submissions_text,
        groups_submission_upvote_count_matrix)``. The two dicts are keyed by
        ``f"Group {group_key}"``; the matrix has shape
        ``(len(users_in_groups), len(unique_submissions))`` and holds, per
        processed group, the fraction of kept voters with an upvote for each
        submission. Rows of skipped groups stay zero.
    """
    # The default used to be dereferenced directly, which raised TypeError on `in`.
    if pred_group_votes_info is None:
        pred_group_votes_info = {}

    groups_preferred_submissions = {}
    groups_preferred_submissions_text = {}
    groups_submission_upvote_count_matrix = np.zeros([len(users_in_groups), len(unique_submissions)])
    unique_submissions_ids = list(unique_submissions.keys())
    used_group_centers = []

    # Start from an empty figure directory so stale diagrams from earlier runs do not linger.
    figure_dir = config["preferred_submissions_venn_figure_dir"]
    if os.path.exists(figure_dir):
        shutil.rmtree(figure_dir)
    os.makedirs(figure_dir, exist_ok=True)

    for group_x in users_in_groups:
        group_users = users_in_groups[group_x]
        num_group_users = len(group_users)

        # Keep middle-sized groups only. A lone user is exempt from the lower bound in
        # "single_user_as_group" mode, an empty group is exempt in "none" mode, and the
        # upper bound is lifted in "all_user_as_group" mode.
        is_exempt_single_user = "single_user_as_group" in user_grouping_method and num_group_users == 1
        is_exempt_empty_group = "none" in user_grouping_method and num_group_users == 0
        too_small = (
            (not is_exempt_single_user)
            and (not is_exempt_empty_group)
            and num_group_users <= config["group_user_num_lower_thres"]
        )
        if too_small or (
            "all_user_as_group" not in user_grouping_method
            and num_group_users > config["group_user_num_upper_thres"]
        ):
            continue

        if group_centers is not None:
            group_x_center = group_centers[group_x]
            # A positive dot product with any earlier center counts as "similar".
            similar_center = False
            for center in used_group_centers:
                if np.dot(group_x_center, center) > 0:
                    similar_center = True
                    break
            if similar_center:
                # Skipping similar groups is intentionally disabled; the event is only logged.
                # continue
                print_log(config["log_path"], "Have similar center with existing group")
            used_group_centers.append(group_x_center)

        ################ predicting votes of some users and some submissions ##################
        print_log(config["log_path"], f"Predicting group {group_x} with users {group_users}")

        if group_x not in pred_group_votes_info and num_group_users > 0:
            group_x_subreddit_submissions_data = convert_group_users_subreddit_submissions_data(group_users, unique_submissions)
            predicted_group_x_submissions_votes = predict_group_users_submissions_votes(model, group_x_subreddit_submissions_data, batch_size)
            group_x_submission_votes, group_x_confidence, pred_group_x_subreddit_submission_vote_score_matrix, pred_group_x_subreddit_submission_vote_matrix = get_group_user_submission_vote_score_matrix(predicted_group_x_submissions_votes, group_users, group_x_subreddit_submissions_data, existing_votes)

            pred_group_votes_info[group_x] = (group_x_submission_votes, group_x_confidence, pred_group_x_subreddit_submission_vote_score_matrix, pred_group_x_subreddit_submission_vote_matrix)
        elif num_group_users == 0: # no user in this group, i.e., without curation, just count real votes
            # Real-vote results are not written to the cache.
            group_x_submission_votes, group_x_confidence, pred_group_x_subreddit_submission_vote_score_matrix, pred_group_x_subreddit_submission_vote_matrix = get_group_users_real_vote(list(a_subreddit_active_user_i_user_map.values()), unique_submissions, existing_votes, metric = "upvote_rate")
        else:
            debug("Using existing pred_group_votes_info")
            group_x_submission_votes, group_x_confidence, pred_group_x_subreddit_submission_vote_score_matrix, pred_group_x_subreddit_submission_vote_matrix = pred_group_votes_info[group_x]

        group_x_preferred_submissions, group_x_preferred_submissions_ranking = get_group_users_preferred_submissions(group_x_submission_votes, thres = thres)

        # Per-submission upvote rate over the voters whose row sum is non-negative.
        group_x_vote_matrix_np = pred_group_x_subreddit_submission_vote_matrix[unique_submissions_ids].to_numpy()
        group_x_vote_matrix_np = group_x_vote_matrix_np[group_x_vote_matrix_np.sum(axis=1) >= 0]
        if len(group_x_vote_matrix_np) > 0:
            groups_submission_upvote_count_matrix[group_x] = group_x_vote_matrix_np.sum(axis=0).astype(float)/len(group_x_vote_matrix_np)
        # else: no voter survived the filter; keep the zero row instead of writing 0/0 = NaN,
        # which would otherwise leak into every downstream statistic.

        ################# Display submissions preferred by each group of users ######################

        if "single_user_as_group" in user_grouping_method:
            user_train_vote_prompt = f"voted {existing_user_updown_votes[list(group_users)[0]]} in training data, prediction confidence {list(group_x_confidence.values())[0]}, "
        else:
            user_train_vote_prompt = ""

        # The ranking is truncated to as many entries as there are preferred submissions.
        group_x_preferred_ranked_submissions = group_x_preferred_submissions_ranking[:len(group_x_preferred_submissions)]
        groups_preferred_submissions[f"Group {group_x}"] = group_x_preferred_submissions

        # convert submission text content
        group_x_preferred_ranked_submissions_text = get_submissions_text(group_x_preferred_ranked_submissions)
        groups_preferred_submissions_text[f"Group {group_x}"] = group_x_preferred_ranked_submissions_text

        print_log(config["log_path"], f"Users in group {group_x} {user_train_vote_prompt}prefers {len(group_x_preferred_submissions)}/{len(unique_submissions)} submissions (%upvotes ≥ {thres}) (sorted using %upvotes): {group_x_preferred_ranked_submissions}, with text {group_x_preferred_ranked_submissions_text}")

        # draw venn diagram: venn() for 2-5 groups, pseudovenn() for exactly 6, nothing otherwise
        num_shown_groups = len(groups_preferred_submissions)
        if 1 < num_shown_groups <= 6 and sum(len(_) for _ in groups_preferred_submissions.values()) > 0:
            ax = venn(groups_preferred_submissions) if num_shown_groups <= 5 else pseudovenn(groups_preferred_submissions)
            figure_path = f"{figure_dir}/{num_shown_groups}_groups.png"
            # Save before showing: some backends release the figure once show() returns.
            ax.figure.savefig(figure_path)
            plt.show()
            # One figure is created per iteration; close it so they do not pile up in memory.
            plt.close(ax.figure)
            debug(f"Figure saved in {figure_path}")
    return groups_preferred_submissions, groups_preferred_submissions_text, groups_submission_upvote_count_matrix

# Make sure the model sits on its own device and is in inference mode before predicting.
model = model.to(model.device)
model.eval()

(
    groups_preferred_submissions,
    groups_preferred_submissions_text,
    groups_submission_upvote_count_matrix,
) = predict_groups_preferences(
    users_in_groups,
    submissions_before_curation,
    thres=thres,
    group_centers=group_centers,
    user_grouping_method=user_grouping_method,
    existing_votes=existing_votes,
    existing_user_updown_votes=existing_user_updown_votes,
    pred_group_votes_info=pred_group_votes_info,
)

# For a custom submission, tell the author where the post can appear right away.
if config["submission_source"] == "custom":
    debug(groups_submission_upvote_count_matrix)
    groups_ready_to_post = [name for name, preferred in groups_preferred_submissions.items() if len(preferred) > 0]
    groups_waiting_for_votes = [name for name, preferred in groups_preferred_submissions.items() if len(preferred) == 0]
    print(f"You can post immediately in {groups_ready_to_post}. You will need to wait for more votes to post in {groups_waiting_for_votes} -- your post will stay in the background first. Currently, ", end = "")
    for group_x in groups_preferred_submissions:
        # Keys look like "Group <n>"; the trailing number is the group's row in the matrix.
        group_x_int = int(group_x.split(" ")[-1])
        # Column 0 is the first submission passed in.
        predicted_upvote_percent = 100 * groups_submission_upvote_count_matrix[group_x_int, 0]
        print(f"{predicted_upvote_percent}% of the curators in {group_x} are predicted to upvote on your post; ", end = "")
        


[1;33m------------------[0m[1;31m 2022-08-08 07:22:23 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mPredicting group 1 with users {26240, 7811, 27395, 31107, 14601, 7563, 15381, 17190, 8617, 8757, 25272, 26812, 30407, 2507, 7509, 23254, 2904, 20313, 1501, 21985, 6629, 9067, 18539, 21103, 2676, 23669, 27384, 13436}[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:23 [0m[1;33m------------------[0m


  0%|          | 0/28 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['sample_voted_users'], at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/dynamic_data.py:15 __init__[0m[1;33m[0m
[1;33m0 / 27.[0m [1;33m [0m[1;36msample_voted_users[0m[1;33m bool: False[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m


100%|██████████| 1/1 [00:00<00:00, 80.25it/s]


  0%|          | 0/28 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mUsers in group 1 prefers 0/1 submissions (%upvotes ≥ 0.5) (sorted using %upvotes): [], with text [][0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mPredicting group 8 with users {13088, 29313, 20552, 22410, 14290, 19539, 12155, 24350}[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['sample_voted_users'], at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/dynamic_data.py:15 __init__[0m[1;33m[0m
[1;33m0 / 28.[0m [1;33m [0m[1;36msample_voted_users[0m[1;33m bool: False[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m


100%|██████████| 1/1 [00:00<00:00, 107.52it/s]


  0%|          | 0/8 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mUsers in group 8 prefers 0/1 submissions (%upvotes ≥ 0.5) (sorted using %upvotes): [], with text [][0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mPredicting group 5 with users {27461, 25042, 15029, 21211, 27548}[0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m


  0%|          | 0/5 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['sample_voted_users'], at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/dynamic_data.py:15 __init__[0m[1;33m[0m
[1;33m0 / 29.[0m [1;33m [0m[1;36msample_voted_users[0m[1;33m bool: False[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m


100%|██████████| 1/1 [00:00<00:00, 111.28it/s]


  0%|          | 0/5 [00:00<?, ?it/s]

[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m at [0m[1;32m/home/TableSense/largedisk/wanrong/Curation-Modeling/utils.py:123 print_log[0m[1;33m[0m
[1;33m[0m[1;36mUsers in group 5 prefers 0/1 submissions (%upvotes ≥ 0.5) (sorted using %upvotes): [], with text [][0m[1;33m[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
[1;33m[0m[1;32mDEBUG:[0m[1;33m 1 vars: ['?'], at [0m[1;32m<ipython-input-47-651a2459889b>:77 <module>[0m[1;33m[0m
[1;33m0 / 30.[0m [1;33m [0m[1;36m?[0m[1;33m ndarray size: (9, 1) val: [[0.  ]
 [0.25]
 [0.  ]
 [0.  ]
 [0.  ]
 [0.2 ]
 [0.  ]
 [0.  ]
 [0.25]][0m
[1;33m------------------[0m[1;31m 2022-08-08 07:22:24 [0m[1;33m------------------[0m
You can post immediately in []. You will need to wait for more votes to post in ['Gr

##### Calculate Pearson correlation of different groups

In [None]:
# Drop groups whose upvote-rate row sums to zero, then correlate the remaining
# groups (rows) across submissions (columns).
# NOTE: with a single submission column the correlation is undefined, so every
# entry comes out as NaN.
has_nonzero_row_sum = groups_submission_upvote_count_matrix.sum(axis = 1) != 0
groups_submission_upvote_count_matrix_nonzero = groups_submission_upvote_count_matrix[has_nonzero_row_sum]
# Result is square, one row/column per kept group.
group_preference_pearson_corr = np.corrcoef(groups_submission_upvote_count_matrix_nonzero)
print(group_preference_pearson_corr)

[[nan nan nan]
 [nan nan nan]
 [nan nan nan]]


In [None]:
from google.cloud import language_v1
import multiprocessing
# Notebook-level Cloud Natural Language client; post_analysis reads it as a global.
# NOTE(review): presumably credentials are resolved from the environment -- confirm before running.
client = language_v1.LanguageServiceClient()

def post_analysis(content):
    """Analyze one piece of text with the Cloud Natural Language client.

    Args:
        content: the text to analyze; it is sent as a PLAIN_TEXT document.

    Returns:
        A tuple ``(sentiment_score, content_classes)`` where ``sentiment_score``
        is the document-level sentiment score and ``content_classes`` is the
        list of category names returned by text classification.
    """
    plain_text_document = language_v1.Document(
        content=content,
        type_=language_v1.Document.Type.PLAIN_TEXT,
    )

    # Two separate requests are issued for the same document: sentiment first, then classification.
    sentiment_response = client.analyze_sentiment(document=plain_text_document)
    sentiment_score = sentiment_response.document_sentiment.score

    # NOTE(review): classification may reject very short texts -- confirm how callers handle that.
    classification_response = client.classify_text(document=plain_text_document)
    content_classes = []
    for category in classification_response.categories:
        content_classes.append(category.name)

    return (sentiment_score, content_classes)

def post_analysis_batch(contents:list):
    """Run ``post_analysis`` over many texts in parallel worker processes.

    Args:
        contents: the texts to analyze.

    Returns:
        A list with one ``post_analysis`` result per input text, in input
        order. Empty input returns ``[]`` without starting any worker.
    """
    contents = list(contents)
    if not contents:
        return []
    # Never start more workers than there are texts to analyze.
    worker_count = min(multiprocessing.cpu_count(), len(contents))
    # The context manager shuts the pool down on exit; previously the worker
    # processes were left running after every call.
    # NOTE(review): post_analysis uses the notebook-level client inside the
    # workers -- confirm the client is safe to use from child processes.
    with multiprocessing.Pool(processes=worker_count) as pool:
        return pool.map(post_analysis, contents)