Preprocesses the Pushshift comments data to the Community2Vec usable format, removing deleted comments/users and a specified percentage of the top most active users (naive bot removal).

In [0]:
import logging

# Everything logs at WARNING by default; timestamps and level names are included in each record.
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# The ihop package alone is raised to INFO so that the loss per epoch shows up in the output.
logger = logging.getLogger('ihop')
logger.setLevel(logging.INFO)


In [0]:
# Input data. Swap in the commented-out sample file below for a quick run.
#pushshift_json = "/mnt/s3bucket/community2vec/raw_data/sample_data.bz2"
pushshift_json = "/mnt/s3bucket/community2vec/raw_data/RC_2021-06.bz2"
# Final path component, i.e. the bare file name of the input archive
pushshift_filename = pushshift_json.rsplit("/", 1)[-1]

# Suffix appended to the per-experiment data directory names
stamp = "01282022"
# The same bucket mount is addressed in two ways: the plain mount path and the "/dbfs"-prefixed path
spark_mnt_path = "/mnt/s3bucket/community2vec"
dbfs_mnt_path = f"/dbfs{spark_mnt_path}"
analogies_path = dbfs_mnt_path + "/subreddit_analogies.txt"


In [0]:
import ihop.community2vec as c2v

import os 
import itertools

# Grid search over Community2Vec hyperparameters, repeated for several
# "top active users excluded" datasets. Each model is trained, saved together
# with its keyed vectors, and scored on the subreddit analogies file.

# Best analogy accuracy seen so far and the directory of the model that achieved it
best_analogy_acc = 0.0
best_model_path = None
# One record (dict) per trained model: its hyperparameters plus the analogy accuracy
analogy_results = []

# Fairly coarse-grained hyperparameter tuning values
learning_rates = [0.05, 0.01]
dimensions = [150]
samples = [0.005, 0.05]
negative_sampling_vals = [20, 40]

epochs = 5
workers = 36

# Try several different values for excluding percentage of top active users
for exclude_top_user_perc in [0.0, 0.02, 0.05, 0.10]:

    # The directory is expected to already hold subreddit_counts.csv and user_contexts
    # (presumably written by an earlier preprocessing step - the name format must match it).
    data_filename = "_".join([pushshift_filename.split(".")[0], f"{int(exclude_top_user_perc*100)}percentTopUsersExcluded", stamp])
    out_dirname = f"{dbfs_mnt_path}/{data_filename}/models"
    subreddit_counts_csv = f"{dbfs_mnt_path}/{data_filename}/subreddit_counts.csv"
    # The user contexts are addressed twice: via the Spark mount path for model
    # initialisation and via the /dbfs path for training.
    spark_user_contexts = f"{spark_mnt_path}/{data_filename}/user_contexts"
    user_contexts_path = f"{dbfs_mnt_path}/{data_filename}/user_contexts"
    vocab = c2v.get_vocabulary(subreddit_counts_csv)

    # exist_ok avoids the check-then-create race and makes re-runs of this cell safe
    os.makedirs(out_dirname, exist_ok=True)

    for (lr, dim, s, nsv) in itertools.product(learning_rates, dimensions, samples, negative_sampling_vals):
        # The model directory name encodes the hyperparameter combination
        model_identifier = f"{out_dirname}/c2v_lr{lr}_dim{dim}_sample{s}_negative{nsv}"
        os.makedirs(model_identifier, exist_ok=True)

        # Train model
        print("Training model:", model_identifier)
        c2v_model = c2v.GensimCommunity2Vec.init_with_spark(
            spark,
            vocab,
            spark_user_contexts,
            vector_size=dim,
            negative=nsv,
            sample=s,
            alpha=lr,
            epochs=epochs,
            workers=workers,
        )
        # Change to local context path
        c2v_model.contexts_path = user_contexts_path
        c2v_model.train(epoch_analogies=False)

        # The model is saved into its own directory, with the keyed vectors stored beneath it
        model_save_path = model_identifier
        vector_save_path = f"{model_identifier}/keyedVectors"
        print("Saving model to", model_save_path)
        c2v_model.save(model_save_path)
        c2v_model.save_vectors(vector_save_path)

        acc, detailed_accs = c2v_model.score_analogies(analogies_path)
        print("Analogy results:")
        print("Overall Accuracy:", acc)
        print("Detailed analogy results:")
        for dr in detailed_accs:
            # Each entry carries a section name plus its correct and incorrect analogies
            section_correct = len(dr['correct'])
            total_section_examples = section_correct + len(dr['incorrect'])
            print("\t", dr['section'],":", section_correct, "/", total_section_examples  )

        # Useful for plotting and comparing results
        analogy_results.append({
            "model_id": model_identifier,
            "top_users_excluded_percent": exclude_top_user_perc,
            "learning_rate": lr,
            "dimensions": dim,
            "sample": s,
            "negative": nsv,
            "analogy_accuracy": acc,
        })

        # Strict comparison: on a tie the earlier model stays the best
        if acc > best_analogy_acc:
            print("New best analogy acc")
            best_model_path = model_identifier
            best_analogy_acc = acc

        print()
    

In [0]:
import pandas as pd

# Turn the per-model result records into a table (one row per trained model)
# and render it in the notebook.
accs_df = pd.DataFrame.from_records(data=analogy_results)
display(accs_df)
