# ⚙️ Install libraries and download dataset

After this step, the runtime must be restarted.

In [None]:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    :param do_setlocale: accepted for signature compatibility with
        ``locale.getpreferredencoding``; it is ignored.
    :return: always the string ``"UTF-8"``
    """
    return "UTF-8"


# Replace the stdlib lookup so that code asking the locale module for the
# preferred encoding gets UTF-8 (see the Stack Overflow link above).
locale.getpreferredencoding = getpreferredencoding

!pip install simplet5 evaluate sacrebleu tqdm spacy matplotlib openai zenodo-get bert_score tensorflow

!zenodo_get 10.5281/zenodo.8023142

# 🏋️ Train evaluation models

This step may take a while.

In [None]:
import os
import json
import pandas
import datasets
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, \
    AutoModelForCausalLM


# --- Fine-tuning settings ---------------------------------------------------
# number of epochs used when fine-tuning each evaluation model
trainings_epochs = 5
# base-model for fine-tuning
model_name = 'gpt2'
# minimal perplexity for trainings comments
min_perplexity = 100
# when True, submission titles are added to the training texts as well
use_submisson_data = False

# --- Input / output locations -----------------------------------------------
csv_comments_path = 'reddit_comments.csv'
data_dir = 'trainings_data'
trainings_data_dir = 'trainings_data'
eval_models_dir = 'eval_models'

# --- Subreddits for which an evaluation model is trained --------------------
evaluation_models = [
    "antiwork", "atheism", "Conservative", "conspiracy",
    "dankmemes", "gaybros", "leagueoflegends", "lgbt",
    "Libertarian", "linguistics", "MensRights", "news",
    "offbeat", "PoliticalCompassMemes", "politics", "teenagers",
    "TrueReddit", "TwoXChromosomes", "wallstreetbets", "worldnews",
]


def create_trainings_data(subreddit_name, eval_split=0.8):
    """
    Generates the text files that are used for training the evaluation model of one subreddit.

    The comments of the subreddit are read from ``csv_comments_path``. Comments whose body is
    "[deleted]", whose body is not a string (e.g. an empty CSV cell) or whose perplexity is below
    ``min_perplexity`` are skipped. Every remaining text is stripped, suffixed with
    " <|endoftext|>" and written one text per line to ``{data_dir}/training_[subredditname].txt``
    and ``{data_dir}/eval_[subredditname].txt``.
    :param subreddit_name: Name of the subreddit for which an evaluation model is to be trained
    :type subreddit_name: str
    :param eval_split: Fraction of the texts that goes into the training file; the remaining texts
        go into the evaluation file. By default 0.8
    :type eval_split: float
    :return: Number of comments of the subreddit that passed the filters
    :rtype: int
    """
    end_of_text = ' <|endoftext|>'

    comment_data = pandas.read_csv(csv_comments_path)
    # A boolean mask selects the same rows as a string-built query, but cannot be
    # broken by quotes in the subreddit name.
    comments = comment_data[comment_data['subreddit'] == subreddit_name]

    texts = []
    for body, perplexity in zip(comments['body'], comments['perplexity']):
        # Empty CSV cells are read as NaN (a float), which has no .strip()
        if not isinstance(body, str) or body == "[deleted]":
            continue
        if perplexity < min_perplexity:
            continue
        texts.append(body.strip() + end_of_text)
    # Count only the comments that were actually kept (the counter used to start at 1).
    number_of_comments = len(texts)

    if use_submisson_data:
        # NOTE(review): the submissions are read from the comments CSV because no separate
        # submissions path is configured in this file - confirm which file is intended.
        submission_data = pandas.read_csv(csv_comments_path)
        submissions = submission_data[submission_data['subreddit'] == subreddit_name]
        texts.extend(title.strip() + end_of_text for title in submissions['title'] if isinstance(title, str))

    # split data in training and eval
    split_index = int(len(texts) * eval_split)
    trainings_data = texts[:split_index]
    eval_data = texts[split_index:]

    # write to file; UTF-8 is set explicitly so the result does not depend on the platform default
    with open(f"{data_dir}/training_{subreddit_name}.txt", "w", encoding="utf-8") as outfile:
        outfile.write("".join(sentence + "\n" for sentence in trainings_data))
    with open(f"{data_dir}/eval_{subreddit_name}.txt", "w", encoding="utf-8") as outfile:
        outfile.write("".join(sentence + "\n" for sentence in eval_data))
    return number_of_comments


def train_model(subreddit_name, model_dir=eval_models_dir):
    """
    Fine-tune the base model on the text files of one subreddit and save the result.

    The training and evaluation texts are read from ``{data_dir}/training_{subreddit_name}.txt``
    and ``{data_dir}/eval_{subreddit_name}.txt``; the model is saved to
    ``./{model_dir}/{subreddit_name}``.
    :param subreddit_name: Name of the subreddit for which an evaluation model is to be trained
    :param model_dir: Folder in which the models should be saved
    """
    data_files = {
        "train": f"{data_dir}/training_{subreddit_name}.txt",
        "test": f"{data_dir}/eval_{subreddit_name}.txt",
    }

    if torch.cuda.is_available():
        target_device = "cuda:0"
    else:
        target_device = "cpu"

    # currently only models that support the gpt2 tokenizer can be used
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # the end-of-sequence token is reused as the padding token
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(target_device)

    # create output folder if it not exists
    output_dir = f"./{model_dir}/{subreddit_name}"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,  # The output directory
        overwrite_output_dir=True,  # overwrite the content of the output directory
        num_train_epochs=trainings_epochs,  # number of training epochs
        per_device_train_batch_size=4,  # batch size for training
        per_device_eval_batch_size=4,  # batch size for evaluation
        eval_steps=400,  # Number of update steps between two evaluations.
        save_steps=800,  # after # steps model is saved
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
    )

    # every line of the text files becomes one sample
    raw_dataset = datasets.load_dataset("text", data_files=data_files, sample_by="line")

    # mlm=False: collate for causal (not masked) language modeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    def _tokenize_batch(batch):
        # Remove empty lines
        non_empty_lines = [line for line in batch["text"] if line and not line.isspace()]
        return tokenizer(non_empty_lines, truncation=True, max_length=512)

    tokenized = raw_dataset.map(
        _tokenize_batch,
        batched=True,
        num_proc=4,
        remove_columns=["text"],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
    )
    trainer.train()
    trainer.save_model()


# create folders that will be used later in the process
os.makedirs(eval_models_dir, exist_ok=True)
os.makedirs(trainings_data_dir, exist_ok=True)
# create_trainings_data() and train_model() read and write below data_dir, so make sure it
# exists even if it is ever configured differently from trainings_data_dir
os.makedirs(data_dir, exist_ok=True)

# create the trainings and evaluation data for the evaluation models
training_data_bar = tqdm(evaluation_models)
data_stats = {}
for subreddit in training_data_bar:
    training_data_bar.set_description(f"Processing {subreddit}")
    data_stats[subreddit] = create_trainings_data(subreddit_name=subreddit)
print(f"Data distribution {str(data_stats)}")


# train the evaluation models
training_model_bar = tqdm(evaluation_models)
# iterate the bar whose description is updated; wrapping the list in a second tqdm would
# show one bar that never advances and one without a description
for subreddit in training_model_bar:
    training_model_bar.set_description(f"Training model for {subreddit}")
    train_model(subreddit_name=subreddit)


# 🧮 Calculate basic statistics about the dataset

In [None]:
import statistics
import pandas

# comments must have a perplexity above this value to be included in the statistics
MIN_PERPLEXITY = 100

# load the comments and keep only those above the perplexity threshold
data = pandas.read_csv("reddit_comments.csv")
data['perplexity'] = data['perplexity'].astype(float)
data = data[data['perplexity'] > MIN_PERPLEXITY]

# length of every remaining comment body, in characters
comments_lengths = [len(body) for body in data['body']]

median_length = statistics.median(comments_lengths)
print('Median comment length of filtered comments: ' + str(median_length))

# number of remaining comments, overall and per subreddit
data_grouped = data.groupby('subreddit')
print(f"Overall size: {len(data)}")
print(f"Group sizes: {data_grouped.size()}")

# 👷 Prepare the evaluation

In [7]:
import json
import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def get_perplexity(model, encodings, stride=512):
    """
    Calculate the perplexity of a tokenized text with a sliding window over the tokens.

    The window is at most ``model.config.n_positions`` tokens wide and is moved forward by
    ``stride`` tokens. In every window only the tokens that were not scored by the previous
    window contribute to the loss; all other label positions are set to -100.
    :param model: Causal language model; called as ``model(input_ids, labels=...)`` and expected
        to return an object with a ``loss`` attribute
    :param encodings: Tokenizer output whose ``input_ids`` tensor is indexed as (batch, tokens)
    :param stride: Number of tokens the window is moved forward per step. By default 512
    :return: The perplexity as float. NaN is returned if ``encodings`` contains no tokens
    """
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    # Nothing to score: return NaN instead of failing in torch.stack() on an empty list.
    if seq_len == 0:
        return float("nan")

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # mask the tokens that only serve as context for this window
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over input tokens.
            # Multiply it with trg_len to get the summation instead of average.
            # We will take average over all the tokens to get the true average
            # in the last step of this example.
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    # average the summed negative log-likelihoods over all tokens and exponentiate
    return torch.exp(torch.stack(nlls).sum() / end_loc).item()


def get_device():
    """Return the CUDA device if CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)


class StyleSpecificPerplexity:
    """
    Holds one fine-tuned evaluation model per subreddit and calculates the perplexity of texts
    with the model of a chosen subreddit.
    """

    def __init__(self):
        """Load every model listed in the config from the evaluation model directory."""
        # load config
        self.config = {"evaluation_models": ["antiwork", "atheism",
                       "Conservative", "conspiracy", "dankmemes", "gaybros",
                       "leagueoflegends", "lgbt", "Libertarian", "linguistics",
                       "MensRights", "news", "offbeat", "PoliticalCompassMemes",
                       "politics", "teenagers", "TrueReddit", "TwoXChromosomes",
                       "wallstreetbets", "worldnews"],
                       "eval_model_dir": 'eval_models'}

        # The device and the gpt2 tokenizer are the same for every model, so they are
        # resolved once and shared instead of being loaded again for each subreddit.
        device = get_device()
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token

        self.models = {}
        for model_name in self.config["evaluation_models"]:
            # load the model and move it to the device
            model = AutoModelForCausalLM.from_pretrained(f"{self.config['eval_model_dir']}/{model_name}").to(device)
            self.models[model_name] = {"model": model, "tokenizer": tokenizer}

    def calculate_perplexity(self, model_name, input_texts=None):
        """
        Calculate the perplexity of every text with the model of the given style.

        :param model_name: Name of the loaded model that should score the texts
        :param input_texts: Texts to score. By default no texts
        :return: List with one perplexity per text; texts whose perplexity is NaN are left out
        :raises ValueError: If no model with the given name was loaded
        """
        if model_name not in self.models:
            raise ValueError(f"No model for style {model_name}")

        # None replaces the former mutable default argument
        if not input_texts:
            return []

        # load model and tokenizer
        model = self.models[model_name]["model"]
        tokenizer = self.models[model_name]["tokenizer"]
        device = get_device()

        perplexities = []
        for input_text in input_texts:
            # encode text
            encodings = tokenizer(input_text, return_tensors="pt").to(device)
            # calculate perplexity
            perplexity = get_perplexity(model, encodings)

            # If no perplexity could be calculated (for example because the input contains only one word),
            # do not save. Result would be NaN. This means that no more total perplexity can be calculated.
            if math.isnan(perplexity):
                continue

            perplexities.append(perplexity)
        # return perplexity values
        return perplexities


# ⚗️ Evaluate dataset lemma overlap and perplexity

In [None]:
import os
import json
import multiprocessing

import pandas
import spacy
import numpy as np
from tqdm import tqdm

# --- Filter settings ---------------------------------------------------------
# minimal perplexity for trainings comments
min_perplexity = 100
use_submisson_data = False

# --- Input / output locations ------------------------------------------------
csv_comments_path = 'reddit_comments.csv'
data_dir = 'trainings_data'
trainings_data_dir = 'trainings_data'
eval_models_dir = 'eval_models'
result_path = 'dataset_eval_results'

# --- Subreddits that have an evaluation model --------------------------------
evaluation_models = [
    "antiwork", "atheism", "Conservative", "conspiracy",
    "dankmemes", "gaybros", "leagueoflegends", "lgbt",
    "Libertarian", "linguistics", "MensRights", "news",
    "offbeat", "PoliticalCompassMemes", "politics", "teenagers",
    "TrueReddit", "TwoXChromosomes", "wallstreetbets", "worldnews",
]

# create the result folder if it does not exist yet
if not os.path.exists(result_path):
    os.makedirs(result_path)

# load the comments and keep only those above the perplexity threshold
data = pandas.read_csv(csv_comments_path)
data['perplexity'] = data['perplexity'].astype(float)
data = data[data['perplexity'] > float(min_perplexity)]

# for n-grams, default are bigrams
n = 2

# switches for the two evaluation steps below
calculate_overlap = True
calculate_perplexities = True

# Calculate overlapping tokens between the subreddits
nlp = spacy.load("en_core_web_sm")


def preprocess(text: str):
    """
    Run the spaCy pipeline on a text and collect its lemmas and n-grams.

    :param text: Text to process
    :return: Dict with 'tokens' (lower-cased lemmas of all tokens that are neither stop words
        nor punctuation) and 'ngrams' (space-joined texts of every run of ``n`` consecutive
        tokens of the document)
    """
    doc = nlp(text)
    lemmas = [tok.lemma_.lower() for tok in doc if not (tok.is_stop or tok.is_punct)]
    # number of windows of n tokens; not positive for documents with fewer than n tokens
    window_count = len(doc) - n + 1
    ngrams = [
        " ".join(tok.text for tok in doc[start:start + n])
        for start in range(window_count)
    ]
    return {'tokens': lemmas, 'ngrams': ngrams}


def preprocess_parallel(data_chunk):
    """
    Apply ``preprocess`` to the 'body' column of a chunk of the comment data.

    :param data_chunk: DataFrame with a 'body' column
    :return: New DataFrame with all columns of the chunk plus a 'result' column that holds the
        output of ``preprocess`` for every body
    """
    # assign() returns a new frame, so the chunk (possibly a slice of a larger frame) is not
    # written to; this avoids pandas' SettingWithCopyWarning and mutating the caller's data.
    return data_chunk.assign(result=data_chunk['body'].apply(preprocess))


if calculate_overlap:
    # Split the data into one chunk per CPU core. The row positions are split instead of the
    # DataFrame itself, because np.array_split() on a DataFrame relies on pandas behaviour
    # that recent pandas versions warn about; iloc with an index array also yields copies.
    num_processes = multiprocessing.cpu_count()
    chunks = [data.iloc[positions] for positions in np.array_split(np.arange(len(data)), num_processes)]

    # Preprocess the data chunks in parallel; the with-block releases the worker
    # processes as soon as the (blocking) map call has returned
    with multiprocessing.Pool(processes=num_processes) as pool:
        processed_chunks = pool.map(preprocess_parallel, chunks)

    # Concatenate the processed chunks back into a single dataframe
    processed_data = pandas.concat(processed_chunks)

    # Group the processed data by subreddit
    subreddit_result = processed_data.groupby('subreddit')['result'].apply(list).to_dict()

    # Build the unique tokens and n-grams once per subreddit instead of rebuilding the
    # sets of the second subreddit for every pair
    unique_tokens = {}
    unique_ngrams = {}
    for subreddit, results in subreddit_result.items():
        unique_tokens[subreddit] = {token for res in results for token in res['tokens']}
        unique_ngrams[subreddit] = {ngram for res in results for ngram in res['ngrams']}

    for subreddit1 in tqdm(subreddit_result):
        unique_tokens1 = unique_tokens[subreddit1]
        unique_ngrams1 = unique_ngrams[subreddit1]
        for subreddit2 in subreddit_result:
            # Create filename
            filename = f"{result_path}/{subreddit1}_{subreddit2}_overlap.json"

            # Check if file already exists -> skip calculation if file already exists
            if os.path.isfile(filename):
                print(f"Overlap for {subreddit1} {subreddit2} already exits. Skip this combination.")
                continue

            unique_tokens2 = unique_tokens[subreddit2]
            unique_ngrams2 = unique_ngrams[subreddit2]

            # Calculate overlap
            overlap_tokens = unique_tokens1.intersection(unique_tokens2)
            overlap_ngrams = unique_ngrams1.intersection(unique_ngrams2)

            result_object = {'subreddit1': subreddit1, 'subreddit2': subreddit2,
                             'overlap_tokens': len(overlap_tokens),
                             'overlap_ngrams': len(overlap_ngrams),
                             'subreddit1_total_tokens': len(unique_tokens1),
                             'subreddit2_total_tokens': len(unique_tokens2),
                             'subreddit1_total_ngrams': len(unique_ngrams1),
                             'subreddit2_total_ngrams': len(unique_ngrams2)}

            # Save the result
            with open(filename, "w") as r:
                json.dump(result_object, r)


# calculate the perplexity of models trained on subreddits against content from other subreddits
# (the models are loaded here, regardless of the calculate_perplexities switch)
style_specific_perplexity = StyleSpecificPerplexity()

if calculate_perplexities:
    def calculate_perplexity(model_name, subreddit_data):
        """Return the perplexities of the given texts under the model of ``model_name``."""
        return style_specific_perplexity.calculate_perplexity(model_name, subreddit_data)


    # The subreddits in the data do not change inside the loops, so they are determined once
    all_subreddits = data['subreddit'].unique()

    # Loop through each subreddit; its model scores the comments of every subreddit
    for subreddit in tqdm(all_subreddits):
        # Without an evaluation model for this subreddit there is nothing to calculate
        if subreddit not in evaluation_models:
            continue

        # Loop through each other subreddit
        for other_subreddit in all_subreddits:
            # Create filename
            filename = f"{result_path}/{subreddit}_{other_subreddit}_perplexity.json"

            # Check if file already exists -> skip calculation if file already exists
            if os.path.isfile(filename):
                print(f"Perplexity for {subreddit} {other_subreddit} already exits. Skip this combination.")
                continue

            # Get the comments for the other subreddit
            other_comments = data[data['subreddit'] == other_subreddit]['body'].tolist()

            # Score the comments of the other subreddit with the model of the current subreddit
            perplexities = calculate_perplexity(subreddit, other_comments)

            # Store the results in a dictionary
            result_object = {'subreddit1': subreddit, 'subreddit2': other_subreddit, 'perplexities': perplexities}

            # Save the result
            with open(filename, "w") as r:
                json.dump(result_object, r)


# 🧪 Evaluate formal translations

In [None]:
import os
import json
import evaluate
import numpy as np
from transformers import AutoTokenizer

# labeled samples: evaluation split and training split
eval_data_path = 'eval_labeled_with_style_samples.json'
trainings_data_path = 'training_labeled_with_style_samples.json'

# load models
perplexity_model = evaluate.load("perplexity", module_type="measurement")
bert_score_model = evaluate.load("bertscore")
# note: despite the variable name, the metric loaded here is chrF
bleu_model = evaluate.load("chrf")

# merge eval and training file for evaluation
def merge_json_files(file1, file2):
    """
    Merge the 'data' lists of two JSON files.

    :param file1: Path of the first JSON file; its top-level object must have a 'data' list
    :param file2: Path of the second JSON file; its top-level object must have a 'data' list
    :return: Dict with a single key 'data' that holds the entries of file1 followed by the
        entries of file2
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the platform default
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        data1 = json.load(f1)
        data2 = json.load(f2)

    return {'data': [*data1['data'], *data2['data']]}

tmp_scores = {}
####################################################
# Create scores for the GYAFC-Corpus as baseline   #
####################################################

# We cannot calculate the metrics for the GYAFC dataset
# because it is not publicly available

####################################################
#         Create scores for the dataset            #
####################################################

# load prepared data
trainings_data_json = merge_json_files(trainings_data_path, eval_data_path)

# The input sentence of every sample is scored against its result sentence;
# samples without an input sentence are skipped
references = []
predictions = []
for trainings_object in trainings_data_json["data"]:
    if not trainings_object['input_sentence']:
        continue
    references.append([trainings_object['result_sentence']])
    predictions.append(trainings_object['input_sentence'])

# Fail with a clear message instead of an error inside a metric or a division by zero below
if not predictions:
    raise ValueError("No samples with a non-empty 'input_sentence' found, nothing to evaluate")

# create a flat list of references, needed for the calculation of BERTScore
flat_references = [reference for sample_references in references for reference in sample_references]

print("About to calculate dataset label scores")

# calculate scores
tmp_scores['Dataset'] = {}
tmp_scores['Dataset']['bert_score'] = bert_score_model.compute(predictions=predictions, references=flat_references,
                                                               lang="en", model_type="microsoft/deberta-xlarge-mnli")
# note: bleu_model is loaded as the chrF metric; the key names are kept for compatibility
tmp_scores['Dataset']['bleu_score'] = bleu_model.compute(predictions=predictions, references=references, word_order=2)
tmp_scores['Dataset']['perplexity_score'] = perplexity_model.compute(data=predictions, model_id='gpt2')

print("Dataset label scores calculated")

result_obj = {}
# create result object
for dataset in ['Dataset']:
    # BERT: average the per-sample values
    bert_scores = tmp_scores[dataset]['bert_score']
    bert_f1_mean_score = sum(bert_scores['f1']) / len(bert_scores['f1'])
    bert_precision_mean_score = sum(bert_scores['precision']) / len(bert_scores['precision'])
    bert_recall_mean_score = sum(bert_scores['recall']) / len(bert_scores['recall'])

    bert_score = {'mean_f1': bert_f1_mean_score, 'mean_precision': bert_precision_mean_score,
                  'mean_recall': bert_recall_mean_score}
    # Perplexity: converted to plain floats for the JSON output
    perplexity_score = tmp_scores[dataset]['perplexity_score']
    median_perplexity = float(np.median(perplexity_score['perplexities']))
    variance_perplexity = float(np.var(perplexity_score['perplexities']))
    perplexity = {'perplexity_median': median_perplexity, 'perplexity_variance': variance_perplexity,
                  'perplexity_mean': perplexity_score['mean_perplexity']}
    # save results in object
    result_obj[dataset] = {'BLEU': tmp_scores[dataset]['bleu_score'], 'Perplexity': perplexity,
                           'BERTScore': bert_score}

# save result object
results_file = "dataset_label_eval_result.json"
with open(results_file, "w") as r:
    json.dump(result_obj, r)
# report success only after the file has actually been written
print(f"Saved Eval Results in: {results_file}")


# 📊 Create charts

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt

# Set the directory where the data files are stored
data_dir = 'dataset_eval_results'


def _load_results(suffix):
    """Load every JSON file in ``data_dir`` whose file name ends with ``suffix``."""
    results = []
    for filename in os.listdir(data_dir):
        if filename.endswith(suffix):
            with open(os.path.join(data_dir, filename), 'r') as f:
                results.append(json.load(f))
    return results


def _build_matrix(results, value_of):
    """Return the sorted subreddit names and a matrix with ``value_of(result)`` per pair.

    Rows are indexed by 'subreddit1', columns by 'subreddit2'. Pairs without a result keep 0.
    """
    names = sorted({name for result in results for name in (result['subreddit1'], result['subreddit2'])})
    position = {name: idx for idx, name in enumerate(names)}
    values = np.zeros((len(names), len(names)))
    for result in results:
        values[position[result['subreddit1']], position[result['subreddit2']]] = value_of(result)
    return names, values


def _median_perplexity(result):
    """Return the median of the perplexities stored in a perplexity result."""
    return np.median(np.array(result['perplexities']))


def _overlap_percent(result, kind):
    """Return the overlap of ``kind`` ('tokens' or 'ngrams') relative to both totals, in percent."""
    # NOTE(review): the overlap is divided by the sum of both totals, so a subreddit compared
    # with itself yields 50 %, not 100 % - confirm that this is the intended measure.
    total = result[f'subreddit1_total_{kind}'] + result[f'subreddit2_total_{kind}']
    if total == 0:
        # two empty sets: report no overlap instead of dividing by zero
        return 0.0
    return (result[f'overlap_{kind}'] / total) * 100


def _plot_matrix(names, values, colorbar_label, title):
    """Show ``values`` as a heatmap with the subreddit names on both axes."""
    fig, ax = plt.subplots(figsize=(12, 12))
    im = ax.imshow(values, cmap='viridis')

    # Set the tick labels
    ticks = np.arange(len(names))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names, rotation=90)
    ax.set_yticklabels(names)

    # Set the axis labels
    ax.set_xlabel('Subreddit')
    ax.set_ylabel('Subreddit')

    # Add the colorbar
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.ax.set_ylabel(colorbar_label, rotation=-90, va='bottom')

    # Add the title
    ax.set_title(title)

    # Show the plot
    plt.show()
    return fig, ax


##############################################################
# Median perplexity of every model (rows) on the comments of every subreddit (columns)
perplexity_results = _load_results('perplexity.json')
subreddits, matrix = _build_matrix(perplexity_results, _median_perplexity)
fig, ax = _plot_matrix(subreddits, matrix, 'Median Perplexity', 'Median Perplexity Matrix')

##############################################################
# Overlap of the unique tokens between the subreddits
overlap_results = _load_results('overlap.json')
subreddits, matrix = _build_matrix(overlap_results, lambda result: _overlap_percent(result, 'tokens'))
fig, ax = _plot_matrix(subreddits, matrix, 'Overlapping Tokens (in percent)', 'Overlap Matrix')

##############################################################
# Overlap of the unique bigrams between the subreddits (same result files as above)
subreddits, matrix = _build_matrix(overlap_results, lambda result: _overlap_percent(result, 'ngrams'))
fig, ax = _plot_matrix(subreddits, matrix, 'Overlapping Bigrams (in percent)', 'Overlap Matrix')