# ⚙️ Install Libraries and Download Dataset

In [None]:
!pip install openai zenodo-get pandas

!zenodo_get 10.5281/zenodo.8023142

# ✍️ Create Formal Comments

Don't forget to enter your OpenAI API key in the variable `API_KEY`.

In [None]:
import json
import re
import pandas
import statistics
import openai
from tqdm import tqdm

# --- Configuration ----------------------------------------------------------
# OpenAI credential; replace the placeholder before running.
API_KEY = "ENTER_API_KEY_HERE"

# Change this sentence if you want to test other prompts
style_change_sentence = "Here is a rewrite of the text, which is more neutral:"

# Only comments whose perplexity is strictly above this value are kept.
MIN_PERPLEXITY = 200
# Upper bound on the number of comments selected from each subreddit.
number_of_comments_to_translate_per_subreddit = 150
# File the JSON results are written to.
result_file_name = 'comments_to_neutral.json'

# --- Load and pre-filter the dataset ----------------------------------------
data = pandas.read_csv("reddit_comments.csv")
data['perplexity'] = data['perplexity'].astype(float)
data = data[data['perplexity'] > MIN_PERPLEXITY]

# Pick, for every subreddit, the highest-perplexity comments that are long
# enough to be worth rewriting.
comment_candidates = []
subreddits = data['subreddit']
unique_subreddits = subreddits.unique()
candidate_columns = ['body', 'subreddit', 'perplexity', 'token_size']
for subreddit in unique_subreddits:
    # Exact equality on the subreddit name. A substring/regex match would also
    # pull in the rows of every subreddit whose name merely contains this one,
    # so those comments would be selected more than once.
    subreddit_data = data[subreddits == subreddit]
    comments = subreddit_data[candidate_columns].to_dict('records')

    # Comments whose token_size is below 10 are skipped.
    filtered_comments = [comment for comment in comments if comment['token_size'] >= 10]
    # sorted() is stable, so comments with equal perplexity keep their CSV order.
    sorted_comments = sorted(filtered_comments, key=lambda x: x['perplexity'], reverse=True)
    highest_perplexity_comments = sorted_comments[:number_of_comments_to_translate_per_subreddit]
    comment_candidates.extend(highest_perplexity_comments)

# Measure the comments that were actually selected, so the statistic matches
# the "filtered comments" label printed below.
comment_lengths = [len(comment['body']) for comment in comment_candidates]

print('Total comments: ' + str(len(data.index)))
print('Filtered comments: ' + str(len(comment_candidates)))
if comment_lengths:
    print('Median comment length of filtered comments: ' + str(statistics.median(comment_lengths)))
else:
    # statistics.median() raises StatisticsError on an empty sequence.
    print('Median comment length of filtered comments: n/a')

# Turn every selected comment into a completion prompt.
def _build_prompt_entry(candidate):
    """Return the prompt record (prompt text, cleaned body, subreddit) for one comment."""
    # Drop literal backslash-n sequences, then squeeze runs of spaces into one.
    body = re.sub(' +', ' ', candidate['body'].replace("\\n", ""))
    # The trailing "{{" renders as a single "{" at the end of the prompt.
    prompt = f"Here is some text: {body} {style_change_sentence} {{"
    return {'prompt': prompt, 'body': body, 'subreddit': candidate['subreddit']}


prompts_to_send = [_build_prompt_entry(candidate) for candidate in comment_candidates]


def save_results(responses_to_save, file_name=None):
    """Write the collected responses to a JSON file.

    The responses are stored under the top-level key ``data`` with an
    indentation of four spaces; an existing file is overwritten.

    Args:
        responses_to_save: JSON-serialisable list of response records.
        file_name: Path of the output file. When omitted, the module-level
            ``result_file_name`` is used.
    """
    target = result_file_name if file_name is None else file_name
    # Serialise before opening the file: if serialisation fails, a previously
    # written results file is left untouched instead of being truncated.
    json_object = json.dumps({'data': responses_to_save}, indent=4)
    with open(target, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)


# print(f"About to translate {number_of_comments_to_translate} to a more formal style.")
# Send every prompt to the OpenAI completion endpoint and collect the replies.
# NOTE(review): `openai.Completion` is the pre-1.0 interface of the `openai`
# package and "text-davinci-003" looks like a legacy model -- confirm the
# installed package version and the model's availability before running.
responses = []
openai.api_key = API_KEY
for prompt in tqdm(prompts_to_send, desc="Translate comments to formal sentences"):
    # Stays an empty string when the request fails, so `responses` keeps one
    # entry per prompt, in the same order as `prompts_to_send`.
    response = ""
    try:
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt['prompt'],
            temperature=0.2,
            max_tokens=512,
            top_p=1,
            frequency_penalty=0.1,
            presence_penalty=0
        )
    except Exception as e:
        # Best effort: checkpoint what has been collected so far, report the
        # failure, and carry on with the next prompt.
        save_results(responses)
        print("Exception occurred (Previous results have been saved): " + str(e))
    responses.append({'response': response, 'body': prompt['body'], 'subreddit': prompt['subreddit']})

save_results(responses)
print("Work completed")
