In [None]:
!python3.8 -m pip install jellyfish

In [None]:
!python3.8 -m pip install demoji

In [None]:
from os import listdir
import pandas as pd
import re
import jellyfish
import demoji
from tqdm.notebook import tqdm

import warnings
# Suppress every warning for the rest of the session (no category or module is specified),
# so pandas/library warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [None]:
# Read every exported file in the raw-output folder into its own DataFrame.
raw_files_folder = 'sd_dreambots_dotnet_raw_output'
# listdir() returns entries in arbitrary order; sort them so that the
# 'dreambot-N' labels assigned below are the same on every run.
raw_discord_files = sorted(listdir(raw_files_folder))
raw_discord_df_list = list()
for i, raw_filename in enumerate(raw_discord_files):
    df = pd.read_csv(raw_files_folder + '/' + raw_filename)
    # Tag each row with a synthetic channel label derived from the file's position (1-based).
    df['Channel'] = 'dreambot-' + str(i+1)
    raw_discord_df_list.append(df)

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index and
# lower-case every column name.
all_channels_raw_df = (
    pd.concat(raw_discord_df_list, ignore_index=True)
    .rename(columns=str.lower)
)
all_channels_raw_df

In [None]:
# Print the column dtypes and non-null counts of the combined raw table.
all_channels_raw_df.info()

In [None]:
# Discard messages whose `content` is missing, then re-check the column summary.
all_channels_raw_df = all_channels_raw_df.dropna(subset=['content'])
all_channels_raw_df.info()

"Dreamt" marks a successfully completed request. "Dreaming" is the bot's acknowledgement that a user's request has been received and processing has started. We only need the "Dreamt" messages; everything else is discarded.

In [None]:
# Keep only messages whose first whitespace-separated token is exactly 'Dreamt'.
# The vectorised form yields NaN (compared as False) for content that has no
# tokens at all, where `x.split()[0]` would raise IndexError.
first_token = all_channels_raw_df.content.str.split().str[0]
only_dreamt = all_channels_raw_df[first_token == 'Dreamt'].reset_index(drop=True)

When a request produces very sexual content, the bot adds the warning 'There were naughty dreams that had to be popped'.

In [None]:
# Number of rows in only_dreamt at this point.
len(only_dreamt)

In [None]:
# Count the messages that contain the bot's warning text (plain substring match).
only_dreamt.content.str.contains(
    'There were naughty dreams that had to be popped', regex=False
).sum()

In [None]:
# Count the messages that contain the warning text but not the substring 'sex'.
has_warning = only_dreamt.content.str.contains(
    'There were naughty dreams that had to be popped', regex=False
)
mentions_sex = only_dreamt.content.str.contains('sex', regex=False)
(has_warning & ~mentions_sex).sum()

In [None]:
# Count the messages that contain the warning text or the substring 'sex'.
has_warning = only_dreamt.content.str.contains(
    'There were naughty dreams that had to be popped', regex=False
)
mentions_sex = only_dreamt.content.str.contains('sex', regex=False)
(has_warning | mentions_sex).sum()

In [None]:
# Remove every message that contains the warning text or the substring 'sex'
# (case-sensitive, plain substring match), then renumber the rows.
has_warning = only_dreamt.content.str.contains(
    'There were naughty dreams that had to be popped', regex=False
)
mentions_sex = only_dreamt.content.str.contains('sex', regex=False)
only_dreamt = only_dreamt[~(has_warning | mentions_sex)].reset_index(drop=True)

In [None]:
# Number of rows in only_dreamt at this point.
len(only_dreamt)

In [None]:
# Distribution of "number of pieces when split on '/dream prompt:'",
# i.e. occurrences of the marker plus one.
only_dreamt.content.str.count('/dream prompt:').add(1).value_counts()

Most messages contain exactly one prompt (splitting on '/dream prompt:' gives a list of length 2). We keep only these messages.

In [None]:
# Keep messages in which the '/dream prompt:' marker occurs exactly once
# (equivalent to the split producing two pieces), then renumber the rows.
single_prompt = only_dreamt.content.str.count('/dream prompt:') == 1
only_dreamt = only_dreamt[single_prompt].reset_index(drop=True)

In [None]:
# Inspect the current state of only_dreamt.
only_dreamt

Most messages contain one line that says 'for @' (indicating the author). Cases where this substring is missing or there are multiple instances will not be considered.

In [None]:
# Distribution of "number of pieces when split on 'for @'",
# i.e. occurrences of the author marker plus one.
only_dreamt.content.str.count('for @').add(1).value_counts()

In [None]:
# Show three randomly chosen message bodies (no random_state is given, so the
# selection differs between runs).
only_dreamt.content.sample(3).values

In [None]:
# Keep messages with exactly one 'for @' marker. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write to
# a slice of the previous frame.
has_single_author = only_dreamt.content.apply(lambda x: len(x.split('for @')) == 2)
only_dreamt = only_dreamt[has_single_author].copy()

# Username: the text between 'for @' and the first '... '. This must run before
# `content` is overwritten below.
only_dreamt['username'] = only_dreamt.content.apply(
    lambda x: x.split('for @')[1].split('... ')[0]
)
# Prompt: everything after '/dream prompt:' minus the message's final character.
only_dreamt['content'] = only_dreamt.content.apply(
    lambda x: x.split('/dream prompt:')[1][:-1]
)
# Cut the prompt at the first 'negative_prompt:' argument, if present.
only_dreamt['content'] = only_dreamt.content.apply(
    lambda x: x.split('negative_prompt:')[0]
)

Remove model arguments and WARNING messages

In [None]:
# Cut the trailing '**WARNING:**' block first. In the argument regex below, `[^ ]*`
# also matches newlines, so when it ran first it could match
# ' <last word>\n**WARNING:' and delete the prompt's last word along with the warning.
only_dreamt.content = only_dreamt.content.apply(lambda x: x.split('\n**WARNING:**')[0])
# Remove arguments: from the first ' <token>:' to the end of that line.
only_dreamt.content = only_dreamt.content.apply(lambda x: re.sub(' [^ ]*:.*', '', x))

In [None]:
# Parse each date value individually into a pandas timestamp.
only_dreamt['date'] = only_dreamt['date'].map(pd.to_datetime)

In [None]:
# Work on an independent copy with rows renumbered 0..n-1.
filtered = only_dreamt.copy()
filtered.index = pd.RangeIndex(len(filtered))

Remove duplicates

In [None]:
# Keep the first occurrence of every distinct prompt text and renumber the rows.
filtered = filtered.drop_duplicates(subset='content').reset_index(drop=True)

In [None]:
# Inspect the de-duplicated frame.
filtered

Users often try several variants of a prompt for the same picture within a few minutes or hours: they make small modifications and resend the message, while the subject of the picture stays the same. We therefore detect such near-duplicate messages and keep only one of them (the latest), so that we don't later spend money labelling redundant prompts.

In [None]:
# Preview the ordering used below (username, then date, both descending).
# The sorted frame is only displayed here; `filtered` itself is not modified.
filtered.sort_values(by=['username', 'date'], ascending=False)

In [None]:
# Near-duplicate suppression. For each user, walk the messages from newest to
# oldest and skip a message when an already-kept message of the same user, sent
# less than `time_window` after it, has a Jaro similarity above
# `similarity_threshold`. The newest message of each similar group survives.
time_window = pd.Timedelta(hours=1)
similarity_threshold = 0.6

# jellyfish renamed `jaro_distance` to `jaro_similarity`; use whichever name the
# installed version provides. NOTE(review): confirm against the installed jellyfish.
jaro_similarity = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance

newest_first = filtered.sort_values(by=['username', 'date'], ascending=False)

kept_rows = []      # surviving rows, in iteration order
kept_by_user = {}   # username -> [(date, content), ...] of that user's surviving rows

for _, row in tqdm(newest_first.iterrows(), total=len(newest_first)):
    user_kept = kept_by_user.setdefault(row.username, [])
    is_near_duplicate = any(
        (kept_date - row.date) < time_window
        and jaro_similarity(kept_content, row.content) > similarity_threshold
        for kept_date, kept_content in user_kept
    )
    if is_near_duplicate:
        continue

    user_kept.append((row.date, row.content))
    kept_rows.append(row)

# Build the frame once instead of appending row by row: DataFrame.append was
# removed in pandas 2.0 and re-copies the whole frame on every call.
filtered_similar = pd.DataFrame(kept_rows, columns=filtered.columns)

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
filtered_similar = filtered_similar.reset_index(drop=True)

In [None]:
def _strip_prompt_prefix(text):
    """Return `text` without a leading 'prompt' (any letter case); otherwise return it unchanged."""
    if text.lower().startswith('prompt'):
        return text[len('prompt'):]
    return text


# Whole-column update instead of the chained `content[mask] = ...` assignment,
# which may write to a temporary copy and leave the frame untouched.
filtered_similar['content'] = filtered_similar['content'].apply(_strip_prompt_prefix)

In [None]:
# Cut each prompt at the first occurrence of any spelling of the negative-prompt marker.
for marker in ('negative prompt', 'negative _ prompt', 'negative_prompt'):
    filtered_similar.content = filtered_similar.content.apply(
        lambda text, marker=marker: text.partition(marker)[0]
    )

In [None]:
def _drop_trailing_seed(text):
    """Remove a trailing 'seed' when it is the only occurrence of 'seed' in `text`."""
    if text.endswith('seed') and len(text.split('seed')) == 2:
        return text.split('seed')[0]
    return text


# Whole-column updates replace the chained `content[mask] = ...` assignments,
# which may write to a temporary copy and leave the frame untouched.
filtered_similar['content'] = filtered_similar['content'].apply(_drop_trailing_seed)

# Cut the prompt at the first 'cfg _ scale' (texts without it are returned unchanged).
filtered_similar['content'] = filtered_similar['content'].apply(
    lambda x: x.split('cfg _ scale')[0]
)

# -- often used for args at the end of string, but sometimes these args are in the middle.
# Drop rows where the text following the first '--' is longer than 20 characters.
has_long_dash_segment = filtered_similar['content'].apply(
    lambda x: '--' in x and len(x.split('--')[1]) > 20
)
# .copy() so the column assignments below act on an independent frame, not a slice.
filtered_similar = filtered_similar[~has_long_dash_segment].copy()

# now let's remove these args from the end of the string
filtered_similar['content'] = filtered_similar['content'].apply(lambda x: x.split('--')[0])

# Replace emoji with the empty string.
filtered_similar['content'] = filtered_similar['content'].apply(lambda x: demoji.replace(x, ''))

In [None]:
def all_non_ascii(string):
    """Tell whether `string` has no ASCII character besides digits and the listed punctuation.

    Returns False as soon as one ASCII character is found that is neither a
    digit nor one of the exempt punctuation/space characters; returns True
    otherwise (including for the empty string).
    """
    exempt = ' .,"-\'!–/;&?'
    return not any(
        ch not in exempt and not ch.isdigit() and ch.isascii()
        for ch in string
    )
# Remove prompts for which all_non_ascii() is True.
non_ascii_only = filtered_similar.content.apply(all_non_ascii)
filtered_similar = filtered_similar[~non_ascii_only]

def all_not_alpha(string):
    """Tell whether `string` contains no alphabetic character.

    Returns False as soon as one alphabetic character is found that is neither
    a digit nor one of the exempt punctuation/space characters; returns True
    otherwise (including for the empty string).
    """
    exempt = ' .,"-\'!–/;&?'
    return not any(
        ch not in exempt and not ch.isdigit() and ch.isalpha()
        for ch in string
    )
# Remove prompts for which all_not_alpha() is True, then renumber the rows.
no_letters = filtered_similar.content.apply(all_not_alpha)
filtered_similar = filtered_similar[~no_letters].reset_index(drop=True)

In [None]:
# Discard prompts containing 'http', then delete all bracket and pipe characters.
contains_url = filtered_similar.content.apply(lambda text: 'http' in text)
filtered_similar = filtered_similar[~contains_url]

bracket_remover = str.maketrans('', '', '()[]{}|')
filtered_similar.content = filtered_similar.content.apply(
    lambda text: text.translate(bracket_remover)
)

In [None]:
def suspicious_string(string):
    """Tell whether `string` contains a character outside the accepted set.

    A character is accepted when it is a digit, alphabetic, or one of the
    listed punctuation/space characters. Returns True if any other character
    occurs, False otherwise (including for the empty string).
    """
    accepted = ' .,"-\'!–/;&?+*_#|'
    return any(
        ch not in accepted and not ch.isdigit() and not ch.isalpha()
        for ch in string
    )

# Report how many prompts contain an unexpected character, then remove them.
is_suspicious = filtered_similar.content.apply(suspicious_string)
tmp = filtered_similar.content[is_suspicious]
print(len(tmp))
filtered_similar = filtered_similar[~is_suspicious]

In [None]:
# Raw string literals are used for the patterns: '\d' and '\s' in a normal string
# are invalid escape sequences (a warning today, an error in future Python versions).

# glue digits, e.g. "1 9 5 5 s" -> "1955s", "4 k" -> "4k", "3 d" -> "3d"
filtered_similar.content = filtered_similar.content.apply(
    lambda x: re.sub(r'(?<=\d)\s(?=\d|[KkdDs][^\w]|[KkdDs]$)', '', x)
)
# "word , word" -> "word, word"
filtered_similar.content = filtered_similar.content.apply(
    lambda x: re.sub(r'\s,', ',', x)
)
# "word   word" -> "word word"
filtered_similar.content = filtered_similar.content.apply(
    lambda x: re.sub(r'\s+', ' ', x)
)

In [None]:
def has_non_ascii(string):
    """Tell whether `string` contains a non-ASCII character that is not exempt.

    Digits (as judged by str.isdigit) and the listed punctuation/space
    characters are exempt. Returns True if any other non-ASCII character
    occurs, False otherwise (including for the empty string).
    """
    exempt = ' .,"-\'!–/;&?'
    return any(
        ch not in exempt and not ch.isdigit() and not ch.isascii()
        for ch in string
    )
# Report how many prompts still contain non-ASCII characters, then remove them.
non_ascii_mask = filtered_similar.content.apply(has_non_ascii)
print(non_ascii_mask.sum())
filtered_similar = filtered_similar[~non_ascii_mask]

In [None]:
# Trim colons, spaces, equals signs, quotes, dots and commas from both ends of each prompt.
filtered_similar.content = filtered_similar.content.apply(lambda x: x.strip(': =\'.,"'))
# Drop prompts that contain 'dreamt in' or start with 'dreaming for' (case-sensitive).
# NOTE(review): these look like leftovers of bot status text — confirm the intended letter case.
filtered_similar = filtered_similar[
    ~filtered_similar.content.apply(lambda x: 'dreamt in' in x)
]
filtered_similar = filtered_similar[~filtered_similar.content.apply(lambda x: x.startswith('dreaming for'))]
# Keep prompts longer than two characters. The earlier second length filter,
# `~content.apply(len)<=2`, parsed as `(~len) <= 2`, which is True for every row,
# so it never removed anything; one correctly written filter replaces both.
filtered_similar = filtered_similar[filtered_similar.content.apply(len) > 2].reset_index(drop=True)
# Stripping can create new duplicates: keep the first of each, then order by date.
filtered_similar = filtered_similar[~filtered_similar.content.duplicated()]
filtered_similar = filtered_similar.sort_values(by='date').reset_index(drop=True)

In [None]:
# Export the prompt text, attachments and username columns as a tab-separated file.
# No `index` argument is passed, so the row index is written as the first column.
filtered_similar[['content', 'attachments', 'username']].to_csv('cleaned_prompts.tsv', sep='\t')