### Manual correction of flags, and filtering of the data

Copyright &copy; 2024 Praneeth Vadlapati

In [1]:
import os
import pandas as pd
import gradio as gr
from common_functions import get_filename, get_bot_response, \
				harm_categories, unwanted_flags, safe_flag, \
				print_progress, print_error, is_na, is_not_na, \
				get_filename, get_latest_index

# Resolve the index of the latest data file set and the file names derived from it
latest_index = get_latest_index()
(
	admin_flag_path,
	flagged_file_path,
	filtered_data_filename,
	shortened_text_filename,
) = (
	get_filename(latest_index, suffix)
	for suffix in ('admin-flag', 'flagged', 'filtered', 'shortened')
)

# Load the flagged data that the rest of the notebook works on
flagged_df = pd.read_csv(flagged_file_path)

# Data cleaning
def clean_column(column):
	"""Normalise the flag values of one DataFrame column.

	Cells that are exactly ``'unsafe'`` become ``'unusable'``. Any string cell
	that contains ``safe_flag`` (possibly with extra text around it) is
	collapsed to exactly ``safe_flag``. Non-string cells such as NaN or
	booleans are passed through unchanged.

	Args:
		column: The ``pd.Series`` to clean. Any other input is returned as is.

	Returns:
		A cleaned Series; the input Series is not modified.
	"""
	if not isinstance(column, pd.Series):
		return column  # nothing to clean on non-Series input

	def collapse_safe(value):
		# Type guard: previously one NaN/bool cell raised TypeError and the
		# normalisation was silently skipped for the whole column.
		if isinstance(value, str) and safe_flag in value:
			return safe_flag
		return value

	# Series.replace matches whole cells only, so 'unsafe' inside a longer
	# string is not rewritten by this step.
	# NOTE(review): the substring test above also matches words that merely
	# contain safe_flag - kept as in the original, confirm that is intended.
	return column.replace('unsafe', 'unusable').map(collapse_safe)
# Normalise the flag columns of the loaded data before they are used anywhere else
columns_to_clean = ['text_unsafe', 'flags', 'flags_singlerow']  # 
for column in columns_to_clean:
	flagged_df[column] = clean_column(flagged_df[column])

# Columns that hold the manual (admin) corrections
admin_columns = ['admin_unsafe_tags', 'admin_unwanted_flags', 'admin_flag_reason']

try:
	admin_flag_df = pd.read_csv(admin_flag_path)
	# A column that is still completely empty is inferred as float64 by
	# read_csv; writing the first string label into it later would hit
	# pandas' incompatible-dtype warning/error. Keep these columns as object.
	admin_flag_df = admin_flag_df.astype({name: object for name in admin_columns})
except FileNotFoundError:
	# If admin file is not found, create a new one
	admin_flag_df = flagged_df[['id']].copy()
	for name in admin_columns:
		admin_flag_df[name] = None
	admin_flag_df.to_csv(admin_flag_path, index=False)

row_count = len(admin_flag_df)
admin_flag_df.head(2)

Unnamed: 0,id,admin_unsafe_tags,admin_unwanted_flags,admin_flag_reason
0,<urn:uuid:faff9b64-041c-4b98-8be4-7ff2a02e4b8d>,safe,unusable,"Report the feedback or report, discrimination,..."
1,<urn:uuid:77695799-0774-42a1-8eaa-5efbe154c4e0>,safe,safe,"Home renovations, mailboxes, and shed guidelin..."


In [2]:
flagged_df.head(2)

Unnamed: 0,id,url,text,text_unsafe,domain_unsafe,domain_unindexed,flags,flag_reason,flags_singlerow
0,<urn:uuid:faff9b64-041c-4b98-8be4-7ff2a02e4b8d>,http://38.paulosimoes.net/forms/feedback,We want to know how to best serve you. Please ...,safe,,,"sensitive_topic,unusable","Report discrimination, harassment, and sexual ...",safe
1,<urn:uuid:77695799-0774-42a1-8eaa-5efbe154c4e0>,http://aberdeencreekfl.com/ACCBusiness/Procedu...,Architectural Control Committee Policies and F...,safe,True,,safe,Useful knowledge,safe


## Manually correcting the flags

In [3]:
# Find the first row whose admin flags are still missing. When every row is
# already labelled (or there are no rows at all), point one past the last
# row so that the `current_index >= row_count` checks treat the review as
# complete. Previously `current_index` was left undefined for an empty frame.
current_index = next(
	(
		row for row in range(row_count)
		if is_na(admin_flag_df.at[row, 'admin_unsafe_tags'])
		or is_na(admin_flag_df.at[row, 'admin_unwanted_flags'])
	),
	row_count,
)
any_empty = current_index < row_count


def df_preview() -> str:
	"""Render the admin flags around the current row as a Markdown table.

	The window covers up to two rows before and two rows after
	``current_index`` and adds the first 100 characters of each row's text.

	Returns:
		The table produced by ``DataFrame.to_markdown``.
	"""
	start_index = max(current_index - 2, 0)
	end_index = min(current_index + 3, row_count)
	df_display = admin_flag_df.iloc[start_index:end_index].copy()

	df_display.drop(columns=['id', 'text_unsafe', 'flags', 'flag_reason'],
						errors='ignore', inplace=True)
	# Assignment aligns on the index, so only the displayed rows receive a text
	df_display['text'] = flagged_df['text'].copy()
	# Use the string accessor: Series.replace only matches whole cells, which
	# left embedded newlines in place and broke the Markdown table rows.
	df_display['text'] = (
		df_display['text'].str.slice(0, 100).str.replace('\n', '<br>', regex=False) + '...'
	)
	return df_display.to_markdown()

def get_current_value(column, original_column, return_string=False) -> str | list:
	"""Return the value to show for the current row.

	The admin value in ``admin_flag_df[column]`` is used when present;
	otherwise the value in ``flagged_df[original_column]`` is used; if both
	are missing the result is empty.

	Args:
		column: Column of ``admin_flag_df`` to read first.
		original_column: Fallback column of ``flagged_df``.
		return_string: When true, return the comma-joined string instead of a list.

	Returns:
		The comma-separated parts as a list, or joined back into one string.
	"""
	admin_value = admin_flag_df.at[current_index, column]
	if not is_na(admin_value):
		chosen = admin_value
	else:
		fallback_value = flagged_df.at[current_index, original_column]
		chosen = None if is_na(fallback_value) else fallback_value

	parts = chosen.split(',') if chosen else []
	return ','.join(parts) if return_string else parts

def get_current_harmful_values() -> list[str]:
	"""Return the unsafe tags of the current row as ``'<key>: <description>'`` labels.

	Keys are stripped before use so that a stored value such as ``'a, b'``
	produces the same label text that is built from ``harm_categories`` for
	the dropdown choices. Empty entries and every occurrence of ``safe_flag``
	are skipped.

	Returns:
		One label per remaining key. A key that is missing from
		``harm_categories`` gets ``True`` as its description (unchanged from
		the previous behaviour).
	"""
	labels = []
	for key in get_current_value('admin_unsafe_tags', 'text_unsafe'):
		key = key.strip()
		if not key or key == safe_flag:
			continue
		labels.append(f'{key}: {harm_categories.get(key, True)}')
	return labels

def update_ui():
	"""Build the values for the six components listed in ``outputs``.

	Order: text, DataFrame preview, unsafe-tags dropdown, unwanted-flags
	dropdown, flag-reason textbox, submit button. The function used to return
	seven values although only six output components are registered.

	Returns:
		A 6-tuple. When ``current_index`` is past the last row the inputs and
		the submit button are disabled; otherwise they are filled with the
		current row's values and (re-)enabled, so that navigating back after
		completion makes them editable again.
	"""
	if current_index >= row_count:
		return (
			'## Evaluation complete!',
			df_preview(),
			*[gr.update(interactive=False) for _ in range(4)],
		)
	return (
		f'Text {current_index}: {flagged_df.at[current_index, "text"]}',
		df_preview(),
		gr.update(value=get_current_harmful_values(), interactive=True),
		gr.update(value=get_current_value('admin_unwanted_flags', 'flags'), interactive=True),
		gr.update(
			value=get_current_value('admin_flag_reason', 'flag_reason', return_string=True),
			interactive=True,
		),
		gr.update(interactive=True),
	)

def next_text():
	"""Advance to the following row unless already at (or past) the last one, then refresh the UI."""
	global current_index
	can_advance = current_index < row_count - 1
	current_index = current_index + 1 if can_advance else current_index
	return update_ui()

def previous_text():
	"""Step back one row unless already at the first one, then refresh the UI."""
	global current_index
	if current_index <= 0:
		return update_ui()
	current_index -= 1
	return update_ui()

def restart_text():
	"""Jump back to the first row (index 0) and return the refreshed UI values."""
	global current_index
	current_index = 0
	return update_ui()

def goto_index(index):
	"""Jump to the requested row and return the refreshed UI values.

	Args:
		index: Row number from the 'Goto' field. ``None`` (empty field) keeps
			the current position instead of raising on ``int(None)``.

	Returns:
		Whatever ``update_ui()`` returns for the new position.
	"""
	global current_index
	if index is None:
		return update_ui()
	# Clamp into the valid range [0, row_count - 1]
	current_index = max(0, min(int(index), row_count - 1))
	return update_ui()

In [4]:
def evaluate_text(harmful_flags_input, unwanted_flags_input, flag_reason):
	"""Store the admin's answers for the current row, save, and move to the next row.

	Args:
		harmful_flags_input: Selected dropdown entries of the form
			``'<key>: <description>'`` (may be empty or None).
		unwanted_flags_input: Selected unwanted flags (may be empty or None).
		flag_reason: Free-text reason; stored as ``''`` when empty.

	Returns:
		The values produced by ``update_ui()`` for the row that is shown next.
	"""
	global current_index
	if current_index >= row_count:
		return update_ui()

	# Keep only the key part of each dropdown entry and discard unknown keys
	harmful_keys = [choice.split(':')[0].strip() for choice in harmful_flags_input or []]
	harmful_keys = [key for key in harmful_keys if key in harm_categories]
	admin_flag_df.at[current_index, 'admin_unsafe_tags'] = ','.join(harmful_keys) or safe_flag

	unwanted = [flag.strip() for flag in unwanted_flags_input or []]
	unwanted = [flag for flag in unwanted if flag in unwanted_flags]
	admin_flag_df.at[current_index, 'admin_unwanted_flags'] = ','.join(unwanted) or safe_flag
	admin_flag_df.at[current_index, 'admin_flag_reason'] = flag_reason or ''

	# Saved after every submit: human effort can't be reproduced easily and shouldn't be lost.
	admin_flag_df.to_csv(admin_flag_path, index=False)
	current_index += 1

	# update_ui() covers both the "next row" and the "evaluation complete" case,
	# so the result tuple is built in a single place.
	return update_ui()

# Build and launch the Gradio review UI: one text at a time with its flags,
# navigation buttons and a preview of the surrounding rows.
with gr.Blocks() as app:
	gr.Markdown('# Unwanted Text Flags Evaluation')
	# The index may point past the last row when everything is already
	# labelled; fall back to the last row so the widgets below have a row to show.
	if current_index >= row_count:
		current_index = row_count - 1

	text_output = gr.Markdown(value=f'Text {current_index}: {flagged_df.at[current_index, "text"]}')
	# Dropdown choices use the '<key>: <description>' form
	harm_categories_keys = [f'{key}: {value}' for key, value in harm_categories.items()]
	harmful_flags_input = gr.Dropdown(label='Is the text unsafe?', multiselect=True, 
		choices=harm_categories_keys, value=get_current_harmful_values(), 
	)
	unwanted_flags_input = gr.Dropdown(label='Unwanted flags?', 
		choices=unwanted_flags, multiselect=True, 
		value=get_current_value('admin_unwanted_flags', 'flags'), 
	)
	flag_reason = gr.Textbox(label='Flag reason', 
		value=get_current_value('admin_flag_reason', 'flag_reason', return_string=True), 
	)
	with gr.Row():
		submit_button = gr.Button('Submit', size='sm', elem_id='submit_button')
	with gr.Row():
		previous_button = gr.Button('Previous', size='sm', elem_id='previous_button')
		next_button = gr.Button('Next', size='sm', elem_id='next_button')
		restart_button = gr.Button('Restart', size='sm', elem_id='restart_button')
	with gr.Row():
		goto_input = gr.Number(label='Goto', value=current_index, precision=0)
		goto_button = gr.Button('Goto', size='sm')

	df_output = gr.Markdown(label='DataFrame Preview', value=df_preview())

	# Six components are updated by every handler below, in this order
	outputs = [text_output, df_output, harmful_flags_input, unwanted_flags_input,
				flag_reason, submit_button]
	submit_button.click(
		evaluate_text, inputs=[harmful_flags_input, unwanted_flags_input, flag_reason], 
		outputs=outputs
	)
	previous_button.click(previous_text, outputs=outputs)
	next_button.click(next_text, outputs=outputs)
	restart_button.click(restart_text, outputs=outputs)
	goto_button.click(goto_index, inputs=[goto_input], outputs=outputs)

	# Set up keyboard shortcuts in the browser: Enter clicks Submit,
	# Ctrl+ArrowRight clicks Next, Ctrl+ArrowLeft clicks Previous. The buttons
	# are found through the elem_id values assigned above; the window-level
	# flag keeps the listener from being registered more than once.
	app.load(js="""
		function setup_keyboard_shortcuts() {
			document.addEventListener('keydown', function(e) {
				if (e.key === 'Enter') {
					document.getElementById('submit_button').click();
				} else if (e.key === 'Escape') {
					// document.getElementById('end_edit_button').click();
				} else if (e.ctrlKey && e.key === 'ArrowRight') {
					document.getElementById('next_button').click();
				} else if (e.ctrlKey && e.key === 'ArrowLeft') {
					document.getElementById('previous_button').click();
				}
			});
		}
		if (window.setup_keyboard_shortcuts_called === undefined) {
			setup_keyboard_shortcuts();
			window.setup_keyboard_shortcuts_called = true;
		}
	""")
	app.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


## Copying admin flags to filter data

In [5]:
# Integrating admin_df columns into flagged df using ID - admin_unsafe_tags, admin_unwanted_flags

filtered_df = flagged_df.copy()

columns_to_copy = {
	'admin_unsafe_tags': 'text_unsafe',  # original column
	'admin_unwanted_flags': 'flags',
	# 'admin_flag_reason': 'flag_reason'
}

# Look the admin values up by `id` rather than by row position, so the result
# stays correct even if both frames differ in order or length.
# If an id occurs more than once in the admin file, its last entry wins.
admin_by_id = admin_flag_df.drop_duplicates(subset='id', keep='last').set_index('id')

for admin_column, original_column in columns_to_copy.items():
	admin_values = filtered_df['id'].map(admin_by_id[admin_column])
	# Rows without an admin value (not reviewed yet) keep their existing flag
	# instead of being overwritten with NaN, which would let them pass the filter.
	filtered_df[original_column] = admin_values.combine_first(filtered_df[original_column])

filtered_df.drop(columns=['flag_reason'], errors='ignore', inplace=True)
filtered_df.head(2)

Unnamed: 0,id,url,text,text_unsafe,domain_unsafe,domain_unindexed,flags,flags_singlerow
0,<urn:uuid:faff9b64-041c-4b98-8be4-7ff2a02e4b8d>,http://38.paulosimoes.net/forms/feedback,We want to know how to best serve you. Please ...,safe,,,unusable,safe
1,<urn:uuid:77695799-0774-42a1-8eaa-5efbe154c4e0>,http://aberdeencreekfl.com/ACCBusiness/Procedu...,Architectural Control Committee Policies and F...,safe,True,,safe,safe


## Filtering using flagged data

In [6]:
filtered_df.drop(columns=['url'], errors='ignore', inplace=True)

flag_columns = ['text_unsafe', 'domain_unsafe', 'domain_unindexed', 'flags']

# Clean the frame that is filtered below. The loop used to clean `flagged_df`,
# which has no effect on `filtered_df` because that is a separate copy.
for column in flag_columns:
	filtered_df[column] = clean_column(filtered_df[column])

# replace some strings with booleans
filtered_df[flag_columns] = filtered_df[flag_columns].replace({
	'True': True, 'true': True,
	'False': None, 'false': None, False: None,
	'Safe': None, 'safe': None, 
	'None': None, 'none': None, '': None,
	float('nan'): None,
})

scores_df = filtered_df[flag_columns].copy()  # to use later for heatmap data generation

# Print value counts for unsafe tags and transform using harm categories
unsafe_count = filtered_df['text_unsafe'].str.split(',').explode().str.strip().value_counts()
unsafe_count_transformed = unsafe_count.rename(index=harm_categories)
print(unsafe_count_transformed)
print('')

# print count of each value in unwanted flags; rows with several comma-separated
# values contribute once per value
flags_count = filtered_df['flags'].str.split(',').explode().str.strip().value_counts()
print(flags_count)
print('')

removal_reason_data = {}  # 'text_unsafe': 10, ...
removed_rows = 0

# Filter column by column: a row is counted under the first flag column that
# removes it, and only rows without any flag remain at the end.
print('Reasons for removal:')
for column in flag_columns:
	removal_count = filtered_df[column].notna().sum()
	removal_reason_data[column] = removal_count
	removed_rows += removal_count
	print(f'{column}: {removal_count}')
	filtered_df = filtered_df[filtered_df[column].isna()]  # allow rows with no flags
# Non-inplace drop: `filtered_df` is a filtered selection at this point
filtered_df = filtered_df.drop(columns=flag_columns + ['flags_singlerow'])
filtered_df = filtered_df.reset_index(drop=True)
print(f'Removed rows: {flagged_df.shape[0] - filtered_df.shape[0]} of {flagged_df.shape[0]}')
print(f'Retained rows: {filtered_df.shape[0]}')

# filtered_df.to_csv(filtered_data_filename, index=False)
filtered_df.head(2)

text_unsafe
Specialized Advice    4
Sexual Content        3
Privacy               2
Sex-Related Crimes    1
Non-Violent Crimes    1
Name: count, dtype: int64

flags
unusable           33
advertisement      30
sensitive_topic    12
biased              4
scam                2
religious           2
lottery             2
Name: count, dtype: int64

Reasons for removal:
text_unsafe: 9
domain_unsafe: 3
domain_unindexed: 5
flags: 59
Removed rows: 76 of 100
Retained rows: 24


Unnamed: 0,id,text
0,<urn:uuid:3bb2ccac-eb5e-4827-b321-95ca06142138>,Yesterday was a day with no apparent progress....
1,<urn:uuid:f0138133-8a24-4737-92be-764f68438d0b>,"Bengaluru, 6th May 2023: Indian Men’s Hockey T..."


## Analytics on accuracy of LLM-based flagging and safety

Aim of the analytics: an undesirable row should get flagged by the LLM; the accuracy of the flag type does not matter. \
For example, if a row is a scam but gets flagged as spam, it is still counted as a true positive.

In [7]:
# Build a frame that holds the automatic flags next to the admin-corrected ones
analytics_df = flagged_df.copy()

columns_to_copy = {
	'admin_unsafe_tags': 'text_unsafe',
	'admin_unwanted_flags': 'flags',
}

for admin_column, original_column in columns_to_copy.items():
	analytics_df[admin_column] = admin_flag_df[admin_column].copy()
	analytics_df[original_column] = flagged_df[original_column].copy()
analytics_df['flags_singlerow'] = flagged_df['flags_singlerow'].copy()

# Keep the id plus the columns that are compared
analytics_df_columns = ['id', 'text_unsafe', 'admin_unsafe_tags', 'flags', 'admin_unwanted_flags', 'flags_singlerow']  # 
analytics_df = analytics_df[analytics_df_columns]
value_columns = analytics_df_columns[1:]


def _is_flagged(value):
	# Any remaining value counts as flagged unless it is empty after stripping
	return bool(value and value.strip())


# Missing values and the safe flag both mean "not flagged"
analytics_df[value_columns] = analytics_df[value_columns].fillna(False).replace(safe_flag, False)
analytics_df[value_columns] = analytics_df[value_columns].apply(
	lambda col: col.map(_is_flagged)
)

def calculate_metrics(corrected_values, predicted_values):
	"""Count confusion-matrix cells and the accuracy for two boolean Series.

	Args:
		corrected_values: The reference values.
		predicted_values: The values being evaluated against the reference.

	Returns:
		Tuple ``(true_positives, false_positives, true_negatives,
		false_negatives, accuracy)`` where accuracy is the share of matching
		positives and negatives among ``len(corrected_values)`` rows.
	"""
	actual_yes = corrected_values == True
	actual_no = corrected_values == False
	predicted_yes = predicted_values == True
	predicted_no = predicted_values == False

	tp = (actual_yes & predicted_yes).sum()
	fp = (actual_no & predicted_yes).sum()
	tn = (actual_no & predicted_no).sum()
	fn = (actual_yes & predicted_no).sum()

	total = len(corrected_values)
	return tp, fp, tn, fn, (tp + tn) / total

# Score each automatically produced column against its admin-corrected counterpart
flags_metrics = calculate_metrics(
	analytics_df['admin_unwanted_flags'], analytics_df['flags']
)
text_unsafe_metrics = calculate_metrics(
	analytics_df['admin_unsafe_tags'], analytics_df['text_unsafe']
)
flags_singlerow_metrics = calculate_metrics(
	analytics_df['admin_unwanted_flags'], analytics_df['flags_singlerow']
)

tp_flags, fp_flags, tn_flags, fn_flags, accuracy_flags = flags_metrics
tp_text_unsafe, fp_text_unsafe, tn_text_unsafe, fn_text_unsafe, accuracy_text_unsafe = text_unsafe_metrics
(tp_flags_singlerow, fp_flags_singlerow, tn_flags_singlerow,
	fn_flags_singlerow, accuracy_flags_singlerow) = flags_singlerow_metrics


def _as_result_column(metrics):
	# Four counts followed by the accuracy formatted as a percentage
	*counts, accuracy = metrics
	return [*counts, f'{accuracy * 100:.2f}%']


# The flags_singlerow metrics are computed above but not included in the table
results = {
	'Metric': ['True Positives', 'False Positives', 'True Negatives',
				'False Negatives', 'Accuracy'],
	'text_unsafe': _as_result_column(text_unsafe_metrics),
	'flags': _as_result_column(flags_metrics),
}

pd.DataFrame(results)

Unnamed: 0,Metric,text_unsafe,flags
0,True Positives,7,62
1,False Positives,1,0
2,True Negatives,90,27
3,False Negatives,2,11
4,Accuracy,97.00%,89.00%


## Optimize text for fine-tuning

In [8]:
columns_to_keep = ['text', 'id', 'date']
columns_to_keep = [col for col in columns_to_keep if col in filtered_df.columns]
filtered_df = filtered_df[columns_to_keep]

# Reuse previously shortened texts when the file exists; otherwise start fresh
try:
	short_text_df = pd.read_csv(shortened_text_filename)
	new_filtered_df = filtered_df[columns_to_keep].copy()

	# keep rows that are in filtered_df and remove others
	short_text_df = short_text_df[short_text_df['id'].isin(new_filtered_df['id'])]
	# add missing rows from filtered_df
	missing_rows = new_filtered_df[~new_filtered_df['id'].isin(short_text_df['id'])]
	if not missing_rows.empty:
		# ignore_index gives every row a unique label. Without it the appended
		# rows can reuse labels of existing rows, and the label-based
		# `df.loc[i, ...]` write in shorten_text_df would hit several rows.
		short_text_df = pd.concat([short_text_df, missing_rows], ignore_index=True)
		short_text_df = short_text_df.drop_duplicates(subset='id')
	if 'finetune_text' not in short_text_df.columns:
		# the column is read by shorten_text_df, so make sure it exists
		short_text_df['finetune_text'] = None
except FileNotFoundError:
	short_text_df = filtered_df[columns_to_keep].copy()
	short_text_df['finetune_text'] = None

# take each row and ask the bot to shorten the text and make it suitable for fine-tuning dataset
shortener_prompt_template = (
	'You are a content moderator who is preparing a dataset for fine-tuning a language model. '
	'You have a text that needs to be shortened and made suitable for the dataset. \n'
	'Return the optimized text in the triple backticks. '
	'Retain important details like Date and Location. \n'
	'Original text: ```\n{original_text}\n```'
)

def get_shorter_text(original_text, max_retries=3):
	"""Ask the bot for a shortened version of ``original_text``.

	The reply is expected to carry the result between triple backticks. A
	reply without any triple backticks has its single backticks promoted to
	triple ones before the first fenced part is extracted.

	Args:
		original_text: Text inserted into ``shortener_prompt_template``.
		max_retries: Number of attempts before giving up.

	Returns:
		The extracted, stripped text, or ``None`` when every attempt failed
		(each failure is printed).
	"""
	fence = '```'
	for _attempt in range(max_retries):
		try:
			prompt = shortener_prompt_template.format(original_text=original_text)
			reply = get_bot_response(messages=[{'role': 'user', 'content': prompt}])
			if fence not in reply:
				# replace single backticks with triple backticks
				reply = reply.replace('`', fence)
			reply = reply.replace('```\n```', fence)
			# the text after the first fence is the answer
			extracted = reply.split(fence)[1].strip()
			if not extracted:
				raise Exception('Empty response')
			return extracted
		except Exception as e:
			print(f'Error: {e}. Retrying')
	return None

def shorten_text_df(df):
	"""Fill the missing ``finetune_text`` cells of ``df`` with shortened texts.

	Rows that already have a ``finetune_text`` are skipped. A row is also
	skipped (after calling ``print_error``) when its text is not a string,
	when no shortened text could be obtained, or when the result is longer
	than the original.

	Args:
		df: DataFrame with ``text`` and ``finetune_text`` columns; modified in place.

	Returns:
		The same DataFrame object.
	"""
	initial_length_sum = 0
	shortened_length_sum = 0
	# A column that is entirely empty after read_csv is float64; make sure
	# string results can be stored in it.
	df['finetune_text'] = df['finetune_text'].astype(object)
	target_column = df.columns.get_loc('finetune_text')

	# Write by position so duplicate index labels can never cause one result
	# to be written into several rows.
	for position, (_, row) in enumerate(df.iterrows()):
		if not pd.isna(row['finetune_text']):
			continue
		original_text = row['text']
		if not isinstance(original_text, str):
			print_error()
			continue
		shorter_text = get_shorter_text(original_text)
		# Measure the text that is actually stored (after stripping backticks)
		shorter_text = shorter_text.strip('`').strip() if shorter_text else ''
		if not shorter_text or len(shorter_text) > len(original_text):
			print_error()
			continue
		df.iloc[position, target_column] = shorter_text
		initial_length_sum += len(original_text)
		shortened_length_sum += len(shorter_text)
		print_progress()

	saved_length = initial_length_sum - shortened_length_sum
	if initial_length_sum and saved_length:
		saved_percent = (saved_length / initial_length_sum) * 100
		print(f'\nReduced: {saved_length}/{initial_length_sum} characters ({saved_percent:.2f}%)')
	return df

# Fill the missing `finetune_text` values (the frame is updated in place; the
# return value is not needed), then write the result to the shortened-text file.
shorten_text_df(short_text_df)
short_text_df.to_csv(shortened_text_filename, index=False)
print(f'Shortened text data size: {short_text_df.shape}')
short_text_df.head(2)

Shortened text data size: (24, 3)


Unnamed: 0,text,id,finetune_text
0,"Bengaluru, 6th May 2023: Indian Men’s Hockey T...",<urn:uuid:f0138133-8a24-4737-92be-764f68438d0b>,**Date:** 6th May 2023\n**Location:** Bengalur...
1,Saturday Night Jazz at the Black Dog presents ...,<urn:uuid:0c9c6845-fff5-4af4-9bd6-89ba498d96a5>,14-10-16 // VANCOUVER ART GALLERY\nFree public...


## Export the flags/reasons count to create a heatmap later

In [9]:
# Number of comma-separated entries per row (0 when the cell is empty)
scores_df['text_unsafe_count'] = scores_df['text_unsafe'].apply(lambda x: len(x.split(',')) if is_not_na(x) else 0)
scores_df['flags_count'] = scores_df['flags'].apply(lambda x: len(x.split(',')) if is_not_na(x) else 0)

# Add 1 for each domain_unsafe and domain_unindexed flag.
# NaN is truthy in Python, so missing values are excluded explicitly.
scores_df['domain_unsafe_count'] = scores_df['domain_unsafe'].apply(lambda x: 1 if pd.notna(x) and x else 0)
scores_df['domain_unindexed_count'] = scores_df['domain_unindexed'].apply(lambda x: 1 if pd.notna(x) and x else 0)

# Sum all counts to get the total flag_count
score_column = 'Reasons'
scores_df[score_column] = scores_df[['text_unsafe_count', 'flags_count', 'domain_unsafe_count', 'domain_unindexed_count']].sum(axis=1)

# .copy() so the x/y columns added below are not written into a slice
scores_df = scores_df[[score_column]].copy()

# print max and min
max_score = int(scores_df[score_column].max())
min_score = int(scores_df[score_column].min())
min_score_count = scores_df[scores_df[score_column] == min_score].shape[0]
print(f'Max score: {max_score}, Min score: {min_score}')

# for each score, print number of rows with that score
for score in range(min_score, max_score + 1):
	count = scores_df[scores_df[score_column] == score].shape[0]
	print(f'Score {score}: {count}')
print()


# create x,y values to create a heatmap matrix
num_rows = 5  # rows
# Ceiling division: a row count that is not a multiple of num_rows still fits
# into num_rows rows, and fewer than num_rows entries no longer give
# num_cols == 0 (which raised ZeroDivisionError below).
num_cols = max(1, -(-len(scores_df) // num_rows))
print(f'Rows: {num_rows}, Columns: {num_cols}')

# x cycles through 0 .. num_cols - 1
scores_df['x'] = [i % num_cols for i in range(len(scores_df))]
# y stays constant for num_cols entries, then increases by one
scores_df['y'] = [i // num_cols for i in range(len(scores_df))]

scores_df = scores_df[['x', 'y', score_column]]  # sort columns

latex_table_text = scores_df.to_csv(sep=' ', index=False)
latex_table_text = latex_table_text.strip()
# print(latex_table_text)

# write to a file in the data directory (created if it does not exist yet)
os.makedirs('data', exist_ok=True)
with open(os.path.join('data', 'heatmap_data.txt'), 'w', encoding='utf-8') as f:
	f.write(latex_table_text)
	print('Data written to heatmap_data.txt')

Max score: 4, Min score: 0
Score 0: 24
Score 1: 57
Score 2: 13
Score 3: 3
Score 4: 3

Rows: 5, Columns: 20
Data written to heatmap_data.txt
