Copyright &copy; 2024 Praneeth Vadlapati

<!-- ## Setup:

Example of .env: 
```python
# Use any OpenAI-compatible API provider
# Groq is preferred for llama3.1 due to free fast responses
LM_PROVIDER_BASE_URL=https://api.groq.com/openai/v1
LM_API_KEY=
LM_MODEL=llama-3.1-70b-versatile
```

Installing packages:
```bash
pip install openai python-dotenv ucimlrepo tiktoken
``` -->

## A. 	Loading a Dataset and Setting Up a Language Model

In [None]:
import os
from io import StringIO

from IPython.display import display, Markdown
from ucimlrepo import fetch_ucirepo
import pandas as pd
import tiktoken
from common_functions import get_lm_response, extract_data, \
    print_progress, print_error, model, data_folder, random_state

# Name of the dataset to work with; it is used for the repository lookup and for the local cache filenames.
dataset_name = 'Rice (Cammeo and Osmancik)'
# Other dataset names that can be placed in `dataset_name`:
# 'Iris', 'Wine', 'Zoo', 'Raisin'
# 'Rice (Cammeo and Osmancik)'
# 'Statlog (German Credit Data)'
# 'Mushroom'

# Local cache files: the dataset as CSV, and a text file that stores the name of the label column.
filename = os.path.join(data_folder, f'{dataset_name}.csv')
labelcol_filename = os.path.join(data_folder, f'{dataset_name}_labelcol.txt')

# Load the dataset and the name of its label column.
# Any failure while reading the local cache (missing file, empty or unknown label column, parse error)
# ends up in the `except` branch, which fetches the dataset again and rewrites both cache files.
try:  # Try to load from the local files
	if not os.path.exists(filename):
		raise FileNotFoundError()
	data_df = pd.read_csv(filename)
	with open(labelcol_filename, 'r') as file:
		label_col = file.read().strip()
		if not label_col:
			err = f'Label column not found in {labelcol_filename}'
			print(err)
			raise ValueError(err)
		if label_col not in data_df.columns:
			err = f'Label column {label_col} not found in dataset'
			print(err)
			raise ValueError(err)
except Exception as e:
	# Deliberately broad: every cache problem is handled the same way, by re-fetching.
	print('Loading data from UCI repository')
	dataset = fetch_ucirepo(name=dataset_name)
	# Extract column names from metadata
	feature_columns = dataset.variables[dataset.variables['role'].str.lower() == 'feature']['name'].tolist()
	target_columns = dataset.variables[dataset.variables['role'].str.lower() == 'target']['name'].tolist()

	# Features first, then targets, side by side in one frame
	data_df = pd.concat([
		pd.DataFrame(dataset.data.features, columns=feature_columns),  # df_features,
		pd.DataFrame(dataset.data.targets, columns=target_columns),  # df_targets,
	], axis=1)
	# In column names, replace spaces and hyphens with underscores
	data_df.columns = data_df.columns.str.replace(' ', '_').str.replace('-', '_')
	# NOTE(review): assumes `data_folder` already exists — confirm it is created elsewhere
	data_df.to_csv(filename, index=False)

	# The last target column is the label; its name gets the same clean-up as the DataFrame columns
	label_col = target_columns[-1].replace(' ', '_').replace('-', '_')
	with open(labelcol_filename, 'w') as file:
		file.write(label_col)

predicted_col = f'{label_col}_predicted'
# Whole dataset as a fenced CSV block.
# NOTE(review): this snapshot is taken before the rounding below and is not referenced again
# in the visible part of the notebook — confirm it is still needed.
train_data = f'```csv\n{data_df.to_csv(index=False).strip()}\n```'

attempts_limit = 3  # later cells assign this name again

# Data pre-processing
# allow only 2 decimal places for float dtypes
for col in data_df.select_dtypes(float).columns:
	data_df[col] = data_df[col].round(2)
# if the label column is not object, convert it to string
# (later cells join the unique labels with ', '.join(), which needs strings)
if data_df[label_col].dtype != object:
	data_df[label_col] = data_df[label_col].astype(str)

print(f'Model: {model}')
# Token budget per request; later cells compare token counts against 85% of this value
context_tokens_limit = 15_000  # lesser values help avoid rate limits

print(f'Data rows: {len(data_df)}')
data_df.head()

## B. 	Creating Test Data

In [None]:
# Create synthetic test data: for every label, sample rows of that label from data_df
# and combine them in groups of `size` rows into one test row each.

# for each label, get count of rows with that label
label_counts = data_df[label_col].value_counts().to_dict()
# Empty frame with the same columns as data_df
test_df = data_df.copy()
test_df.drop(test_df.index, inplace=True)  # remove all rows

for label in label_counts:
	rows_count = label_counts[label]  # number of source rows that carry this label
	same_label_df = data_df[data_df[label_col] == label].copy()
	numeric_cols = same_label_df.select_dtypes([int, float]).columns
	non_numeric_cols = same_label_df.dtypes.index.difference(numeric_cols).difference([label_col])

	test_rows_count = 10  # rows to create per label
	if rows_count < 40:
		# Small classes: use 20% of the rows, rounded up only when the fraction is above 0.5
		test_rows_count = rows_count * 0.2
		if test_rows_count % 1 > 0.5:
			test_rows_count += 1
		test_rows_count = int(test_rows_count)
	# Always create at least one test row for a label that has data
	if test_rows_count < 1 and rows_count >= 1:
		test_rows_count = 1
	
	size = 2  # number of rows to average at a time
	random_rows_count = test_rows_count * size  # number of random rows to select
	if random_rows_count > rows_count:
		random_rows_count = rows_count
	# Select `random_rows_count` random rows of this label (seeded by random_state)
	random_rows = same_label_df.sample(n=random_rows_count, random_state=random_state).reset_index(drop=True)

	for i in range(0, random_rows_count, size):
		new_index = len(test_df)
		test_df.loc[new_index] = random_rows.iloc[i]  # copy data of the first row
		# test_df.loc[new_index, numeric_cols] = random_rows.loc[i:i+size-1, numeric_cols].mean()  # find averages of 2 rows
		for col in numeric_cols:
			# Mean over the group; the last group holds fewer rows when random_rows_count is not a multiple of `size`
			test_df.loc[new_index, col] = random_rows.iloc[i:i+size][col].mean()
			if same_label_df[col].dtype == int:  # if source column has int type, convert the result to int
				test_df.loc[new_index, col] = int(test_df.loc[new_index, col])
		for col in non_numeric_cols:  # use the most frequent value for non-numeric columns
			mode_value = random_rows.iloc[i:i+size][col].dropna().mode()
			test_df.loc[new_index, col] = mode_value[0] if not mode_value.empty else None

	# count of rows with that label in the test data
	print(f'{label}: {len(test_df[test_df[label_col] == label])} from {rows_count}')


test_df.reset_index(drop=True, inplace=True)
print(f'Test data rows: {len(test_df)}')
print(f'Test data cols: {len(test_df.columns)}')
test_df.head()

## C. 	Dividing data into chunks
### Finding the best number of chunks into which to divide the dataset

In [None]:
# use any tokenizer to count approximate number of tokens of all the models
# (the GPT-4 encoding serves as the estimate regardless of which model is configured)
enc = tiktoken.encoding_for_model('gpt-4')

chunk_count = 1  # current number of chunks
chunk_size = len(data_df) // chunk_count  # rows per chunk; recomputed while the chunk count is searched

def get_chunk_text(chunk_index, df=data_df):
	"""Return chunk number `chunk_index` of `df` as CSV text.

	A chunk is `chunk_size` consecutive rows (the global is read at call time),
	written without the index column and stripped of surrounding whitespace.
	"""
	first_row = chunk_size * chunk_index
	chunk_rows = df.iloc[first_row:first_row + chunk_size]
	return chunk_rows.to_csv(index=False).strip()

# Increase the number of chunks until the first chunk fits into 85% of the token budget.
while True:
	# Ceiling division: with floor division the last `len(data_df) % chunk_count` rows
	# would not belong to any chunk and would never be summarised.
	chunk_size = -(-len(data_df) // chunk_count)
	train_data_chunk = get_chunk_text(0)
	token_count = len(enc.encode(train_data_chunk))
	print_progress()
	# Stop when the chunk fits, or when it cannot get any smaller
	# (a single row, or no rows at all for an empty dataset).
	if token_count <= 0.85 * context_tokens_limit or chunk_size <= 1:
		break
	chunk_count += 1  # keep increasing until the chunk size fits the limit

# Rounding the chunk size up can leave trailing chunks without rows; count only chunks that hold data.
if chunk_size:
	chunk_count = -(-len(data_df) // chunk_size)

query_chunk_size = chunk_size * 2  # maximum number of retrieved rows shown to the model per prediction
print(f'Chunk count: {chunk_count}')

## D. 	Summarizing Each Chunk of Data

In [4]:
# Prompt for summarising one chunk of the data. It is filled in with str.format() using
# {train_data_chunk}, {label_column}, {tag} and {available_labels}.
patterns_prompt_template = '''
{train_data_chunk}
--
Act as an Explainable Machine Learning Model. Don't write code.
If there is a bias in the data, highlight it in bold first and how you will handle it.
If a small portion of data has unusual patterns or is suspicious, consider it noise or data poisoning,
mention it, and ignore it when creating a summary table.

Create a table by observing patterns for each label in the dataset.
Find exact patterns that separate each label from the rest.
Include the column name "Label ({label_column})", other columns from the data, and then "Num_rows" with the number of rows with that label.
Patterns should include be in "a, b, c" format for categories or "min-max [avg]" format for numbers.
Add a "Comments" column for each label to write comments about the patterns and any unusual patterns.
Write a table between tags <{tag}> and </{tag}>.
Ensure only 1 row per label is in the table.
The comments column is mandatory.

Available Labels: `{available_labels}`
'''.strip()

attempts_limit = 4  # attempts per chunk; replaces the value assigned in the first cell
tag = 'patterns'  # tag that wraps the table in the response (this global is reassigned by later cells)
comments_col = 'Comments'  # column that every chunk summary must contain
all_patterns = []  # accepted chunk summaries, as text

def comments_col_exists(df: pd.DataFrame):
	"""Tell whether any column name of `df` contains `comments_col`, ignoring case."""
	wanted = comments_col.lower()
	return any(wanted in name.lower() for name in df.columns)

# Bookkeeping for the chunk summarisation pass
failed_chunks = []  # numbers of the chunks that could not be summarised
chunks_errors = {}  # chunk number -> last error message

error_message = None
# chunk number -> True once summarised, None after a failure
chunk_result = {}  # 1: False, 2: None, ....
patterns_response = None  # most recent response text

# NOTE(review): dataset_first_word is not referenced again in the visible part of the notebook
dataset_first_word = dataset_name.split()[0].lower()
# enable saving final summary
final_summary_filename = os.path.join(data_folder, f'FinalSummary-{model}-{dataset_name}.csv')

In [None]:
# Reuse the final summary saved by an earlier run when the file exists and is not empty.
final_summary_df = (
	pd.read_csv(final_summary_filename)
	if os.path.exists(final_summary_filename) and os.path.getsize(final_summary_filename)
	else None
)

def save_final_summary(df: pd.DataFrame):
	"""Write `df` to `final_summary_filename` as CSV, leaving out the index column."""
	df.to_csv(path_or_buf=final_summary_filename, index=False)

def process_chunks():
	"""Ask the language model for a per-label pattern table for every data chunk.

	Chunks whose entry in `chunk_result` is already set are skipped, so calling
	the function again retries only the chunks that failed before. Each chunk
	gets up to `attempts_limit` attempts. A response is accepted when the text
	between the tags parses as a pipe-separated table with a comments column;
	the accepted text is appended to `all_patterns`.

	Side effects: rebinds the globals `failed_chunks` and `chunks_errors` for
	this pass, updates `chunk_result`, and stores the latest extracted response
	text in `patterns_response`.
	"""
	global chunk_result, failed_chunks, chunks_errors, patterns_response
	failed_chunks = []
	chunks_errors = {}
	for chunk_number in range(chunk_count):
		if chunk_number % 5 == 0:
			print_progress(chunk_number)
		# Skip chunks that were summarised in an earlier pass
		if chunk_result.get(chunk_number) is not None:
			print_progress(chunk_result[chunk_number])
			continue
		# Success is tracked with a flag: an exception with an empty message must still count as a failure
		succeeded = False
		error_message = None
		for _ in range(attempts_limit):
			raw_response = None  # what the model returned in this attempt, if anything
			extracted = None  # the text between the tags, if it could be extracted
			try:
				patterns_prompt = patterns_prompt_template.format(
					label_column=label_col,
					tag=tag, train_data_chunk=get_chunk_text(chunk_number),
					available_labels=', '.join(data_df[label_col].unique()),
				)
				raw_response = get_lm_response(patterns_prompt)  # get the patterns table
				extracted = extract_data(raw_response, tag=tag)
				if not extracted:
					raise ValueError('No data found in patterns response')
				patterns_response = extracted
				parsed_table = pd.read_csv(StringIO(extracted), sep='|', skiprows=0, skipinitialspace=True)
				# if no column contains "Comments", raise error
				if not comments_col_exists(parsed_table):
					raise ValueError(f'{comments_col} column is missing')
				all_patterns.append(extracted)
				chunk_result[chunk_number] = True
				print_progress()
				succeeded = True
				error_message = None
				break  # finish the attempt
			except Exception as e:
				chunk_result[chunk_number] = None
				# The response can be None here (for example when the request itself failed),
				# so it must not be concatenated to the message directly
				error_message = f'{e}{extracted or raw_response or ""}'
				print_error(e)
		# remember the chunk when every attempt failed
		if not succeeded:
			chunk_result[chunk_number] = None
			print_progress(f'F{chunk_number} ')
			failed_chunks.append(chunk_number)
			chunks_errors[chunk_number] = error_message

# Summarise the chunks unless a saved final summary was loaded; chunks that fail get one more pass.
if final_summary_df is None:
	print(f'Chunk count: {chunk_count}')
	process_chunks()

	if failed_chunks:
		print(f'Failed chunks: {failed_chunks}')
		process_chunks()  # second pass; chunks that already succeeded are skipped
		if failed_chunks:
			print(f'Failed chunks: {failed_chunks}')
			print(f'Chunks errors: {chunks_errors}')
			print('ERROR: Failed to generate patterns for some chunks')

	# Show the most recent response
	display(Markdown(patterns_response))
else:
	print('Final Summary already loaded')

## E. 	Generating a Final Summary of All the Summaries

In [None]:
# Work on a copy so that all_patterns keeps the accepted chunk summaries
summary_patterns = all_patterns.copy()
# Starting value for the number of summaries per request; the search loop below recomputes it
patterns_chunk_size = len(summary_patterns) // chunk_count

def get_patterns_chunk(chunk_index, data=None, size=None):
	"""Join one group of summaries into a single text block.

	Args:
		chunk_index: zero-based number of the group.
		data: list of summary texts. Defaults to the current value of the
			global `summary_patterns`. It is looked up at call time; a default
			written as `data=summary_patterns` would be bound once, when the
			function is defined, and would miss every later reassignment.
		size: number of summaries per group. Defaults to the global
			`patterns_chunk_size`.

	Returns:
		The summaries of the group joined by a `---` line and stripped, or an
		empty string when the group lies beyond the end of `data`.
	"""
	if data is None:
		data = summary_patterns
	if size is None:
		size = patterns_chunk_size
	chunk_start = chunk_index * size
	# Slicing stops at the end of the list, so the end needs no explicit clamp
	return '\n---\n'.join(data[chunk_start:chunk_start + size]).strip()

# Search for the number of summaries per request: split the list into more and more
# groups until the first group fits into 85% of the token budget.
patterns_chunk_count = 1
while True:
	patterns_chunk_size = len(summary_patterns) // patterns_chunk_count
	patterns = get_patterns_chunk(0)
	patterns_token_count = len(enc.encode(patterns))
	if not patterns_token_count > 0.85 * context_tokens_limit:
		break
	patterns_chunk_count += 1
print(f'Patterns - chunk count: {patterns_chunk_count}')
print(f'Patterns - chunk size: {patterns_chunk_size}')
# Results of the final summary step; filled in by the next cell
summary_data = summary_data_df = summary_data_text = None

In [None]:
# Start again from the accepted chunk summaries
summary_patterns = all_patterns.copy()

# Prompt for merging several summaries into one CSV table. It is filled in with str.format()
# using {all_summaries}, {label_column}, {tag} and {available_labels}.
summary_prompt_template = '''
{all_summaries}
--
Act as an Explainable Machine Learning Model.

Each summary above is generated from each chunk of the dataset by finding patterns of the data that separate each label from the rest.
Write a table to combine the summaries into a single summary.
Include the column name "Label ({label_column})", other columns, and "Num_rows" with the total number of rows with that label.
Add a "Comments" column for each label to write comments about the patterns and any unusual patterns.
Use the total values using the Num_rows column of each summary.

Write a table between tags <{tag}> and </{tag}>.
Ensure only 1 row per label is in the table and no extra rows are present.
Available Labels: `{available_labels}`

Respond with CSV text like: ```csv
Label ({label_column}), col1, col2, col3, Num_rows
"1", a, b, "c, d, e", 100
"2", a2, b2, "c2, d2, e2", 200
```
The CSV must be readable in a Pandas DataFrame.
Include quotes for cells with commas. All labels must be strings.
CSV must be between tags <{tag}> and </{tag}>.
'''.strip()
tag = 'patterns'  # tag that wraps the CSV in the response

def process_summary_data_df(summary_data):
	"""Parse a CSV summary written by the model, clean it, and return it as CSV text.

	Steps:
	1. Read the CSV; read it again with `index_col=False` when pandas used the
	   first column(s) as the index.
	2. Strip the column names and remove the `Unnamed...` columns.
	3. Turn the labels into stripped strings without surrounding `*`.
	4. Replace `Num_rows` with the real number of rows of each label in `data_df`.
	5. Remove placeholder rows: rows whose label has no rows in `data_df` and
	   whose pattern cells are all empty. When such a row is labelled
	   "unknown"/"other" and has a comment, the comment is appended to the
	   comment of the closest row above that is kept.

	The cleaned table is stored in the global `summary_data_df`.

	Args:
		summary_data: CSV text taken from the model response.

	Returns:
		The cleaned table as CSV text, without the index column.

	Raises:
		ValueError: the table has no rows or no column whose name contains 'Label'.
	"""
	global summary_data_df
	table = pd.read_csv(StringIO(summary_data))
	if table.empty:
		raise ValueError('Summary table has no rows')
	# An index that is not 0, 1, 2, ... means that pandas used leading columns as the index
	if isinstance(table.index, pd.MultiIndex) or table.index[0] != 0:
		table = pd.read_csv(StringIO(summary_data), index_col=False)
	table.columns = table.columns.str.strip()
	# drop unnamed columns; copy so that the assignments below do not write into a slice
	table = table.loc[:, ~table.columns.str.contains('^Unnamed')].copy()

	# first column name that has 'Label' in it
	label_candidates = table.columns[table.columns.str.contains('Label')]
	if label_candidates.empty:
		raise ValueError('Label column is missing in the summary')
	summary_label_col = label_candidates[0]
	# first column name that contains the comments column name, ignoring case (may be absent)
	summary_comments_col = next(
		(name for name in table.columns if comments_col.lower() in name.lower()), None)

	table[summary_label_col] = table[summary_label_col].astype(str).str.strip().str.strip('*')
	# Num_rows written by the model is replaced with the real count in the dataset (0 for unknown labels)
	real_counts = data_df[label_col].value_counts()
	table['Num_rows'] = table[summary_label_col].map(real_counts).fillna(0).astype(int)

	# Pattern cells are all cells except the label, the comment and the row count
	pattern_cols = [name for name in table.columns
					if name not in (summary_label_col, summary_comments_col, 'Num_rows')]
	placeholder_rows = []
	last_kept = None  # index of the closest row above that stays in the table
	for index, row in table.iterrows():
		is_placeholder = bool(pattern_cols) and row['Num_rows'] == 0 \
			and row[pattern_cols].isna().all()
		if not is_placeholder:
			last_kept = index
			continue
		label_lower = row[summary_label_col].lower()
		unknown_label = 'unknown' in label_lower or 'other' in label_lower
		if unknown_label and summary_comments_col is not None and last_kept is not None:
			extra_comment = row[summary_comments_col]
			previous_comment = table.at[last_kept, summary_comments_col]
			if pd.notna(extra_comment) and pd.notna(previous_comment) \
					and str(extra_comment).strip() and str(previous_comment).strip():
				# write through .at: a row taken with iloc is a copy and would not change the table
				table.at[last_kept, summary_comments_col] = f'{previous_comment}. {extra_comment}'
		placeholder_rows.append(index)
	# rows are removed after the loop, never while iterating
	table = table.drop(index=placeholder_rows)

	summary_data_df = table
	return table.to_csv(index=False).strip()  # convert back summary_data

# Build the final summary: either reuse the saved one, or keep merging the summaries
# in groups until a single summary is left.
if final_summary_df is not None:
	summary_data_df = final_summary_df
	summary_data_text = summary_data_df.to_csv(index=False).strip()
else:
	if len(summary_patterns) == 1:
		# A single chunk summary still has to go through the merge step once so that it is
		# rewritten as cleaned CSV. The empty entry only makes the loop below run; empty groups are skipped.
		summary_patterns.append('')

	while len(summary_patterns) > 1:
		print()
		summaries_before = len(summary_patterns)
		for attempt in range(attempts_limit):
			try:
				new_summary_patterns = []
				# number of groups, rounded up
				patterns_chunk_count = -(-len(summary_patterns) // patterns_chunk_size)
				print(f'Number of summaries: {len(summary_patterns)}')
				print(f'Number of chunks: {patterns_chunk_count}')
				for chunk_number in range(patterns_chunk_count):
					if chunk_number % 5 == 0:
						print_progress(chunk_number)
					# The current list is passed explicitly so that every round works on the
					# summaries produced by the previous round
					all_summaries = get_patterns_chunk(chunk_number, summary_patterns)
					if not all_summaries:
						continue
					patterns_prompt = summary_prompt_template.format(
						all_summaries=all_summaries, tag=tag, label_column=label_col,
						available_labels=', '.join(data_df[label_col].unique()),
					)
					summary_response = get_lm_response(patterns_prompt)
					summary_data = extract_data(summary_response, tag=tag)
					if not summary_data:
						raise ValueError('No data found in summary response')

					summary_data_text = process_summary_data_df(summary_data)
					new_summary_patterns.append(summary_data_text)
					print_progress()
				summary_patterns = new_summary_patterns
				print()
				break  # finish the attempt
			except Exception as e:
				print(e)
				print_error(e)
		else:
			# Without this the while loop would start the same failing round again, without end
			raise RuntimeError('Too many failed attempts to combine the summaries')
		if len(summary_patterns) >= summaries_before:
			# A round that does not reduce the number of summaries would repeat forever
			raise RuntimeError('Combining the summaries does not reduce their number')

	if summary_data_df is None:
		raise RuntimeError('No summary was generated, so there is nothing to save')
	save_final_summary(summary_data_df)

summary_data_df

## F. 	Retrieving Relevant Rows from the Dataset
### Fetching data from `df` to answer a query

In [33]:
# Prompt that asks the model for a df.query() expression selecting rows similar to the test row.
# It is filled in with str.format() using {dtypes_data}, {summary_data}, {test_df}, {tag}
# and {available_columns}.
dfquery_prompt_template = '''
Data types of the columns:\n{dtypes_data}
\n ---
Summary of the data:
{summary_data}
\n ---
Test data:
{test_df}
\n ---
Act as an Explainable Machine Learning Model.

Create a query by observing patterns for each label in the dataset and the test data.
The query should work on a Python Pandas DataFrame using the df.query() method.
Write a query between tags <{tag}> and </{tag}>.
I will ask to use the query response to predict the label of the test data.
Between tags, don't add any extra text other than query.
Ensure the query is short, simple, and concise and fetches only a few similar rows.

Example response with query:
<{tag}>
	(petal_length > 1.0 and petal_width < 1.0)
</{tag}>

Columns available for querying: `{available_columns}`

It must work with the df.query() method in the Python Pandas library.
The query must be short and must use less filters.
'''.strip()

df_query = None  # last query text written by the model
# NOTE(review): `tag` is a shared global and a later cell assigns 'prediction' to it
tag = 'dfquery'
def get_query_result(df_to_test, query_tag='dfquery'):
	"""Ask the model for a DataFrame query and return the matching rows of `data_df` as CSV text.

	The model sees the column dtypes, the final summary and the test row(s), and
	answers with an expression for `DataFrame.query`. Queries longer than 350
	characters and queries that match no rows are rejected and asked for again,
	up to `attempts_limit` times. At most `query_chunk_size` rows are returned.
	The last query text is kept in the global `df_query`.

	Args:
		df_to_test: DataFrame with the test row(s), without the label column.
		query_tag: tag that wraps the query in the response. It is a parameter
			of its own because the module-level `tag` is reassigned by other
			cells and no longer holds 'dfquery' when this function is called.

	Returns:
		CSV text (no index column) of the rows selected by the query.

	Raises:
		Exception: the last attempt ended with an error (the cause is chained).
		ValueError: the last attempt produced a rejected query.
	"""
	global df_query
	dfquery_prompt = dfquery_prompt_template.format(
		summary_data=summary_data_text, tag=query_tag,
		test_df=df_to_test.to_csv(index=False).strip(),
		available_columns=', '.join(data_df.columns),
		dtypes_data=str(data_df.dtypes).replace('\n', ' ; '),
	)
	for attempt in range(attempts_limit):
		df_query = None
		try:
			dfquery_response = get_lm_response(dfquery_prompt)  # get query
			df_query = extract_data(dfquery_response, tag=query_tag)
			if not df_query:
				raise ValueError('No query found in response')
			df_query = df_query.replace('\n', ' ').strip()
			if len(df_query) > 350:  # the prompt asks for a short query
				print_error()
				continue

			# NOTE(security): df_query is text written by the model, and DataFrame.query
			# evaluates it as an expression. Use this only with a trusted model/provider.
			query_result = data_df.query(df_query).copy()
			query_result.reset_index(drop=True, inplace=True)
			if not len(query_result):
				# Tell the model which query returned nothing before asking again
				dfquery_prompt += f'\n\n Last query that returned empty response: `{df_query}`'
				print_error()
				continue

			if len(query_result) > query_chunk_size:  # allow only `chunk_size` number of rows
				query_result = query_result.head(query_chunk_size)
			return query_result.to_csv(index=False).strip()
		except Exception as e:
			print_error(e)
			if attempt == attempts_limit-1:
				# keep the original error as the cause
				raise Exception('Too many attempts to optimize query') from e
			continue

	raise ValueError('Failed to get result for query')

## G. 	Generating Classifications and Calculating Accuracy

In [34]:
# Prompt that asks the model to classify one test row. It is filled in with str.format()
# using {query_result}, {summary_data}, {test_data}, {tag}, {reason_tag} and {available_labels}.
prediction_prompt_template = '''
A sample of the dataset with rows similar to the test data:\n{query_result}
\n ---
Summary of the data (with average values in parentheses):\n{summary_data}
\n ---
Test data:\n{test_data}
\n ---

Act as an Explainable Machine Learning Model.

Use the above data to make a prediction.
Write prediction between tags <{tag}> and </{tag}>, and the reason between <{reason_tag}> and </{reason_tag}>.
Between tags, don't add any extra text other than prediction.
Write a prediction for the test data.
Ignore noise in data and data poisoning attacks, and mention that it is the reason.

Sample response:
<{tag}> Iris-setosa </{tag}>
<{reason_tag}>
	(All rows have SepalLengthCm less than 1.0)
</{reason_tag}>

Available options: `{available_labels}`

Give the same priority to the Summary and Sample of the dataset.
'''.strip()

tag = 'prediction'  # tag that wraps the predicted label
reason_tag = 'reason'  # tag that wraps the explanation
tag_title = tag.capitalize()  # capitalised spelling of the tag, accepted as well

# State of the evaluation; it is kept between calls so that a retry continues where it stopped
prediction = None
predictions = []  # one text line per evaluated test row
correct_predictions = 0
failed_indexes = []  # test rows for which every attempt failed
error_messages = {}  # test row index -> last error message
predictions_by_id = {}  # test row index -> True/False (correct or not), None after a failure

attempts_limit = 3
error_message = None

In [None]:
def get_accuracy(indices = None):
	"""Classify test rows with the language model and return the accuracy so far.

	For each row, similar rows are retrieved with `get_query_result`, the model
	is asked for a prediction, and the text between the prediction tags is
	compared with the expected label. Rows that already have a result in
	`predictions_by_id` are skipped, so a second call with the failed indices
	retries only those rows. The counters and lists are module globals and
	accumulate over calls.

	Args:
		indices: positions of the rows of `test_df` to evaluate; all rows when
			None or empty. The loop uses each row's index label as its position
			in `test_df`, which holds because `test_df` has a 0..n-1 index.

	Returns:
		Tuple `(accuracy, failed_indexes, error_messages)`. Accuracy is the
		number of correct predictions divided by the number of rows of
		`test_df` (0.0 for an empty test set).
	"""
	global prediction, predictions, correct_predictions
	global failed_indexes, error_messages, error_message
	data = test_df.iloc[indices] if indices is not None and len(indices) else test_df
	for index in data.index:
		if index % 10 == 0:
			print_progress(index)
		if predictions_by_id.get(index) is not None:
			print_progress(predictions_by_id[index])
			continue
		# Success is tracked with a flag: an exception with an empty message must still count as a failure
		succeeded = False
		error_message = None
		for _ in range(attempts_limit):
			try:
				test_row_df = test_df.iloc[index:index+1].drop(label_col, axis=1)
				query_result = get_query_result(test_row_df)
				prediction_prompt = prediction_prompt_template.format(
					tag=tag, reason_tag=reason_tag, summary_data=summary_data_text, df_query=df_query,
					test_data=test_row_df.to_csv(index=False).strip(), query_result=query_result,
					available_labels=', '.join(data_df[label_col].unique()),
				)
				prediction = get_lm_response(prediction_prompt)  # get predictions
				if f'<{tag}>' in prediction:
					prediction = extract_data(prediction, tag=tag)
				elif f'<{tag_title}>' in prediction:
					prediction = extract_data(prediction, tag=tag_title)
				else:
					raise Exception('no tag in response')
				if not prediction:
					raise Exception('empty prediction')
				prediction = prediction.strip(',').strip()

				expected_label = test_df.iloc[index][label_col]
				is_correct = (prediction == expected_label)
				if not is_correct:
					# Accept answers such as "Class 1". Both sides are lower-cased; comparing a
					# lower-cased prediction with the label as written can never match a label
					# that contains upper-case letters.
					expected_lower = str(expected_label).lower()
					if prediction.lower() in (
						f'class {expected_lower}',
						f'class-{expected_lower}',
						f'classification {expected_lower}',
					):
						prediction = expected_label
						is_correct = True
				predictions.append(f'{prediction} expected: {expected_label} correct: {is_correct}')
				if is_correct:
					correct_predictions += 1
				predictions_by_id[index] = is_correct
				print_progress(is_correct)
				succeeded = True
				error_message = None

				break  # finish the attempt
			except Exception as e:
				predictions_by_id[index] = None
				print_error(e)
				error_message = str(e) or repr(e)
				continue
		if succeeded:
			# A row that works on a retry is no longer a failure
			if index in failed_indexes:
				failed_indexes.remove(index)
			error_messages.pop(index, None)
		else:
			print_progress(f'F{index} ')
			if index not in failed_indexes:
				failed_indexes.append(index)
			error_messages[index] = error_message

	print('\n correct_predictions:', correct_predictions)
	accuracy = correct_predictions / len(test_df) if len(test_df) else 0.0
	return accuracy, failed_indexes, error_messages

# Sizes that appear in the log line and in the results file
train_rows, test_rows = len(data_df), len(test_df)
test_cols = len(test_df.columns)
print(f'{model} - {dataset_name} - {train_rows} train rows & {test_rows} test rows * {test_cols} columns')

# First pass over all test rows, then one retry for the rows that failed
accuracy, failed_indexes, error_messages = get_accuracy()
print(f'\n Accuracy: {accuracy:.2%}')
if failed_indexes:
	print(f'Retrying for failed indices: {failed_indexes}')
	accuracy, failed_indexes, error_messages = get_accuracy(failed_indexes)
	if failed_indexes:
		print(f'Failed again for indices: {failed_indexes}')
	print(f'\n Final Accuracy: {accuracy:.2%}')

# Save the result: write the header when the file is missing or empty, then append one line per run
result_file = 'results.txt'
if not (os.path.exists(result_file) and os.path.getsize(result_file)):
	with open(result_file, 'w') as file:
		file.write('Model - Dataset - Train Rows & Test Rows - Accuracy\n')

result_line = f'{model} - {dataset_name} - {train_rows} & {test_rows} * {test_cols} cols - {accuracy:.2%}\n'
with open(result_file, 'a') as file:
	file.write(result_line)

print(result_line)
print(f'Prediction count: {len(predictions)}')
predictions