# Causation Corpus classification 
#### Chain-of-Thought Self-Consistency

This notebook uses the chain-of-thought self-consistency prompting technique to classify sentences via LLMs.

## OpenAI Privacy Policy
This notebook uses OpenAI's API, meaning that your data will be sent to the OpenAI servers.

For concerns about how your data will be handled, please read through the Privacy Policy [here](https://openai.com/policies/api-data-usage-policies).

## 1. Supply API Key

In [None]:
# Supply the OpenAI API key through the project's helper instead of hard-coding it here.
# NOTE(review): presumably this shows an interactive prompt/widget - see causation.utils.
from causation.utils import openai_apikey_input

openai_apikey_input()

## 2. Upload Prompt

In [None]:
# Upload the TOML file holding the chain-of-thought exemplars.
# fileuploader returns a pair: the first item is shown as this cell's output, the second
# (`exemplars`) is read by later cells through exemplars.get('data').
# NOTE(review): presumably '.toml' restricts the accepted file type - confirm in causation.utils.
from causation.utils import fileuploader 

finput, exemplars = fileuploader('.toml')
finput

In [None]:
# encoding issue - 27.Sep.23 - some uploads end up with a stray space after the closing
# double quote at the end of a line; this rewrites the file with that space removed.
# it's okay to always run this cell with or without the encoding issue.
#
# Done in pure Python instead of `!sed` followed by `!mv` so that it behaves the same on
# every platform, is unaffected by shell-special characters in the path, and can never
# replace the upload with an empty file because the first shell command failed.
import re

assert exemplars.get('data'), "Did you upload your CoT examples in the previous cell?"

_exemplars_path = exemplars.get('data').absolute()
# Work on raw bytes so the file's encoding and line endings are left untouched.
_raw = _exemplars_path.read_bytes()
# With MULTILINE, `$` matches at the end of every line - the same anchor sed applies per line.
_cleaned = re.sub(rb'" $', b'"', _raw, flags=re.MULTILINE)
if _cleaned != _raw:
    _exemplars_path.write_bytes(_cleaned)

In [None]:
# Fail early with a readable message when no exemplar file has been uploaded.
assert exemplars.get('data'), "Did you upload your CoT examples in the previous cell?"

from llm_experiments import CoT

# Build the chain-of-thought object from the uploaded TOML exemplars.
cot = CoT.from_toml(exemplars.get('data'))
cot.shuffle_examples()                                         # improves result
# The f-string is the cell's last expression, so it is what the notebook displays.
f"These CoT example class distributions: {cot.class_dist()}"   # try to keep this balanced.

In [None]:
# [Optional] - randomly sample X CoT examples to reduce input tokens.
cot.sample(method='random', n=len(cot.examples))
cot.class_dist()

## 3. Configure Model

In [None]:
# Decoding parameters for the model and the number of completions requested per query.
from llm_experiments import SamplingScheme

sampling_scheme = SamplingScheme(top_p=0.8, temperature=1, presence_penalty=0.0)
n_completions = 3

# Self-consistency votes across several differing completions, so both settings below
# must allow more than one distinct answer.
assert n_completions > 1, "For the model to generate > 1 possibilities needed for self-consistency, n_completions must be > 1."
assert sampling_scheme.temperature > 0, "For the model to generate > 1 possibilities needed for self-consistency, temperature must be > 0."
# Display the configured scheme as the cell output.
sampling_scheme

In [None]:
# Combine the exemplars, sampling scheme and completion count into the self-consistency classifier.
from llm_experiments import CoTSC

cotsc = CoTSC.from_cot(model='gpt-3.5-turbo',  # for a larger context window (from 4k -> 16k tokens) replace with 'gpt-3.5-turbo-16k'
                       cot=cot,
                       sampling_scheme=sampling_scheme,
                       n_completions=n_completions)
# One-line summary of the effective settings; the braces of the model_kwargs repr are
# stripped so it reads as a flat list of key/value pairs.
f"{cotsc.model}   'temperature': {cotsc.llm.temperature}, {str(cotsc.llm.model_kwargs).lstrip('{').rstrip('}')}"

In [None]:
# Dry run on a sample sentence to check the prompt and its token count.
# NOTE(review): presumably dryrun() does not call the API and prints the prompt itself
# (the message below refers to "the above prompt") - confirm in llm_experiments.
prompt, num_tokens = cotsc.dryrun(query="Canberra immunologist Carola Vinuesa who discovered a gene responsible for the autoimmune diseases lupus and diabetes.")
f"This is a test run. Number of tokens in the above prompt: {num_tokens}."

## 4. Upload your dataset

In [None]:
# Upload the spreadsheet of sentences to classify.
# Same helper as for the prompt upload: the first item is shown as the cell output and the
# second (`dataset`) is read by the next cell through dataset.get('data').
from causation.utils import fileuploader

finput, dataset = fileuploader('.xlsx')
finput

In [None]:
import pandas as pd
# Fail early with a readable message when no dataset has been uploaded.
assert dataset.get('data'), "Did you upload your dataset?"
# Load the uploaded workbook into a DataFrame (pandas reads the first sheet by default).
df = pd.read_excel(dataset.get('data'))
# Preview a single row to confirm that the file was parsed as expected.
df.head(1)

In [None]:
# The classification loop reads its queries from the 'sentence' column, so it must exist.
assert 'sentence' in df.columns, "Missing 'sentence' column in your dataset."

In [None]:
# Report how many rows were loaded; each row becomes one query in the classification step.
f"Number of examples found: {len(df)}. Please continue."

## 5. Set up TikDollar (default cost_threshold is set to $1.0)

In [None]:
# Attach cost tracking to the classifier before any query is sent.
# NOTE(review): presumably raise_err=True makes the tracker raise the
# CostThresholdReachedException handled by the classification loop once cost_threshold
# (in dollars, per the section heading) is reached - confirm in llm_experiments.utils.
from llm_experiments.utils import TikDollar as td

# ⚠️ Caveat: When you rerun this cell, tikdollar is reset to 0!
tikdollar = td.track(cotsc, cotsc._tikdollar_run, cost_threshold=1.0, raise_err=True, verbose=False)
tikdollar  # starts out with zero cost accumulated.

## 6. Run Classification

In [None]:
import numpy as np
from tqdm.auto import tqdm

from llm_experiments.cot import CoT, CoTSC
from llm_experiments.cot.cot import CoTDataLeakException
from llm_experiments.utils.tikdollar import CostThresholdReachedException

# A falsy value (0 or None) disables checkpointing: the loop below only saves when
# `checkpointing` is truthy and the processed count is a multiple of it.
checkpointing = 200  # save after every {checkpointing} queries.
print(f"Results will be saved every {checkpointing} queries.")

# Column-name templates for the per-class output columns, e.g. 'vote[<class>]'.
VOTE_STR = 'vote[{clazz}]'
REASON_STR = 'reason[{clazz}]'

# Build the results frame: the input sentences plus, for every class, a vote counter
# (starting at 0) and a reasoning column (starting empty).
results_df = pd.DataFrame(df.loc[:, 'sentence'].copy(deep=True), columns=['sentence'])
for clazz in cotsc.classes:
    vote_col, reason_col = VOTE_STR.format(clazz=clazz), REASON_STR.format(clazz=clazz)
    results_df[vote_col] = 0
    results_df[reason_col] = ''

# Descending column order puts the vote[...] columns ahead of the reason[...] columns;
# 'sentence' is then moved to the front and 'raw_output' appended at the very end.
results_df = results_df.sort_index(axis=1, ascending=False)
other_cols = [col for col in results_df.columns if col != 'sentence']
results_df = results_df[['sentence'] + other_cols]
results_df['raw_output'] = ''

# Send every sentence to the model and record, per class, the vote count and reasoning.
#
# - Rows are addressed by their own index label rather than by loop position, so results
#   land on the right row even when the dataset's index is not the default 0..n-1.
# - A data leak or an unexpected error skips that sentence only; neither bypasses the
#   checkpoint that follows, so a checkpoint is never lost on the boundary.
# - Reaching the cost threshold stops the run; rows processed so far stay in results_df.
dleak_counter = 0
error_counter = 0
for i, (row_label, sent) in tqdm(enumerate(results_df.loc[:, 'sentence'].items()), total=len(results_df)):
    try:
        results = cotsc.run(query=sent)
        for clazz, clz_results in results.items():
            results_df.loc[row_label, VOTE_STR.format(clazz=clazz)] = clz_results.get('votes')
            results_df.loc[row_label, REASON_STR.format(clazz=clazz)] = "\n".join(clz_results.get('steps'))
            # NOTE(review): 'raw_output' is overwritten on every pass, so only the
            # completions of the last class iterated are kept - confirm this is intended.
            results_df.loc[row_label, 'raw_output'] = "\n".join(clz_results.get('completions'))
    except CoTDataLeakException as cotdle:
        # todo: make these prints alerts.
        print(cotdle)
        print("Data leak detected. Skipped.")
        dleak_counter += 1
    except CostThresholdReachedException as ctre:
        print(ctre)
        print(f"Number of queries sent: {i}.")
        break
    except Exception as e:
        # Best effort: keep going, but say which query failed and keep a tally.
        print(e)
        print(f"Unexpected error on query {i}. Skipped.")
        error_counter += 1

    if checkpointing and (i + 1) % checkpointing == 0:
        path = f'./cotsc-outputs-checkpoint-{i + 1}.xlsx'
        results_df.to_excel(path)
        print(f"Checkpointed at {i + 1} queries processed. Checkpoint file: {path}.")

if error_counter:
    print(f"Number of queries that failed with an unexpected error: {error_counter}.")
print(f"Number of examples leaked: {dleak_counter}. Please continue.")

In [None]:
# some postprocessing.
# Derive the majority class of every sentence from its vote[...] columns and move the
# new 'majority' column next to 'sentence'.
import re

import pandas as pd

# Select the vote columns by their literal 'vote[' prefix. The previous pattern r'vote*'
# means "vot" followed by any number of "e" anywhere in the name, so it could also pick
# up unrelated columns (e.g. a reason[...] column for a class whose name contains "vot").
vote_cols = [col for col in results_df.columns if re.match(r'vote\[', str(col))]
assert vote_cols, "No vote[...] columns found in results_df."

# Coerce to numbers so that rows holding missing or non-numeric votes count as zero.
votes = results_df[vote_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
majority = votes.idxmax(axis=1).apply(lambda col_name: re.search(r'\[(.*?)\]', col_name).group(1))

# Rows without a single vote (skipped, failed, or never reached) get an empty label
# instead of silently being assigned the first class.
results_df['majority'] = majority.where(votes.sum(axis=1) > 0, '')
results_df = results_df[['sentence', 'majority'] + [col for col in results_df.columns if col not in ('sentence', 'majority')]]

In [None]:
# Import from the documented public module and bring `display` in explicitly, rather
# than reaching into IPython.core and relying on `display` being injected as a builtin.
from IPython.display import HTML, display

print("Please continue.")

# Rendering is opt-in because it writes every row of the table into the cell output.
# Surrounding whitespace in the answer is ignored.
if input("Display results? (y/n): ").strip().lower() == 'y':
    display(HTML(results_df.to_html()))

## 7. Download - run the following cells in sequence.

In [None]:
from pathlib import Path
from datetime import datetime
import srsly

# Timestamp shared by the output folder created here and the zip archive built later.
now = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
output_dir = Path(f"./.cotsc-corpus-output-{now}")
output_dir.mkdir(exist_ok=False)  # raises if a folder with this timestamp already exists

# 1. cotsc-output.xlsx
results_df.to_excel(output_dir / 'cotsc-output.xlsx')

# 2. model config
path = output_dir / 'cotsc-config.json'
cotsc_config = dict(
    sampling_scheme=sampling_scheme.openai(),
    n_completions=cotsc.n_completions,
    model=cotsc.model,
    classes=cotsc.classes,
)
srsly.write_json(path, cotsc_config)

In [None]:
# 3. dataset
# 4. toml
# Everything that goes into the archive: the two uploads followed by every entry
# currently found in output_dir.
file_names = [exemplars['data'], dataset['data'], *output_dir.glob("*")]  # toml & dataset first
file_names

In [None]:
import zipfile
import os
from datetime import datetime
from pathlib import Path
import panel as pn

# Bundle the collected files into one archive named after the run's timestamp.
zfname = Path(f'{now}-cotsc-corpus.zip')
with zipfile.ZipFile(zfname, mode='w') as archive:
    for member in file_names:
        # Store each file under its bare name so the archive has no directory structure.
        archive.write(member, arcname=os.path.basename(member))
print(f"Saved as {zfname}.\nClick below to download.")

# download link for the zip.
pn.widgets.FileDownload(file=str(zfname), filename=zfname.name)