# Environment Setup

In [1]:
! pip install predibase
! pip install datasets

Collecting predibase
  Downloading predibase-2024.8.4.tar.gz (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.3/87.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dataclasses-json==0.5.7 (from predibase)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl.metadata (22 kB)
Collecting deprecation (from predibase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting ipyplot (from predibase)
  Downloading ipyplot-1.1.2-py3-none-any.whl.metadata (7.2 kB)
Collecting predibase-api==2024.8.4 (from predibase)
  Downloading predibase-api-2024.8.4.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting progress-table==0.1.26 (from predibase)
  Downloading progress-table-0.1.26.tar.gz (12 kB)
  Installing build dependencies ... [?25l[?

In [2]:
from predibase import Predibase, FinetuningConfig, DeploymentConfig
from datasets import Dataset, load_dataset

# Never store the API token in the notebook itself: read it from the
# PREDIBASE_API_TOKEN environment variable, and fall back to an interactive
# (non-echoing) prompt when the variable is unset or empty.
import os
from getpass import getpass

pb = Predibase(
    api_token=os.environ.get("PREDIBASE_API_TOKEN") or getpass("Predibase API token: ")
)

# Solar LLM (solar-1-mini-chat-240612) Fine-tuning

In [5]:
! pip install pandas



In [7]:
# Prompt template with three positional `{}` slots, filled in this order:
# question, retrieved context, response.
sample_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below is a user question, paired with retrieved context. Write a response that appropriately answers the question,
include specific details in your response. <|eot_id|>

<|start_header_id|>user<|end_header_id|>

### Question:
{}

### Context:
{}

<|eot_id|>

### Response: <|start_header_id|>assistant<|end_header_id|>
{}"""


def formatting_prompts_func(examples):
    """Render one prompt string per example of a batched Financial QA mapping.

    Args:
        examples: Mapping with parallel sequences under the keys
            "question", "context" and "answer".

    Returns:
        A dict with a single key, "text", holding the list of filled-in
        templates (one per example, in input order).

    Note:
        Nothing is appended after the response; in particular no EOS token
        is added by this function.
    """
    triples = zip(examples["question"], examples["context"], examples["answer"])
    return {"text": [sample_prompt.format(q, ctx, ans) for q, ctx, ans in triples]}

# Load the train split and add the formatted "text" column, processing rows in batches.
dataset = load_dataset("virattt/financial-qa-10K", split="train").map(
    formatting_prompts_func,
    batched=True,
)

# Rename the dataset's question/answer fields to prompt/completion
def rename_columns(example):
    """Move 'question' to 'prompt' and 'answer' to 'completion'.

    The mapping is modified in place and the same object is returned.
    Raises KeyError if either source key is missing.
    """
    for old_key, new_key in (("question", "prompt"), ("answer", "completion")):
        example[new_key] = example.pop(old_key)
    return example

# Apply the renaming to every row.
# NOTE(review): after this step 'prompt' holds the raw question, not the formatted
# 'text' column built earlier — confirm this is the intended training prompt.
dataset = dataset.map(rename_columns)

# Upload the dataset. `from_pandas_dataframe` expects a pandas DataFrame, so convert
# the Hugging Face Dataset explicitly instead of passing it through as-is.
dataset = pb.datasets.from_pandas_dataframe(dataset.to_pandas(), name="financial-qa-10K")

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

In [8]:
# Create the adapter repository that the fine-tuned adapter versions are stored under.
# exists_ok=True presumably makes this a no-op lookup when the repo already exists — confirm.
repo = pb.repos.create(
    name="sec-10-k-chatbot",
    description="solar llm finetuning for creating a sec-10-k-chatbot",
    exists_ok=True,
)

In [9]:
# Fine-tuning hyperparameters for the Solar base model.
finetuning_config = FinetuningConfig(
    base_model="solar-1-mini-chat-240612",
    epochs=10,
    rank=16,
    learning_rate=0.0002,
)

# Create an adapter: submit the fine-tuning job on the uploaded dataset.
# Per the job log, this call blocks until the job has finished.
adapter = pb.adapters.create(
    config=finetuning_config,
    dataset=dataset,
    repo=repo,
    description="solar llm finetuning for creating a sec-10-k-chatbot",
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `sec-10-k-chatbot/1`. (Job UUID: b1e64fa1-e42a-4af7-850d-b67267195956).

Watching progress of finetuning job b1e64fa1-e42a-4af7-850d-b67267195956. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:05:47         
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint │ train_loss │ validation_loss │
├────────────┼────────────┼─────────────────┤
│     1      │   1.0840   │        --       │
│     2      │   1.2849   │        --       │
│     3      │   1.0287   │        --       │
│     4      │   1.1419   │        --       │
│     5      │   0.9008   │        --       │
│     6      │   0.8630   │        --       │
└────────────┴────────────┴─────────────────┘


In [10]:
# Get adapter info after training: look the adapter up by its "<repo>/<tag>" identifier
pb.adapters.get("sec-10-k-chatbot/1")

Adapter(repo='sec-10-k-chatbot', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='solar llm finetuning for creating a sec-10-k-chatbot', artifact_path='b1e64fa1-e42a-4af7-850d-b67267195956/60940739f42a41ebb5a9a5df17f287ef/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='b1e64fa1-e42a-4af7-850d-b67267195956')

In [11]:
# Display the Adapter object returned by pb.adapters.create() in the training cell
adapter

Adapter(repo='sec-10-k-chatbot', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='solar llm finetuning for creating a sec-10-k-chatbot', artifact_path='b1e64fa1-e42a-4af7-850d-b67267195956/60940739f42a41ebb5a9a5df17f287ef/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='b1e64fa1-e42a-4af7-850d-b67267195956')

In [12]:
# Build the "<repo>/<tag>" identifier for the adapter that was just trained
adapter_id = f"{adapter.repo}/{adapter.tag}"
adapter_id

'sec-10-k-chatbot/1'

# Run Inference on the Fine-tuned Adapter

In [13]:
# Create the client for the serverless deployment of the base model.
# Without this, `lorax_client` is undefined on a fresh kernel and the cell raises NameError.
lorax_client = pb.deployments.client("solar-1-mini-chat-240612")

text = "What were the primary drivers of the notable increase in research and development expenses from Apple for fiscal year 2023?"
# Prompt the adapter to generate a response via serverless endpoints.
# `adapter_id` is the "<repo>/<tag>" string computed from the trained adapter above.
print(lorax_client.generate(text, adapter_id=adapter_id).generated_text)

The primary drivers of the 28% increase in research and development expenses for fiscal year 2023 were higher employee-related costs due to increased headcount and higher performance-based bonuses, as well as an increase in engineering project costs and data center infrastructure.


# Appendix: Data Preprocessing

In [None]:
import pandas as pd

# Load the raw complaints CSV and shorten the two column names used downstream:
# the narrative becomes "Complaint" and the JSON target becomes "completion".
df = pd.read_csv("customer_complaint_demo_data_raw.csv").rename(
    columns={
        "Consumer complaint narrative": "Complaint",
        "Structured JSON Output": "completion",
    }
)
df.head()

In [None]:
# Prompt template, filled per row with str.format().
# - {Complaint} is replaced by the value of the "Complaint" column of `df`.
# - The doubled braces {{ and }} are str.format escapes that render as literal { and }.
# - Lines after the first are indented inside the string, so that indentation is part of the prompt.
tpl = """You are a support agent for a public financial company and a customer has raised a complaint.
    Generate a structured JSON output with the following fields "product", "issue", and "generatedCompanyResponse".

    Here is an example structure:
    {{
      "product": "...",
      "issue": "...",
      "generatedCompanyResponse": "..."
    }}

    The value for "generatedCompanyResponse" should be a polite response to the following complaint.

    ### Complaint: {Complaint}

    ### Structured JSON Output:"""

# Name of the `df` column whose values become the "completion" column of the processed frame
output_column_name = "completion"

In [None]:
# Process the data to fit the prompt template.
# Each row is passed to the template as keyword arguments, so every {placeholder}
# in `tpl` must match a column name of `df`.
prompts = [tpl.format(**record) for record in df.to_dict(orient="records")]

# Build the frame in one step. `.to_numpy()` pairs each completion with its prompt
# by position, so the result is correct even if `df` does not have a default 0..n-1 index.
new_df = pd.DataFrame(
    {
        "prompt": prompts,
        "completion": df[output_column_name].to_numpy(),
    }
)

print(new_df.head())

                                              prompt  \
0  You are a support agent for a public financial...   
1  You are a support agent for a public financial...   
2  You are a support agent for a public financial...   
3  You are a support agent for a public financial...   
4  You are a support agent for a public financial...   

                                          completion  
0  {\n"product": "Student loan",\n"issue": "Strug...  
1  {\n"product": "Credit reporting",\n"issue": "I...  
2  {\n"product": "Credit card or prepaid card",\n...  
3  {\n"product": "Credit card",\n"issue": "Payoff...  
4  {\n"product": "Money transfer, virtual currenc...  
