<a href="https://colab.research.google.com/github/PuchToTalk/FinBERT/blob/fine-tuning/LlamaIndex_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install llama_index
!pip install pypdf


Collecting pypdf
  Downloading pypdf-3.16.1-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.3/276.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.16.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import sys
sys.path.insert(0,'/content/drive/MyDrive/Stanford_Research/Work/finBERT-convex')



# **TUTO FINE-TUNING EXAMPLE WITH LLAMAINDEX**

In [36]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

In [37]:
TRAIN_FILES = ["lyft_2021.pdf"]
VAL_FILES = ["uber_2021.pdf"]

TRAIN_CORPUS_FPATH = "train_corpus.json"
VAL_CORPUS_FPATH = "val_corpus.json"

In [38]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    parser = SimpleNodeParser.from_defaults()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [39]:
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['lyft_2021.pdf']
Loaded 144 docs


Parsing documents into nodes:   0%|          | 0/144 [00:00<?, ?it/s]

Parsed 225 nodes
Loading files ['uber_2021.pdf']
Loaded 160 docs


Parsing documents into nodes:   0%|          | 0/160 [00:00<?, ?it/s]

Parsed 842 nodes


In [41]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
train_dataset = generate_qa_embedding_pairs(train_nodes)
val_dataset = generate_qa_embedding_pairs(val_nodes)

#train_dataset.save_json("train_dataset.json")
#val_dataset.save_json("val_dataset.json")



# **VERSION ON FINBERT FINANCE DATASETS**

In [34]:
TRAIN_FILES = ["zetrain.csv"]
VAL_FILES = ["zevalidation.csv"]

TRAIN_CORPUS_FPATH = "zetrain.csv"
VAL_CORPUS_FPATH = "zevalidation.csv"

In [35]:
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['zetrain.csv']


ParserError: ignored

In [45]:
# Step 1: Install LlamaIndex if not already installed
# pip install llama_index

# Step 2: Import necessary modules
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
from llama_index.finetuning import generate_qa_embedding_pairs
import pandas as pd
import torch
import torch.nn as nn

# Step 3: Load your dataset or corpus from a CSV file
# Replace 'data.csv' with the path to your CSV file
data = pd.read_csv('zetrain.csv', delimiter=';')
val = pd.read_csv('zevalidation.csv', delimiter=';')



# Assuming your CSV has columns named 'text' and 'labels', adjust as needed
text_data = data['text'].tolist()
labels_data = data['label'].tolist()

text_val = val['text'].tolist()
labels_val = val['label'].tolist()

# Create a list of (text, label) pairs for fine-tuning
qa_pairs = [(text, label) for text, label in zip(text_data, labels_data)]

qa_pairs2 = [(text2, label2) for text2, label2 in zip(text_val, labels_val)]

qa_df = pd.DataFrame(qa_pairs, columns=['question', 'context'])
qa_val = pd.DataFrame(qa_pairs2, columns=['question', 'context'])

# Convert the DataFrame to a list of dictionaries
qa_data = qa_df.to_dict(orient='records')
qa_val = qa_val.to_dict(orient='records')

train_size = int(0.8 * len(qa_data))

# Generate QA embedding pairs
#train_dataset = generate_qa_embedding_pairs(qa_data[:train_size])
#val_dataset = generate_qa_embedding_pairs(qa_data[train_size:])




In [50]:
import json

# Save qa_data to a JSON file
with open("train_dataset.json", "w") as train_json_file:
    json.dump({"data": qa_data}, train_json_file)

# Save qa_val to a JSON file
with open("val_dataset.json", "w") as val_json_file:
    json.dump({"data": qa_val}, val_json_file)


In [51]:
import json

# Load train_dataset from JSON
with open("train_dataset.json", "r") as train_json_file:
    train_data_dict = json.load(train_json_file)
    train_dataset = train_data_dict["data"]

# Load val_dataset from JSON
with open("val_dataset.json", "r") as val_json_file:
    val_data_dict = json.load(val_json_file)
    val_dataset = val_data_dict["data"]


In [55]:
# requires torch dependency
from llama_index.embeddings.adapter_utils import TwoLayerNN

from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
from llama_index.embeddings import AdapterEmbeddingModel

Fine-tuning

In [58]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_

In [59]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
import torch

base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model_output_test",
    # bias=True,
    epochs=4,
    verbose=True,
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.01}
)
finetune_engine.finetune()
embed_model = finetune_engine.get_finetuned_model()

Downloading (…)2ee0e/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)6ec182ee0e/README.md:   0%|          | 0.00/89.0k [00:00<?, ?B/s]

Downloading (…)c182ee0e/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)2ee0e/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)6ec182ee0e/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)182ee0e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

AttributeError: ignored

In [None]:
# Step 4: Define the base embedding model
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

# Step 5: Define your custom convex neural network adapter architecture
class CustomConvexNNAdapter(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(CustomConvexNNAdapter, self).__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(output_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Step 6: Fine-tune the adapter with the custom architecture
finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model_output_test",
    model_checkpoint_path="model_checkpoint",
    adapter_model=CustomConvexNNAdapter(input_dim=384, output_dim=384),  # Adjust dimensions as needed
    epochs=4,
    verbose=True,
)
finetune_engine.finetune()

# Step 7: Retrieve the fine-tuned model
embed_model = finetune_engine.get_finetuned_model(adapter_cls=CustomConvexNNAdapter)  # Use CustomConvexNNAdapter here

# Step 8: Use the fine-tuned model for evaluation or downstream tasks
# You can now evaluate the model as needed

Evaluation

In [None]:
# load model from checkpoint in the midde
embed_model_2layer = AdapterEmbeddingModel(
    base_embed_model,
    "model5_output_test",
    TwoLayerNN,
)

In [None]:
from eval_utils import evaluate, display_results
ft_val_results_2layer = evaluate(val_dataset, embed_model_2layer)


In [None]:
# comment out if you haven't run ada/bge yet
display_results(
    ["ada", "bge", "ft_2layer"],
    [ada_val_results, bge_val_results, ft_val_results_2layer],
)

# uncomment if you just want to display the fine-tuned model's results
# display_results(["ft_2layer"], [ft_val_results_2layer])

In [None]:
# load model from checkpoint in the midde
embed_model_2layer_s900 = AdapterEmbeddingModel(
    base_embed_model,
    "model5_ck/step_900",
    TwoLayerNN,
)

In [None]:
ft_val_results_2layer_s900 = evaluate(val_dataset, embed_model_2layer_s900)


In [None]:
# comment out if you haven't run ada/bge yet
display_results(
    ["ada", "bge", "ft_2layer_s900"],
    [ada_val_results, bge_val_results, ft_val_results_2layer_s900],
)

# uncomment if you just want to display the fine-tuned model's results
# display_results(["ft_2layer_s900"], [ft_val_results_2layer_s900])