In [2]:
# !pip install weaviate-client
# !pip install llama_index==0.9.14
# !pip install llama_index==0.9.24
# !pip install openai
# !pip install --upgrade pydantic==1.10.12 typing-extensions==4.5.0
# !pip install fastcore
# !pip install typing-extensions==4.7

In [2]:
import os
from configparser import ConfigParser
import pandas as pd
import numpy as np

from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

from llama_index.schema import TextNode
from llama_index.llms import OpenAI as OpenAILLama
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Show full cell contents in DataFrame output. `None` is the supported
# "no limit" value; the former `-1` was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

  pd.set_option('display.max_colwidth',-1)


In [None]:
#source: https://docs.llamaindex.ai/en/stable/examples/finetuning/embeddings/finetune_embedding.html

## Load conf, LLM

In [3]:
# Load the OpenAI API key from the local config file and export it for the client.
conf_path = 'conf/conf.ini'
config=ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it parsed, so an empty result means the config was not loaded.
if not config.read(conf_path):
    raise FileNotFoundError(f"Could not read config file: {conf_path}")
os.environ["OPENAI_API_KEY"] = config['openai']['apikey']

In [4]:
llm = OpenAILLama(model="gpt-3.5-turbo", temperature=0.1)

# Data

## Load data

In [5]:
df=pd.read_excel('data/riigikogu_w_meta/data_all.xlsx')
df.shape

(410335, 23)

In [6]:
df.head()

Unnamed: 0,heading,speaker,index_pk,index_snd,year,month,day,time,text_wo_punct,date,...,ntoks_splitted,doc_id,topic_id,topic_prob,Name,cluster_name,cluster_name_very_high,topic_reduce_outliers,Name_red_outliers,fation
0,15:00 Istungi rakendamine,Esimees Ene Ergma,PKP-18479,SND-439218,2010,1,11,15:00,tere päevast lugupeetud riigikogu ilusat jätkuvat aastat teile kõigile alustame riigikogu täiskogu vii istungjärgu esimese töönädala esmaspäevast istungit,2010-01-11,...,18,154548,574,0.402741,574_töönädala_istungjärgu_istungit_alustame,istungjärgu_töönädala_arupärimisi_istungit_tere,palun_lugemine_juhtivkomisjoni_läbirääkimisi_kõnesoove,574,574_töönädala_istungjärgu_istungit_alustame,
1,1.\n 15:01 Riigikogu liikme Katrin Karisma-Krummi ametivanne,Esimees Ene Ergma,PKP-18480,SND-439219,2010,1,11,15:01,head kolleegid palun tähelepanu seoses riigikogu liikme ülle rajasalu volituste lõppemisega tema nimetamise tõttu harju maavanemaks asus alates eelmise aasta 21. detsembrist riigikogu liikmeks asendusliige katrin karisma-krumm lugupeetud riigikogu meil on nüüd meeldiv võimalus ära kuulata riigikogu liikme katrin karisma-krummi ametivanne palun,2010-01-11,...,42,154549,448,1.0,448_ametivande_ametivanne_liikmeks_tagasiastumisega,ametivande_ametivanne_kuulame_liikmeks_tagasiastumisega,aastal_eurot_euroopa_aasta_kui,448,448_ametivande_ametivanne_liikmeks_tagasiastumisega,
2,1.\n 15:01 Riigikogu liikme Katrin Karisma-Krummi ametivanne,Katrin Karisma-Krumm,PKP-18480,SND-439220,2010,1,11,15:01,asudes täitma oma kohustusi riigikogu liikmena riigikogu xi koosseisus annan vande jääda ustavaks eesti vabariigile ja tema põhiseaduslikule korrale aplaus,2010-01-11,...,20,154550,1343,1.0,1343_ustavaks_vande_vabariigile_asudes,ustavaks_põhiseaduslikule_asudes_vabariigile_korrale,palun_lugemine_juhtivkomisjoni_läbirääkimisi_kõnesoove,1343,1343_ustavaks_vande_vabariigile_asudes,Eesti Reformierakonna fraktsioon
3,15:02 Istungi rakendamine,Esimees Ene Ergma,PKP-18481,SND-439223,2010,1,11,15:02,aitäh palun kolleeg mailis reps,2010-01-11,...,5,154555,-1,0.0,-1_ma_te_me_et,-1_ma_te_me_et,-1_ma_te_me_et,1262,1262_reps_mailis_teine_küsimus,
4,15:02 Istungi rakendamine,Esimees Ene Ergma,PKP-18481,SND-439225,2010,1,11,15:02,aitäh palun kolleeg eiki nestor,2010-01-11,...,5,154558,331,0.87407,331_nestor_eiki_küsimuseks_protseduuriline,nestor_eiki_kolleeg_palun_küsimuseks,palun_lugemine_juhtivkomisjoni_läbirääkimisi_kõnesoove,331,331_nestor_eiki_küsimuseks_protseduuriline,


In [7]:
df.year.value_counts()

2022    50569
2016    34844
2017    34692
2021    34099
2020    33611
2010    31222
2018    29045
2014    27792
2012    25102
2023    25000
2015    24340
2011    21893
2019    21078
2013    17048
Name: year, dtype: int64

In [8]:
embs_ar=np.load('data/riigikogu_w_meta/embs_all.npy', allow_pickle=True)
embs_ar.shape

(410335,)

## Let's keep meaningful topics?

In [9]:
df_topic_info=pd.read_excel('data/riigikogu_w_meta/topics_info2022_manual_review.xlsx')
df_topic_info.shape

(2107, 9)

In [10]:
meaningful_topics=df_topic_info[df_topic_info.manual_cluster=='sisukas'].Name.tolist()
len(meaningful_topics)

1159

In [11]:
df[df.Name_red_outliers=='-1_ma_te_me_et'].shape

(96704, 23)

In [12]:
df[(df.Name_red_outliers=='-1_ma_te_me_et')&(df.ntoks_splitted>6)].shape

(95108, 23)

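## Train/validation split

Original plan: use year 2022 as the train year and year 2023 as the test year. The code below currently draws a random sample across all years instead (the year-based split is commented out).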

In [13]:
# df=df[df.Name_red_outliers.isin(meaningful_topics)]
df=df[df.Name_red_outliers.isin(['-1_ma_te_me_et'])&(df.ntoks_splitted>6)]
df.shape

(95108, 23)

In [14]:
#remove duplicated text
df=df[~df.text_splitted.duplicated()]
df.shape

(95106, 23)

In [15]:
# df[df.year==2023].Name_red_outliers.value_counts().sum()

In [16]:
# df[df.year==2022].Name_red_outliers.value_counts().sum()

In [17]:
# idx_train=df[(df.year==2022)].sample(1000, random_state=43).index
# idx_val=df[(df.year==2023)].sample(300, random_state=43).index

# Draw 1000 training rows at random, then 300 validation rows from the rows
# that were not picked for training (fixed seed so the split is repeatable).
# NOTE(review): the recorded output of this cell reports 2000 training rows,
# which does not match sample(1000) — the output looks stale; re-run to confirm.
idx_train=df.sample(1000, random_state=43).index
idx_val=df[~df.index.isin(idx_train)].sample(300, random_state=43).index

# Texts are looked up by index label; embeddings are looked up by using the
# same labels as positions in embs_ar.
# NOTE(review): this assumes df still carries the original row positions as
# its index (df was only filtered after read_excel, never re-indexed) and that
# embs_ar is row-aligned with the unfiltered data — confirm.
texts_train=df.loc[idx_train.tolist()].text_splitted.tolist()
texts_val=df.loc[idx_val.tolist()].text_splitted.tolist()
embs_train=embs_ar[idx_train]
embs_val=embs_ar[idx_val]

# Sanity check: number of texts and embeddings per split.
len(texts_train), len(texts_val), embs_train.shape, embs_val.shape

(2000, 300, (2000,), (300,))

In [18]:
len(set(idx_train)&set(idx_val))

0

## Create nodes

In [19]:
def create_nodes(texts, embs_ar):
    """Build one ``TextNode`` per text and attach its precomputed embedding.

    Args:
        texts: Sized sequence of strings; each becomes the ``text`` of a node.
        embs_ar: Sized sequence of embeddings aligned with ``texts`` by
            position (``embs_ar[i]`` belongs to ``texts[i]``).

    Returns:
        List of ``TextNode`` objects in the same order as ``texts``.

    Raises:
        ValueError: If ``texts`` and ``embs_ar`` differ in length, because the
            positional pairing would then be misaligned.
    """
    if len(texts) != len(embs_ar):
        raise ValueError(
            f"texts and embs_ar must have the same length, "
            f"got {len(texts)} and {len(embs_ar)}"
        )

    nodes = []
    for text, emb in zip(texts, embs_ar):
        node = TextNode(text=text)
        # Set after construction, as the original did, so the embedding is
        # stored as given.
        node.embedding = emb
        nodes.append(node)
    return nodes

nodes_train=create_nodes(texts_train, embs_train)
nodes_val=create_nodes(texts_val, embs_val)
len(nodes_train), len(nodes_val)

(2000, 300)

## Generate synthetic queries

In [20]:
qa_generate_prompt_tmpl = 'Context information is below.\n\n---------------------\n{context_str}\n---------------------\n\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\n\nYou are a Teacher/ Professor. Your task is to setup {num_questions_per_chunk} questions for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided. Please create questions only in Estonian."\n'

In [21]:
# train_dataset = generate_qa_embedding_pairs(nodes_train, llm, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)
# val_dataset = generate_qa_embedding_pairs(nodes_val, llm, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)

In [22]:
# train_dataset.save_json("data/riigikogu_w_meta/embedding_finetune/train_dataset.json")
# val_dataset.save_json("data/riigikogu_w_meta/embedding_finetune/val_dataset.json")

In [23]:
train_dataset = generate_qa_embedding_pairs(nodes_train, llm, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)

 46%|████▌     | 920/2000 [1:24:43<1:39:27,  5.53s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
val_dataset = generate_qa_embedding_pairs(nodes_val, llm, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)

In [None]:
# train_dataset.save_json("data/riigikogu_w_meta/embedding_finetune/train_dataset.json")

In [None]:
# train_dataset

## Save data

In [None]:
train_dataset.save_json("data/riigikogu_w_meta/embedding_finetune/train_dataset.json")
val_dataset.save_json("data/riigikogu_w_meta/embedding_finetune/val_dataset.json")

# Train

## Load data

In [None]:
train_dataset = EmbeddingQAFinetuneDataset.from_json("data/riigikogu_w_meta/embedding_finetune/train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("data/riigikogu_w_meta/embedding_finetune/val_dataset.json")

## Run embedding finetuning

In [None]:
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="intfloat/multilingual-e5-base",
    model_output_path="test_model",
    val_dataset=val_dataset,
)

In [None]:
finetune_engine.finetune()

In [None]:
embed_model = finetune_engine.get_finetuned_model()

## Evaluate model

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

Define eval function
Option 1: We use a simple hit rate metric for evaluation:

for each (query, relevant_doc) pair,

we retrieve top-k documents with the query, and

it’s a hit if the results contain the relevant_doc.

This approach is very simple and intuitive, and we can apply it to both the proprietary OpenAI embedding as well as our open source and fine-tuned embedding models.

In [None]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Hit-rate evaluation of an embedding model on a QA retrieval dataset.

    Every corpus entry is indexed with ``embed_model``. Each query then
    retrieves its ``top_k`` nodes, and the query counts as a hit when the
    first relevant document id listed for it is among the retrieved ids.

    Args:
        dataset: Object exposing ``corpus`` (id -> text mapping), ``queries``
            (id -> query text mapping) and ``relevant_docs`` (query id ->
            indexable sequence of document ids).
        embed_model: Embedding model handed to ``ServiceContext.from_defaults``.
        top_k: Number of nodes retrieved per query.
        verbose: Accepted but not used by this function.

    Returns:
        List with one dict per query, holding the keys ``is_hit``,
        ``retrieved``, ``expected`` and ``query`` (the query id).
    """
    doc_texts = dataset.corpus
    query_texts = dataset.queries
    expected_by_query = dataset.relevant_docs

    ctx = ServiceContext.from_defaults(embed_model=embed_model)
    corpus_nodes = [
        TextNode(id_=doc_id, text=doc_text)
        for doc_id, doc_text in doc_texts.items()
    ]
    vector_index = VectorStoreIndex(
        corpus_nodes, service_context=ctx, show_progress=True
    )
    retriever = vector_index.as_retriever(similarity_top_k=top_k)

    results = []
    for qid, qtext in tqdm(query_texts.items()):
        got_ids = [hit.node.node_id for hit in retriever.retrieve(qtext)]
        # Only the first relevant document of each query is considered.
        want_id = expected_by_query[qid][0]
        results.append(
            {
                "is_hit": want_id in got_ids,
                "retrieved": got_ids,
                "expected": want_id,
                "query": qid,
            }
        )
    return results

Option 2: We use the InformationRetrievalEvaluator from sentence_transformers.

This provides a more comprehensive suite of metrics, but we can only run it against the sentencetransformers compatible models (open source and our finetuned model, not the OpenAI embedding model).

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(
    dataset,
    model_id,
    name,
):
    """Score a SentenceTransformer model with ``InformationRetrievalEvaluator``.

    Args:
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs``,
            all forwarded to the evaluator.
        model_id: Model name or path given to ``SentenceTransformer``.
        name: Name passed to the evaluator.

    Returns:
        Whatever calling the evaluator on the model returns. The evaluator is
        given ``results/`` as its output path; that directory is created if
        it is missing.
    """
    doc_texts = dataset.corpus
    query_texts = dataset.queries
    expected_by_query = dataset.relevant_docs

    ir_evaluator = InformationRetrievalEvaluator(
        query_texts, doc_texts, expected_by_query, name=name
    )
    st_model = SentenceTransformer(model_id)

    results_dir = "results/"
    Path(results_dir).mkdir(parents=True, exist_ok=True)
    return ir_evaluator(st_model, output_path=results_dir)

## Evaluate OpenAI embeddings

In [None]:
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

In [None]:
df_ada = pd.DataFrame(ada_val_results)

In [None]:
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

## Local model

In [None]:
# NOTE: despite the `bge` variable name, the model id below is
# intfloat/multilingual-e5-base (the same base model that is fine-tuned above).
bge = "local:intfloat/multilingual-e5-base"
# Hit-rate evaluation of that model on the validation set.
bge_val_results = evaluate(val_dataset, bge)

In [None]:
df_bge = pd.DataFrame(bge_val_results)

In [None]:
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

In [None]:
evaluate_st(val_dataset, "intfloat/multilingual-e5-base", name="e5")

## Finetuned models

In [None]:
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)

In [None]:
df_finetuned = pd.DataFrame(val_results_finetuned)

In [None]:
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

In [None]:
df_finetuned["is_hit"].value_counts(normalize=True)

In [None]:
evaluate_st(val_dataset, "test_model", name="finetuned")

In [None]:
df_res_e5=pd.read_csv('results/Information-Retrieval_evaluation_e5_results.csv')
df_res_fine_tune=pd.read_csv('results/Information-Retrieval_evaluation_finetuned_results.csv')
df_res_e5['model'] = 'e5'
df_res_fine_tune['model'] = 'fine_tuned'
df_st_all = pd.concat([df_res_e5, df_res_fine_tune])
df_st_all = df_st_all.set_index('model')
df_st_all

## Review some texts

In [None]:
df_finetuned

In [None]:
df_finetuned[~df_finetuned.is_hit]

In [None]:
#expected
[n for n in nodes_val if n.id_=='dbcf0086-bef5-4ff9-8b0a-7099f1ed52d4'][0].text

In [None]:
#retrieved
[n for n in nodes_val if n.id_=='5a245ed8-716e-4b8e-9ec4-16c27c16daee'][0].text

In [None]:
#retrieved
[n for n in nodes_val if n.id_=='b779c5d5-124b-4f36-9beb-900c0cbe2fe0'][0].text

## Test finetuned model

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
model_fintuned = SentenceTransformer('test_model')

In [None]:
model_orig=SentenceTransformer("intfloat/multilingual-e5-base")

In [None]:
df.speaker.value_counts()[:50]

In [None]:
speakers2keep=['Rahandusminister Jürgen Ligi', 'Peaminister Taavi Rõivas', 'Rahandusminister Sven Sester', 
               'Majandus- ja kommunikatsiooniminister Juhan Parts', 'Henn Põlluaas']

In [None]:
df[df.speaker.isin(speakers2keep)].shape

In [None]:
df_sample=df[df.speaker.isin(speakers2keep)]
df_sample.speaker.value_counts()

#### Train, test data

In [None]:
texts=df_sample.text_splitted.tolist()
labels=df_sample.speaker.tolist()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
embs_train_orig=model_orig.encode(X_train)
embs_train_finet=model_fintuned.encode(X_train)

In [None]:
embs_test_orig=model_orig.encode(X_test)
embs_test_finet=model_fintuned.encode(X_test)

#### Train classifier on original model embeddings

In [None]:
clf_orig=LinearSVC()
clf_orig.fit(embs_train_orig, y_train)

In [None]:
# predict eval data
pred_labels = clf_orig.predict(embs_test_orig)
print(classification_report(y_test, pred_labels))

#### Train classifier on fine-tuned model embeddings

In [None]:
clf_finet=LinearSVC()
clf_finet.fit(embs_train_finet, y_train)

In [None]:
# predict eval data
pred_labels = clf_finet.predict(embs_test_finet)
print(classification_report(y_test, pred_labels))

## Fine-tune a Two-Layer Adapter

In [None]:
#source: https://docs.llamaindex.ai/en/stable/examples/finetuning/embeddings/finetune_embedding_adapter.html

In [None]:
# requires torch dependency
from llama_index.embeddings.adapter_utils import TwoLayerNN

from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
from llama_index.embeddings import AdapterEmbeddingModel

In [None]:
base_embed_model = resolve_embed_model("local:intfloat/multilingual-e5-base")

In [None]:
adapter_model = TwoLayerNN(
    768,  # input dimension
    1024,  # hidden dimension
    768,  # output dimension
    bias=True,
    add_residual=True,
)

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="modele5_output_test",
    model_checkpoint_path="model5_ck",
    adapter_model=adapter_model,
    epochs=25,
    verbose=True,
)

In [None]:
finetune_engine.finetune()

In [None]:
embed_model_2layer = finetune_engine.get_finetuned_model(
    adapter_cls=TwoLayerNN
)

### Evaluate

In [None]:
# Load the trained adapter from "modele5_output_test" (the path given as
# model_output_path when the finetune engine was created) and wrap the base
# embedding model with it.
embed_model_2layer = AdapterEmbeddingModel(
    base_embed_model,
    "modele5_output_test",
    TwoLayerNN,
)

In [None]:
ft_val_results_2layer = evaluate(val_dataset, embed_model_2layer)

In [None]:
pd.DataFrame(ft_val_results_2layer).is_hit.value_counts(normalize=True)

In [None]:
pd.DataFrame(ft_val_results_2layer).is_hit.mean()

## Use this model in training

In [None]:
# embed_model_2layer.get_text_embedding(['see'])

In [None]:
# embs_train_orig=model_orig.encode(X_train)
embs_train_finet_cust=[embed_model_2layer.get_text_embedding(t) for t in X_train]

In [None]:
# embs_test_orig=model_orig.encode(X_test)
embs_test_finet_cust=[embed_model_2layer.get_text_embedding(t) for t in X_test]

In [None]:
clf_finet_cust=LinearSVC()
clf_finet_cust.fit(embs_train_finet_cust, y_train)

In [None]:
pred_labels = clf_finet_cust.predict(embs_test_finet_cust)
print(classification_report(y_test, pred_labels))

## Custom adapter layer

In [None]:
from llama_index.embeddings.adapter_utils import BaseAdapter
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from typing import Dict

In [None]:
class CustomNN(BaseAdapter):
    """Custom two-layer NN transformation applied to an embedding.

    Is a copy of our TwoLayerNN, showing it here for notebook purposes.

    Computes ``linear2(relu(linear1(embed)))``. When ``add_residual`` is set,
    the result is ``residual_weight * linear2(...) + embed``, where
    ``residual_weight`` is a learnable scalar initialised to 0.

    Args:
        in_features (int): Input dimension.
        hidden_features (int): Hidden dimension.
        out_features (int): Output dimension.
        bias (bool): Stored on the instance and reported by
            ``get_config_dict``. Note that both linear layers are always
            created with a bias term regardless of this flag.
            Defaults to False.
        add_residual (bool): Whether to add the input embedding to the
            weighted output. Defaults to False.

    Raises:
        ValueError: If ``add_residual`` is True and ``in_features`` differs
            from ``out_features``.

    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        out_features: int,
        bias: bool = False,
        add_residual: bool = False,
    ) -> None:
        super().__init__()
        # The residual branch adds the raw input to the output, so both must
        # have the same width. Fail here rather than at forward time, where a
        # mismatch would either error or be silently broadcast.
        if add_residual and in_features != out_features:
            raise ValueError(
                "add_residual=True requires in_features == out_features, "
                f"got {in_features} and {out_features}"
            )

        self.in_features = in_features
        self.hidden_features = hidden_features
        self.out_features = out_features
        self.bias = bias

        # NOTE: bias is always enabled on both layers; the `bias` argument is
        # only recorded in the config. Left unchanged on purpose, because
        # honouring the flag would change the module's parameter set.
        self.linear1 = nn.Linear(in_features, hidden_features, bias=True)
        self.linear2 = nn.Linear(hidden_features, out_features, bias=True)
        self._add_residual = add_residual
        # Created unconditionally (init to 0); only used in forward() when
        # add_residual is set.
        self.residual_weight = nn.Parameter(torch.zeros(1))

    def forward(self, embed: Tensor) -> Tensor:
        """Forward pass.

        Args:
            embed (Tensor): Input tensor.

        Returns:
            Tensor: Output of the two linear layers with a ReLU in between,
            plus the weighted residual connection when enabled.

        """
        hidden = F.relu(self.linear1(embed))
        output = self.linear2(hidden)

        if self._add_residual:
            output = self.residual_weight * output + embed

        return output

    def get_config_dict(self) -> Dict:
        """Get config dict (the constructor arguments of this instance)."""
        return {
            "in_features": self.in_features,
            "hidden_features": self.hidden_features,
            "out_features": self.out_features,
            "bias": self.bias,
            "add_residual": self._add_residual,
        }

In [None]:
custom_adapter = CustomNN(
    768,  # input dimension
    2048,  # hidden dimension
    768,  # output dimension
    bias=True,
    add_residual=True,
)

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="custom_modele5_output_test",
    model_checkpoint_path="custom_model_ck",
    adapter_model=custom_adapter,
    epochs=25,
    verbose=True,
)

In [None]:
finetune_engine.finetune()

In [None]:
# adapter_cls takes the adapter *class* (as in the TwoLayerNN call and the
# AdapterEmbeddingModel(..., CustomNN) call in this notebook), not the
# trained `custom_adapter` instance.
embed_model_custom = finetune_engine.get_finetuned_model(
    adapter_cls=CustomNN
)

In [None]:
# Load the adapter from a saved training checkpoint: "custom_model_ck" is the
# model_checkpoint_path given to the finetune engine, and "step_000" is one
# checkpoint directory inside it.
embed_model_custom_ckp = AdapterEmbeddingModel(
    base_embed_model,
    "custom_model_ck/step_000",
    CustomNN,
)



## Evaluate 

In [None]:
ft_val_results_custom = evaluate(val_dataset, embed_model_custom_ckp)
pd.DataFrame(ft_val_results_custom).is_hit.value_counts(normalize=True)

## Classification

In [None]:
# embs_train_orig=model_orig.encode(X_train)
embs_train_finet_cust_v2=[embed_model_custom_ckp.get_text_embedding(t) for t in X_train]

In [None]:
# embs_test_orig=model_orig.encode(X_test)
embs_test_finet_cust_v2=[embed_model_custom_ckp.get_text_embedding(t) for t in X_test]

In [None]:
clf_finet_cust_v2=LinearSVC()
clf_finet_cust_v2.fit(embs_train_finet_cust_v2, y_train)

In [None]:
pred_labels = clf_finet_cust_v2.predict(embs_test_finet_cust_v2)
print(classification_report(y_test, pred_labels))