### Using the filtered data with LLMs

Copyright &copy; 2024 Praneeth Vadlapati

In [1]:
import os
import time

from groq import Groq
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

from common_functions import get_latest_filename, display_md

Load the filtered data

In [2]:
# Read the most recent 'filtered' file; the path is resolved by get_latest_filename.
last_file_path = get_latest_filename('filtered')
df = pd.read_csv(last_file_path)
# Keep only the columns used by the rest of the notebook. Selecting columns
# already discards 'finetune_text' when it is present, so no in-place drop is
# needed. The original also renamed 'finetune_text' -> 'text' right AFTER
# dropping it, which could never have any effect; that dead call is removed.
# NOTE(review): if the intent was to use 'finetune_text' AS the text column,
# the rename must replace the drop instead — confirm with the author.
df = df[['text', 'id']]  # allow only some columns  # , 'date'
# df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%dT%H:%M:%SZ') # string to date

print(f'Data size: {df.shape[0]} rows')
df.head(2)

Data size: 42 rows


Unnamed: 0,text,id
0,We want to know how to best serve you. Please ...,<urn:uuid:faff9b64-041c-4b98-8be4-7ff2a02e4b8d>
1,Welcome to AnnieMation’s webpage. We are an in...,<urn:uuid:76d0f406-290e-41c9-a4bc-2062e6fc6296>


Prepare the model

In [3]:
# Sentence-embedding model; its name is read from the EMBED_MODEL environment
# variable and downloaded files are cached under ./.huggingface.
embed_model = SentenceTransformer(
	os.getenv('EMBED_MODEL'),
	cache_folder='.huggingface',
	# NOTE(review): presumably this lets code shipped with the model repo run
	# locally — only point EMBED_MODEL at trusted models.
	trust_remote_code=True,
)

def get_embedding(text):
	"""Return the embedding of a single text.

	The text is encoded as a one-element batch with `embed_model` and the
	first (only) row of the result is returned.
	"""
	# Alternative backend kept for reference:
	# import ollama ; response = ollama.embeddings(model=os.getenv('EMBED_MODEL'),
	# 	prompt=text[:embedding_chunk_size]) ; return response['embedding']
	return embed_model.encode([text])[0]

# Length of one embedding vector, measured by encoding a throwaway string.
# (Despite its name, this is the vector dimension, not a text chunk size.)
embedding_chunk_size = len(get_embedding('test'))

pc = Pinecone()
index_name = os.getenv('PINECONE_INDEX_NAME')
if not index_name:
	# Fail early with a clear message instead of passing None to the Pinecone client.
	raise ValueError('Environment variable PINECONE_INDEX_NAME is not set')

if index_name in pc.list_indexes().names():
	print('Loading existing index...')
	index_description = pc.describe_index(index_name)
	if index_description.dimension != embedding_chunk_size:
		# An index built for a different embedding size cannot hold these vectors.
		print('Index dimension mismatch, deleting old index...')
		pc.delete_index(index_name)  # delete index

if index_name not in pc.list_indexes().names():
	print('Creating new index...')
	# Keyword arguments on purpose: the positional order of `dimension` and
	# `spec` is not the same across Pinecone client releases.
	pc.create_index(
		name=index_name,
		dimension=embedding_chunk_size,
		spec=ServerlessSpec(cloud='aws', region='us-east-1'),
	)
	# wait for index to be ready
	while not pc.describe_index(index_name).status['ready']:
		time.sleep(1)

index = pc.Index(index_name)

<All keys matched successfully>


Loading existing index...


In [4]:
# Shared Groq chat client used by get_bot_response. It is constructed without
# arguments (presumably credentials come from the environment — confirm).
groq_client = Groq()

def get_bot_response(messages, max_retries=3):
	"""Send chat messages to the Groq model and return the stripped reply text.

	Args:
		messages: list of chat message dicts passed to the completion API.
		max_retries: maximum number of attempts before giving up.

	Returns:
		The non-empty reply text with surrounding whitespace removed.

	Raises:
		Exception: when every attempt fails or yields an empty reply. The last
			underlying error is attached as the cause.
	"""
	last_error = None
	for attempt in range(max_retries):
		try:
			chat_completion = groq_client.chat.completions.create(
				messages=messages,
				model=os.getenv('GROQ_MODEL'),
			)
			# Treat missing content like an empty reply so the error message is meaningful.
			response = (chat_completion.choices[0].message.content or '').strip()
			if not response:
				raise Exception('Empty response from the bot')
			return response
		except Exception as e:
			last_error = e
			print(f'Error: {e}. Retrying...')
			# Pause only when another attempt follows; sleeping after the
			# final failure would just delay the error.
			if attempt < max_retries - 1:
				time.sleep(1)
	raise Exception('No response from the bot') from last_error

def ask_bot(prompt, relevant_context=None):
	"""Ask the bot a question, optionally preceded by a system message with context.

	When `relevant_context` is truthy, a system message carrying it is placed
	before the user's prompt; otherwise only the user message is sent.
	Returns whatever `get_bot_response` returns.
	"""
	user_message = {'role': 'user', 'content': prompt}
	if not relevant_context:
		return get_bot_response([user_message])

	system_message = {
		'role': 'system',
		'content': (
			'Refer this if relevant to the users query. '
			'Dont mention in the response this text exists. '
			'Dont mention "According to the text". \n'
			f'{relevant_context}'
		),
	}
	return get_bot_response([system_message, user_message])


def create_RAG_query(user_query, max_retries=3):
	"""Ask the bot to turn a user question into a semantic-search query.

	The bot is told to wrap the query in triple backticks; the text after the
	first fence is extracted and stripped.

	Args:
		user_query: the question as typed by the user.
		max_retries: number of attempts here; also forwarded to get_bot_response.

	Returns:
		The non-empty search query string.

	Raises:
		Exception: when no attempt yields a usable query. Previously the
			function fell off the end and returned None, which only failed
			later, inside the embedding step.
	"""
	messages = [{'role': 'user', 'content': (
		# A space was missing after the first sentence, gluing it to the next one.
		'Create a query in only English for semantic search to find relevant context. '
		'Return query in triple backticks ```<query here>```. \n'
		f'A user asked: `{user_query}`. \n\n'
		# 'If search is not needed, return ```NOT_REQUIRED```.'
	)}]
	last_error = None
	for _ in range(max_retries):
		try:
			response = get_bot_response(messages, max_retries=max_retries)
			parts = response.split('```')
			if len(parts) < 2:
				raise Exception('Query not wrapped in triple backticks')
			rag_query = parts[1].strip()
			if not rag_query:
				raise Exception('Empty response from the bot')
			return rag_query
		except Exception as e:
			last_error = e
			print(f'Error: {e}. Retrying...')
	raise Exception('Could not create a RAG query') from last_error


def get_embeddings(texts):
	"""Return a list with one `get_embedding` result per input text, in order."""
	return [get_embedding(item) for item in texts]

def ask_query(query, display=False, use_memory=False, top_k=3, use_RAG_query=False,
			context_limit=300):
	"""Answer a query with the bot, optionally grounding it in vector-index context.

	Args:
		query: the user's question.
		display: when True, render the query and response via display_md and
			return None; otherwise return the response.
		use_memory: when True, query the vector index and pass the retrieved
			text to the bot as context.
		top_k: number of matches requested from the index.
		use_RAG_query: when True (and use_memory is True), search with a query
			produced by create_RAG_query instead of the raw question.
		context_limit: maximum number of characters taken from each match.

	Returns:
		The bot's response, or None when `display` is True.
	"""
	if not use_memory:
		response = ask_bot(query)
	else:
		search_text = create_RAG_query(query) if use_RAG_query else query
		query_embedding = get_embedding(search_text)
		query_result = index.query(
			vector=query_embedding.tolist(), top_k=top_k, include_metadata=True
		)
		# Combine the top_k contexts. Matches without usable 'text' metadata
		# are skipped instead of raising TypeError on None[:limit].
		snippets = []
		for match in query_result.matches:
			text = (match.metadata or {}).get('text') or ''
			if text:
				snippets.append(text[:context_limit])
		relevant_context = ' '.join(snippets)
		response = ask_bot(query, relevant_context)

	if display:
		display_md(f'**Query:** {query} \n\n **Response:** \n {response}')
		return None
	return response

def print_time_taken(start_time):
	"""Print the time elapsed since `start_time` as 'N sec' or 'M min N sec'.

	Args:
		start_time: a timestamp previously obtained from time.time().
	"""
	# Round to whole seconds BEFORE splitting into minutes/seconds. Rounding
	# each part afterwards could print '60 sec' or '1 min 60 sec'.
	elapsed = round(time.time() - start_time)
	minutes, seconds = divmod(elapsed, 60)
	if minutes:
		display_text = f'{minutes} min {seconds} sec'
	else:
		display_text = f'{seconds} sec'
	print(f'Completed in {display_text}')

Prepare the vector DB

In [5]:
def insert_data(df, batch_size=100):
	"""Embed the rows of `df` and upsert them into the vector index.

	Args:
		df: DataFrame with a 'text' column (content to embed, also stored as
			metadata) and an 'id' column (vector ids).
		batch_size: number of vectors sent per upsert request, so large
			frames are not pushed in one oversized call.

	Raises:
		ValueError: if `batch_size` is smaller than 1.
		Exception: any error from embedding or upserting is printed and re-raised.
	"""
	if batch_size < 1:
		raise ValueError('batch_size must be at least 1')
	try:
		texts = df['text'].tolist()
		ids = df['id'].tolist()
		if not texts:
			# Nothing to embed or send.
			print('No rows to insert')
			return

		print('Generating embeddings...')
		start_time = time.time()
		embeddings = get_embeddings(texts)
		vectors = [
			{'id': row_id, 'values': embedding, 'metadata': {'text': text}}
			for row_id, embedding, text in zip(ids, embeddings, texts)
		]
		print_time_taken(start_time)

		print('Inserting the data...')
		start_time = time.time()
		for offset in range(0, len(vectors), batch_size):
			index.upsert(vectors=vectors[offset:offset + batch_size])
		print('Data inserted successfully')
		print_time_taken(start_time)

	except Exception as e:
		print(f'Error inserting data: {e}')
		raise  # bare raise keeps the original traceback intact

# Embed every row of the loaded data and upsert the vectors into the index.
insert_data(df)

Generating embeddings...
Completed in 4 sec
Inserting the data...
Data inserted successfully
Completed in 5 sec


In [11]:
# check sample to decide a sample query
sample = df['text'].tolist()[5]
display_md(f'**Sample data**: \n {sample}')

**Sample data**: <br> Bengaluru, 6th May 2023: Indian Men’s Hockey Team’s newly-appointed Analytical Coach Rhett Halkett arrived in India on Thursday and received a warm welcome in New Delhi from Hockey India Secretary General Shri Bhola Nath Singh. After reaching Delhi, the former South African international flew to Bengaluru to join the ongoing Indian Men’s Hockey Team Camp at the SAI Centre under newly-appointed Indian Men’s Hockey Team Chief Coach Craig Fulton.<br>Meanwhile, Alan Tan, who was recently appointed as the Scientific Advisor for the Indian Men’s Hockey Team, also arrived in New Delhi on Saturday and was welcomed by Hockey India CEO Elena Norman. He will also be flying to Bengaluru to join the Indian camp.<br>Both Rhett and Alan will be working closely with Chief Coach Fulton as India begin their preparations for the upcoming European leg of the FIH Men’s Hockey Pro League 2022/23 which is set to begin from 26th May onwards. India will face off in double-headers against Belgium and Great Britain in London, England, and then will travel to Eindhoven, Netherlands to face off in double-headers against the Netherlands and Argentina.<br>India also have a packed calendar this year with the Hero Asian Champions Trophy Chennai 2023 set to take place in Chennai in August, which will be followed by 2023 Asian Games in Hangzhou, China.<br>Hockey India President Padma Shri Dr. Dilip Tirkey said, “We are delighted to have experienced individuals such as Rhett Halkett and Alan Tan join the Indian camp. With major competitions set to take place in the coming months, the duo, along with Coach Fulton will be integral in ensuring the players are ready to face off against the top teams in the world. 
We welcome them to India and we are confident they will help in leading the team to new heights this year.”<br>Hockey India Secretary General Shri Bhola Nath Singh added, “Rhett Halkett and Alan Tan have several years of experience under their belt and we are delighted to have them join the Indian team. We truly believe the duo, along with Chief Coach Craig Fulton, will push our Indian players and help them not only grow as individuals but also as a team. We give them our best wishes on their journey and hope to see them earn many laurels for the country during their stint.”

Querying LLM without RAG

In [7]:
# Sample query based on the sample data
sample_query = 'Mens Hockey Team recently welcomed which new members after March 2023? On which date?'
ask_query(sample_query, display=True)

**Query:** Mens Hockey Team recently welcomed which new members after March 2023? On which date? <br><br> **Response:** <br> I'm happy to help! However, since I'm a large language model, I don't have real-time access to information and updates. Additionally, there may be multiple men's hockey teams around the world, so it's unclear which specific team you're referring to.<br><br>Could you please provide more context or clarify which men's hockey team you're interested in, such as a national team, a professional league, or a collegiate team? Additionally, if you have any specific details about the new members, like their names or positions, it would be helpful.

Querying LLM using RAG

In [8]:
# Same question, now with retrieval (use_memory) and a bot-generated search query (use_RAG_query).
# Variant that searches with the raw question: ask_query(sample_query, display=True, use_memory=True)
ask_query(
	sample_query,
	display=True,
	use_memory=True,
	use_RAG_query=True,
)

**Query:** Mens Hockey Team recently welcomed which new members after March 2023? On which date? <br><br> **Response:** <br> According to the given information, the Indian Men's Hockey Team welcomed a new Analytical Coach, Rhett Halkett, who arrived in India on May 6th, 2023.

In [9]:
# Optional cleanup, intentionally commented out — uncomment to run manually:
# index.delete(delete_all=True)
# pc.delete_index(index_name)