In [1]:
def split_into_chunks(text, chunk_size=1024):
  """Cut `text` into consecutive pieces of at most `chunk_size` items.

  Pieces are taken left to right without overlap; the last piece holds
  whatever remains and may be shorter. An empty `text` gives an empty list.
  """
  starts = range(0, len(text), chunk_size)
  return [text[start:start + chunk_size] for start in starts]

In [10]:
# Read Data
import csv

chunks = []
article_count = 0

# Load the file as a tab-separated CSV. newline="" leaves line-ending handling
# to the csv module, as its documentation recommends for file objects.
with open("./1_Using_Chatbot_API/mini-llama-articles.csv", mode="r", encoding="utf-8", newline="") as file:
  csv_reader = csv.reader(file, delimiter="\t")

  # Skip the header row; the default keeps an empty file from raising.
  next(csv_reader, None)

  for row in csv_reader:
    # Count rows explicitly instead of relying on the last loop index, which
    # is undefined when the file has no rows at all.
    article_count += 1
    # row[1] is the field that gets split into chunks.
    chunks.extend(split_into_chunks(row[1]))

print("number of articles:", article_count)
print("number of chunks:", len(chunks))

number of articles: 14
number of chunks: 174


In [11]:
import pandas as pd

# Build a one-column DataFrame in which each row holds a single chunk
df = pd.DataFrame(data=chunks, columns=['chunk'])

# Show the column labels of the new frame
print(df.columns)

Index(['chunk'], dtype='object')


In [12]:
# Generate data embeddings using the OpenAI text-embedding-3-small model

from openai import OpenAI

client = OpenAI()

# Convert a text to its embedding vector using an OpenAI embedding model.
def get_embedding(text, model="text-embedding-3-small"):
  """Return the embedding vector for `text`, or None if the request fails.

  Args:
    text: String to embed. Newlines are replaced with spaces before sending.
    model: Name of the embedding model passed to the API call.

  Returns:
    The `embedding` of the first item in the response, or None when the
    text preparation or the request raised an exception.
  """
  try:
    # Remove newlines
    text = text.replace("\n", " ")
    res = client.embeddings.create(input=[text], model=model)

    return res.data[0].embedding

  except Exception as err:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) still stops a long embedding loop. The
    # failure is reported instead of being silently swallowed.
    print(f"get_embedding failed: {err!r}")
    return None

In [13]:
from tqdm.notebook import tqdm
import numpy as np

# Generate embedding
print("Generating embeddings...")
embeddings = []
# Iterate over the column itself (which has a length) instead of df.iterrows()
# (a generator), so tqdm knows the total and can draw a real progress bar.
for chunk in tqdm(df['chunk'], total=len(df)):
  embeddings.append(get_embedding(chunk))

# get_embedding returns None when a request fails; report that here because
# None entries cannot be used as vectors afterwards.
failed = sum(emb is None for emb in embeddings)
if failed:
  print(f"warning: {failed} of {len(embeddings)} chunks have no embedding")

# Add the "embedding" column to the dataframe. Plain assignment (rather than
# df.insert) lets this cell be re-run without raising "already exists"; a new
# column is appended after the existing ones.
df['embedding'] = pd.Series(embeddings, index=df.index)

Generating embeddings...


0it [00:00, ?it/s]

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# The question we want to ask the model.
QUESTION = "How many parameters LLaMA2 model has?"
QUESTION_emb = get_embedding(QUESTION)

# get_embedding returns None on failure; stop with a clear message instead of
# handing a None row to cosine_similarity.
if QUESTION_emb is None:
  raise RuntimeError("Could not compute the embedding for QUESTION")

# The similarity between the question and each stored chunk.
# The result is 2-D with a single row: one score per row of df.
cosine_similarities = cosine_similarity([QUESTION_emb], df['embedding'].tolist())

print(cosine_similarities)

[[0.46773341 0.46912591 0.25978152 0.2938158  0.31967458 0.40164521
  0.41504525 0.4525753  0.45929084 0.12604131 0.11753091 0.01344322
  0.2260097  0.2142525  0.10143629 0.33072012 0.10745194 0.34694871
  0.16311813 0.08741076 0.34824215 0.22839518 0.19205032 0.26476001
  0.24955816 0.34833881 0.24834228 0.32762574 0.41444235 0.41335705
  0.46364893 0.38345735 0.46855645 0.35642136 0.35398538 0.30275087
  0.2994191  0.29257011 0.40031753 0.46468319 0.3947144  0.41046847
  0.44707962 0.43177834 0.35912069 0.33981274 0.51355581 0.2092876
  0.40203406 0.32830316 0.4283271  0.48267992 0.45033212 0.3425906
  0.32084533 0.42600947 0.24656291 0.18087562 0.2366496  0.34272949
  0.34377754 0.20473187 0.19763453 0.22442031 0.21108372 0.42298466
  0.26382744 0.30427213 0.33608375 0.38371545 0.23529731 0.24347982
  0.37074994 0.28020178 0.49052503 0.53047743 0.3782057  0.4377435
  0.37767354 0.39259992 0.30086669 0.41712126 0.46747369 0.45419194
  0.35156058 0.21228866 0.42623473 0.31603508 0.440

In [17]:
# Find the N highest scored chunks
import numpy as np

# How many of the best-scoring chunks to keep
number_of_chunks_to_retrieve = 3

# argsort orders chunk positions by ascending score; flip that order so the
# highest scores come first, then keep only the leading N positions.
ascending_order = np.argsort(cosine_similarities[0])
descending_order = ascending_order[::-1]
indices = descending_order[:number_of_chunks_to_retrieve]
print(indices)

[114  75  89]


In [18]:
# Print each retrieved chunk under a numbered heading, in the order of `indices`
top_chunks = df["chunk"][indices]
for position, chunk_text in enumerate(top_chunks, start=1):
  print(f"> Chunk {position}", chunk_text, "----", sep="\n")

> Chunk 1
by Meta that ventures into both the AI and academic spaces. The model aims to help researchers, scientists, and engineers advance their work in exploring AI applications. It will be released under a non-commercial license to prevent misuse, and access will be granted to academic researchers, individuals, and organizations affiliated with the government, civil society, academia, and industry research facilities on a selective case-by-case basis. The sharing of codes and weights allows other researchers to test new approaches in LLMs. The LLaMA models have a range of 7 billion to 65 billion parameters. LLaMA-65B can be compared to DeepMind's Chinchilla and Google's PaLM. Publicly available unlabeled data was used to train these models, and training smaller foundational models require less computing power and resources. LLaMA 65B and 33B have been trained on 1.4 trillion tokens in 20 different languages, and according to the Facebook Artificial Intelligence Research (FAIR) team,

In [20]:
# Write the dataframe (without its index) to a tab-separated UTF-8 file
df.to_csv(
  "mini-llama-articles-with-embeddings.csv",
  sep="\t",
  encoding="utf-8",
  index=False,
)