In [35]:
%pip install -q llama-index==0.10.18 llama-index-llms-groq==0.1.3 groq==0.4.2 llama-index-embeddings-huggingface==0.2.0

In [36]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    ServiceContext,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.groq import Groq
# import os
# from dotenv import load_dotenv
# load_dotenv()
import warnings
warnings.filterwarnings('ignore')

In [37]:
# Read the Groq API key through google.colab's `userdata` helper instead of
# hard-coding it in the notebook.
# NOTE(review): both the secret name and the variable are spelled "GROG" (not
# "GROQ"); the Colab secret must be registered under exactly this name.
from google.colab import userdata
GROG_API_KEY = userdata.get('GROG_API_KEY')

In [38]:
!wget https://myweb.sabanciuniv.edu/rdehkharghani/files/2016/02/The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf

--2024-11-15 21:08:42--  https://myweb.sabanciuniv.edu/rdehkharghani/files/2016/02/The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf
Resolving myweb.sabanciuniv.edu (myweb.sabanciuniv.edu)... 159.20.64.81
Connecting to myweb.sabanciuniv.edu (myweb.sabanciuniv.edu)|159.20.64.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12585336 (12M) [application/pdf]
Saving to: ‘The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf.2’


2024-11-15 21:08:46 (4.43 MB/s) - ‘The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf.2’ saved [12585336/12585336]



In [39]:
# Data ingestion: read the downloaded textbook PDF into llama-index documents.
PDF_PATH = "./The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf"
reader = SimpleDirectoryReader(
    input_files=[PDF_PATH],
)
documents = reader.load_data()

In [40]:
len(documents)  # number of loaded documents (pages)

740

In [41]:
# Inspect the metadata attached to one of the loaded documents.
documents[30].metadata

{'page_label': 'xxx',
 'file_name': 'The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf',
 'file_path': 'The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf',
 'file_type': 'application/pdf',
 'file_size': 12585336,
 'creation_date': '2024-11-15',
 'last_modified_date': '2016-02-10'}

In [42]:
# Split the documents into nodes (chunks) for embedding.
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
  # chunk_size: target maximum chunk length, in tokens.
  # chunk_overlap: number of tokens shared between consecutive chunks; the overlap
  # counts toward chunk_size rather than being added on top of it.
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/740 [00:00<?, ?it/s]

In [43]:
len(nodes)  # number of nodes (chunks) produced by the splitter

784

In [44]:
# Inspect the metadata attached to the first node.
nodes[0].metadata

{'page_label': 'Cover',
 'file_name': 'The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf',
 'file_path': 'The-Morgan-Kaufmann-Series-in-Data-Management-Systems-Jiawei-Han-Micheline-Kamber-Jian-Pei-Data-Mining.-Concepts-and-Techniques-3rd-Edition-Morgan-Kaufmann-2011.pdf',
 'file_type': 'application/pdf',
 'file_size': 12585336,
 'creation_date': '2024-11-15',
 'last_modified_date': '2016-02-10'}

In [45]:
# Small sentence-transformers model for the similarity demo below.
# NOTE(review): the next cell creates the same model again, so this assignment is redundant.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [46]:
import numpy as np

from tabulate import tabulate

# Initialize the embedding model used for this similarity demo
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Sample texts: a mix of unrelated sentences and loosely related pairs
# (two "apple" sentences, two stock-market sentences) to compare similarity scores
texts = [
    "The weather is nice today.",
    "I love eating pizza for dinner.",
    "Apple just released iPhone 17",
    "This red apple smells good",
    "Tesla stock rises as Trump wins the election",
    "Should I Sell My Individual Stocks and Reinvest in S&P 500/Nasdaq 100?"
]

# Generate one embedding vector per text, in the same order as `texts`
embeddings = [embed_model.get_text_embedding(text) for text in texts]

# Function to calculate cosine similarity
def cosine_similarity(a, b):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        a: First vector (list or NumPy array of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, a value in [-1, 1]. If either vector
        has zero norm the similarity is undefined; 0.0 is returned instead of
        dividing by zero and producing ``nan``.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction, so treat it as "no similarity".
        return 0.0
    return np.dot(a, b) / denominator

# Build an upper-triangular similarity table: the diagonal is fixed at 1.0000,
# cells above it hold the pairwise cosine similarity, cells below it stay blank
# (they would only mirror the upper half).
n_texts = len(texts)
table_data = []
for row_idx in range(n_texts):
    cells = []
    for col_idx in range(n_texts):
        if col_idx < row_idx:
            cells.append("")
        elif col_idx == row_idx:
            cells.append("1.0000")
        else:
            score = cosine_similarity(embeddings[row_idx], embeddings[col_idx])
            cells.append(f"{score:.4f}")
    table_data.append([f"Text {row_idx+1}", *cells])

# Column headers: an empty corner cell followed by one label per text
headers = [""] + [f"Text {col_idx+1}" for col_idx in range(n_texts)]

# Render the table as an ASCII grid
print(tabulate(table_data, headers=headers, tablefmt="grid"))

# List the sentences so the "Text N" labels can be matched to their content
print("\nText content:")
for number, sentence in enumerate(texts, start=1):
    print(f"Text {number}: {sentence}")

+--------+----------+----------+----------+----------+----------+----------+
|        | Text 1   | Text 2   | Text 3   | Text 4   | Text 5   |   Text 6 |
| Text 1 | 1.0000   | 0.1364   | 0.0529   | 0.1190   | 0.0408   |   0.0057 |
+--------+----------+----------+----------+----------+----------+----------+
| Text 2 |          | 1.0000   | -0.0100  | 0.0830   | 0.0277   |   0.0185 |
+--------+----------+----------+----------+----------+----------+----------+
| Text 3 |          |          | 1.0000   | 0.2874   | 0.1171   |   0.0379 |
+--------+----------+----------+----------+----------+----------+----------+
| Text 4 |          |          |          | 1.0000   | 0.1528   |  -0.0115 |
+--------+----------+----------+----------+----------+----------+----------+
| Text 5 |          |          |          |          | 1.0000   |   0.2119 |
+--------+----------+----------+----------+----------+----------+----------+
| Text 6 |          |          |          |          |          |   1      |

In [47]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Rebind `embed_model` to the BAAI/bge-large-en-v1.5 model; this replaces the MiniLM
# model used in the similarity demo and is the one passed to the service context below.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

In [48]:
# Groq LLM client, authenticated with the API key read from the Colab secrets above.
llm = Groq(model="llama-3.1-70b-versatile", api_key=GROG_API_KEY)

In [49]:
# Bundle the embedding model and the LLM so the index and query engine share them.
# NOTE(review): ServiceContext is deprecated in llama-index 0.10.x in favour of the
# global `Settings` object — consider migrating when the pinned version is bumped.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [50]:
# Embed the pre-split `nodes` directly, so the chunking chosen in the SentenceSplitter
# cell is the chunking that gets indexed. `from_documents` would instead re-split
# `documents` with the service context's own parser, and a list of nodes is not a
# valid `node_parser` argument.
vector_index = VectorStoreIndex(nodes=nodes, service_context=service_context, show_progress=True)

Parsing nodes:   0%|          | 0/740 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/784 [00:00<?, ?it/s]

In [51]:
# Mount Google Drive at /content/drive; the index is persisted under this mount below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir /content/drive/MyDrive/Gen.AI/DB

In [53]:
# Directory on the mounted Drive where the vector index is persisted and later reloaded.
vectDB = "/content/drive/MyDrive/Gen.AI/DB/textbookstorage"

In [54]:
# Write the index's storage context to `vectDB` so it can be reloaded from disk.
vector_index.storage_context.persist(persist_dir=vectDB)

# Load

In [55]:
# Re-create a storage context from the files persisted in `vectDB`.
storage_context = StorageContext.from_defaults(persist_dir=vectDB)

In [None]:
# Load the persisted index, passing the same service context (embedding model + LLM)
# that was used when the index was built.
index = load_index_from_storage(storage_context, service_context=service_context)

In [None]:
# Query interface over the loaded index, configured with the shared service context.
query_engine = index.as_query_engine(service_context=service_context)

In [None]:
# Ask a question against the indexed textbook; the answer text is in `resp.response`.
query = "Explain cosine similarity"
resp = query_engine.query(query)

In [59]:
import textwrap

def Print(text, width=80, **args):
    """Print ``text`` wrapped to ``width`` columns while keeping its line breaks.

    Each original line is wrapped on its own, so existing newlines are
    preserved. Blank lines (paragraph separators) are kept as well.

    Args:
        text: The string to print; may contain ``\\n`` line breaks.
        width: Maximum length of each printed line.
        **args: Extra keyword arguments forwarded to the built-in ``print``
            (for example ``file=`` or ``end=``).
    """
    wrapped_lines = []
    for line in text.split('\n'):  # Split on the original line breaks
        # textwrap.wrap() returns [] for an empty or whitespace-only line, which
        # would silently drop paragraph breaks; fall back to one empty line.
        wrapped_lines.extend(textwrap.wrap(line, width=width) or [''])
    print('\n'.join(wrapped_lines), **args)



In [60]:
# Show the answer text, wrapped to the default width for readability.
Print(resp.response)

Cosine similarity is a measure of similarity that can be used to compare
documents or vectors. It is defined as the cosine of the angle between two
vectors. The cosine similarity function is given by the dot product of the two
vectors divided by the product of their Euclidean norms.
Conceptually, it measures the cosine of the angle between two vectors. A cosine
value of 0 means that the two vectors are at 90 degrees to each other
(orthogonal) and have no match. The closer the cosine value to 1, the smaller
the angle and the greater the match between vectors.
In the context of term-frequency vectors, cosine similarity focuses on the words
that the two documents do have in common, and the occurrence frequency of such
words. It ignores zero-matches, making it suitable for sparse numeric data.
When attributes are binary-valued, the cosine similarity function can be
interpreted in terms of shared features or attributes. It measures the relative
possession of common attributes between two ob