In [4]:
import re
from dotenv import load_dotenv
import wikipedia
from sentence_transformers import SentenceTransformer
import tqdm
import time
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
load_dotenv()

True

In [5]:
def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time, and returns ``func``'s result
        unchanged. The wrapper keeps ``func``'s name and docstring.
    """
    import functools  # local import so this cell does not depend on the imports cell

    @functools.wraps(func)  # preserve __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
        start_time = time.perf_counter()
        result = func(*args, **kwargs)  # Call the wrapped function
        elapsed_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {elapsed_time:.6f} seconds to execute.")
        return result  # Return the result of the wrapped function
    return wrapper

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter used to cut page text into chunks before cleaning and embedding.
text_splitter = RecursiveCharacterTextSplitter(
    # Separators, in the order listed: triple newline, blank line, period,
    # single newline, then space. Note that "." comes before "\n".
    separators=["\n\n\n", "\n\n",".","\n", " "],
    chunk_size=300,  # Maximum size of each chunk
    chunk_overlap=50  # Overlap to maintain context between chunks
)

# Clean the individual chunks

# A whole line that is a wiki-style section heading, e.g. "== History ==" or
# "=== Syntax ===". Anchored to line boundaries so "=" inside ordinary text
# (for example "x = 5" or "a == b") is never treated as a heading.
_HEADING_LINE_RE = re.compile(r"^[ \t]*={2,}[^\n]*?={2,}[ \t]*\r?$", re.MULTILINE)


def clean_chunk(chunk):
    """Normalize one text chunk.

    Steps, in order:
      1. Drop lines that are section headings (``== Title ==``).
      2. Collapse every run of whitespace (including newlines) to one space.
      3. Remove a single leading period, if the chunk starts with one.

    Headings are removed before newlines are collapsed, because the line
    boundaries are what distinguish a heading from an "=" in body text.

    Args:
        chunk: Raw chunk text.

    Returns:
        The cleaned chunk with no leading or trailing whitespace; may be
        the empty string.
    """
    without_headings = _HEADING_LINE_RE.sub(" ", chunk)
    cleaned_chunk = re.sub(r"\s+", " ", without_headings).strip()
    if cleaned_chunk.startswith("."):
        cleaned_chunk = cleaned_chunk[1:]  # Remove the first character (the period)
    return cleaned_chunk.strip()

# # Apply the cleaning function to each chunk
# chunks = text_splitter.split_text(page.content)
# cleaned_chunks = [clean_chunk(chunk) for chunk in chunks] 
# cleaned_chunks = [chunk for chunk in cleaned_chunks if len(chunk) > 1]

In [31]:
def _fetch_wiki_page(title):
    """Return the Wikipedia page for ``title``, or None if it cannot be resolved.

    On a disambiguation page the first listed option is tried once; if that
    option is missing or is itself ambiguous, the title is skipped.
    """
    try:
        return wikipedia.page(title)
    except wikipedia.DisambiguationError as e:
        if not e.options:
            print(f"No disambiguation options for {title}")
            return None
        # Select the first option from the disambiguation list
        first_option = e.options[0]
        try:
            return wikipedia.page(first_option)
        except wikipedia.PageError:
            print(f"Page not found for {first_option}")
        except wikipedia.DisambiguationError:
            # Do not abort the whole scrape because one option is ambiguous too
            print(f"Page still ambiguous for {first_option}")
        return None
    except wikipedia.PageError:
        print(f"Page not found for {title}")
        return None


@time_it
def scrape_wiki(topics=("Programming", "Universe")):
    """Collect cleaned text chunks from Wikipedia pages related to ``topics``.

    Each topic is searched, every hit is searched again to widen the set of
    titles, and each resulting page is split with ``text_splitter`` and
    cleaned with ``clean_chunk``.

    Args:
        topics: Iterable of seed search terms. Defaults to the two topics
            this notebook was written for.

    Returns:
        List of cleaned chunks longer than one character.
    """
    results = []
    for topic in topics:
        for related_title in wikipedia.search(topic):
            results.extend(wikipedia.search(related_title))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # chunk order (and therefore chunk ids downstream) is stable across runs.
    results = list(dict.fromkeys(results))

    cleaned_chunks = []
    for result in results:
        page = _fetch_wiki_page(result)
        if page is None:
            continue  # Skip to the next result

        chunks = text_splitter.split_text(page.content)
        cleaned_chunks.extend(clean_chunk(chunk) for chunk in chunks)

    return [chunk for chunk in cleaned_chunks if len(chunk) > 1]
#cleaned_chunks = scrape_wiki()

In [32]:
len(cleaned_chunks)

192

In [134]:
# Write the cleaned chunks to disk, one chunk per line.
with open('cleaned_chunks.txt', 'w',encoding='utf-8') as chunk_file:
    chunk_file.writelines(f"{chunk_text}\n" for chunk_text in cleaned_chunks)

In [7]:
# Load the chunks back from disk; each line of the file is one chunk.
cleaned_chunks = []
with open('cleaned_chunks.txt', 'r', encoding='utf-8') as chunk_file:
    cleaned_chunks = [raw_line.strip() for raw_line in chunk_file]

In [8]:
embeddings = model.encode(cleaned_chunks)

In [9]:
# Pair every chunk with its embedding (as a plain list) in one record per chunk.
wiki_data = [
    {
        'text': chunk_text,
        'vector': embeddings[position].tolist(),
    }
    for position, chunk_text in enumerate(cleaned_chunks)
]

# CHROMA

In [10]:
import chromadb
chroma_client = chromadb.Client()

In [11]:
from chromadb.utils import embedding_functions
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="BAAI/bge-small-en-v1.5"
)

In [None]:
chroma_client.delete_collection(name='Programming_Universe')

### With the embedder inside Chroma itself

In [10]:
@time_it
def chroma_insert_with_embedder(cleaned_chunks, batch_size=1000):
    """Insert raw text chunks into Chroma and let the collection embed them.

    Creates the "Programming_Universe" collection with
    ``sentence_transformer_ef`` as its embedding function, then adds the
    chunks in batches. Ids have the form ``chunk_<batch start>_<offset>``.

    Args:
        cleaned_chunks: List of chunk strings to insert.
        batch_size: Number of chunks per ``collection.add`` call.

    Returns:
        The populated collection.
    """
    # The notebook's imports cell does `import tqdm`, which binds the module;
    # calling the module raises TypeError. Import the progress-bar class here.
    from tqdm import tqdm

    collection = chroma_client.create_collection(name="Programming_Universe",embedding_function=sentence_transformer_ef)
    for i in tqdm(range(0, len(cleaned_chunks), batch_size)):
        batch = cleaned_chunks[i:i + batch_size]
        id_batch = [f'chunk_{i}_{j}' for j in range(len(batch))]
        collection.add(
            documents=batch,
            ids=id_batch
        )
    return collection

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [05:35<00:00, 22.35s/it]


### Without an embedder in Chroma: texts are embedded manually

In [13]:
@time_it
def chroma_insert_without_embedder(wiki_data):
    """Insert pre-embedded records into a new "Programming_Universe" collection.

    Each record supplies its own 'text' and 'vector', so the collection is
    created without an embedding function. Records are added 1000 at a time
    with ids of the form ``chunk_<batch start>_<offset>``.

    Returns:
        The populated collection.
    """
    collection = chroma_client.create_collection(name="Programming_Universe")
    for start in range(0, len(wiki_data), 1000):
        records = wiki_data[start:start + 1000]
        collection.add(
            documents=[record['text'] for record in records],
            ids=[f'chunk_{start}_{offset}' for offset in range(len(records))],
            embeddings=[record['vector'] for record in records],
        )
    return collection
#chroma_client.delete_collection(name='Programming_Universe')
chroma_insert_without_embedder(wiki_data)

Function 'chroma_insert_without_embedder' took 10.190860 seconds to execute.


Collection(name=Programming_Universe)

In [14]:
questions = ['What is pydoc?', 'What is PyS60?','What does it claim to strive?','What are the main goals?','What are the core philosophies?']

In [15]:
@time_it
def chroma_query(query):
    """Embed ``query`` with ``model`` and return the 5 nearest entries from Chroma.

    The lookup runs against the 'Programming_Universe' collection and the
    raw query result is returned as-is.
    """
    query_vector = model.encode(query).tolist()
    return chroma_client.get_collection('Programming_Universe').query(
        query_embeddings=[query_vector],
        n_results=5,  # how many results to return
    )
chroma_query('What is python')

Function 'chroma_query' took 0.031608 seconds to execute.


{'ids': [['chunk_1000_524',
   'chunk_1000_572',
   'chunk_4000_526',
   'chunk_1000_704',
   'chunk_4000_604']],
 'embeddings': None,
 'documents': [['Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically type-checked and garbage-collected',
   'Python is meant to be an easily readable language. Its formatting is visually uncluttered and often uses English keywords where other languages use punctuation',
   'The syntax of the Python programming language is the set of rules that defines how a Python program will be written and interpreted (by both the runtime system and by human readers). The Python language has many similarities to Perl, C, and Java',
   'Python is used extensively in the information security industry, including in exploit development. Most of the Sugar software for the One Laptop per Child XO, developed at Sugar Labs as of 2008, is written in Pytho

# LANCEDB

In [16]:
import lancedb
from lancedb.pydantic import LanceModel, Vector

In [21]:
# Connect to the LanceDB database
db = lancedb.connect("/tmp/db")

# Define the schema for the table
class Words(LanceModel):
    """Row schema: a text chunk together with its embedding vector."""
    text: str 
    # Fixed-size vector column; 384 must match the length of the vectors inserted into this table.
    vector: Vector(384)

# Create the table if it doesn't already exist.
# exist_ok=True opens the existing table instead of raising when the database
# on disk already has one. Rows from earlier runs are kept, so drop the table
# first if a clean benchmark is needed.
table = db.create_table("Programming_Universe_lancedb", schema=Words, exist_ok=True)

In [20]:
# Delete the "Programming_Universe_lancedb" table from the LanceDB database
db.drop_table("Programming_Universe_lancedb")

In [22]:
@time_it
def lancedb_insert(strings, batch_size=1000):
    """Add ``strings`` to the module-level LanceDB ``table`` in batches.

    Args:
        strings: Sequence of records to insert.
        batch_size: Number of records passed to each ``table.add`` call.
    """
    for offset in range(0, len(strings), batch_size):
        table.add(list(strings[offset:offset + batch_size]))
lancedb_insert(wiki_data)

Function 'lancedb_insert' took 0.429397 seconds to execute.


In [36]:
@time_it
def lancedb_query(query_string):
    """Embed ``query_string`` with ``model`` and return the single closest row.

    The result is a list of ``Words`` instances produced by ``to_pydantic``.
    """
    query_vector = model.encode(query_string).tolist()
    top_match = table.search(query_vector).limit(1)
    return top_match.to_pydantic(Words)
lancedb_query("What is python")

Function 'lancedb_query' took 0.037681 seconds to execute.


[Words(text='Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically type-checked and garbage-collected', vector=FixedSizeList(dim=384))]

# WEAVIATE

In [37]:
import weaviate
import weaviate.classes as wvc

In [38]:
weaviate_client = weaviate.connect_to_local()

In [39]:
# Wrap every record as a Weaviate DataObject: the text becomes a property and
# the precomputed embedding is supplied as the object's vector.
wiki_objs = [
    wvc.data.DataObject(
        properties={"text": record["text"]},
        vector=record["vector"],
    )
    for record in wiki_data
]

In [40]:
weaviate_client.collections.delete(name="Python")

In [41]:
import weaviate.classes as wvc

# Create the collection. Weaviate's autoschema feature will infer properties when importing.
weaviate_collection = weaviate_client.collections.create(
    "Python",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
)

In [42]:
@time_it
def weaviate_insert(wiki_objs):
    """Insert ``wiki_objs`` into the "Python" Weaviate collection, 1000 per call."""
    target_collection = weaviate_client.collections.get("Python")
    for start in range(0, len(wiki_objs), 1000):
        target_collection.data.insert_many(wiki_objs[start:start + 1000])
weaviate_insert(wiki_objs)

Function 'weaviate_insert' took 4.309588 seconds to execute.


In [43]:
@time_it
def weaviate_query(query):
    """Embed ``query`` with ``model`` and run a near-vector search in Weaviate.

    Queries the module-level ``weaviate_collection`` for the 2 closest
    objects, requesting certainty metadata, and returns the raw response.
    """
    query_vector = model.encode(query).tolist()
    return weaviate_collection.query.near_vector(
        near_vector=query_vector,
        limit=2,
        return_metadata=wvc.query.MetadataQuery(certainty=True),
    )
response = weaviate_query('What is python')

print(response.objects[0].properties)

Function 'weaviate_query' took 0.044614 seconds to execute.
{'text': 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically type-checked and garbage-collected'}


## Using the Llama model and the Nomic embedder

In [99]:
import weaviate
from weaviate.classes.config import Configure

# Create a "WikipediaPage" collection that uses Ollama for both embedding and
# generation, then release the connection.
client = weaviate.connect_to_local()

try:
    # NOTE(review): this rebinds `questions`, which an earlier cell assigns to a
    # list of query strings; confirm no later cell still expects that list.
    questions = client.collections.create(
        name="WikipediaPage",
        vectorizer_config=Configure.Vectorizer.text2vec_ollama(     # Configure the Ollama embedding integration
            api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
            model="nomic-embed-text",                               # The model to use
        ),
        generative_config=Configure.Generative.ollama(              # Configure the Ollama generative integration
            api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
            model="llama3.2",                                       # The model to use
        )
    )
finally:
    # Close even when creation fails (e.g. the collection already exists),
    # so the connection is not leaked.
    client.close()  # Free up resources