In [1]:
#!pip install transformers torch langchain

In [2]:
import random
import numpy as np
from transformers import DistilBertModel, DistilBertTokenizer
import torch
from torch.quantization import quantize_dynamic
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#!pip install transformers torch chromadb langchain openai

In [4]:
# Load the pretrained DistilBERT tokenizer and encoder, then derive a
# dynamically quantized copy of the encoder for inference.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
# Only torch.nn.Linear modules are selected for quantization (dtype qint8);
# the original `model` object is kept alongside the quantized one.
quantized_model = quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def get_embeddings(text):
    """Embed text with the quantized model using masked mean pooling.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A numpy array holding the mean of the token vectors in
        ``last_hidden_state``. For a single string this is a 1-D vector; for
        a list of several strings it is one row per string. Because the
        result is squeezed, a one-element list also yields a 1-D vector.
    """
    # truncation=True keeps over-long inputs within the tokenizer's maximum
    # length instead of letting the model fail on them; padding=True lets a
    # list of strings with different lengths be stacked into one batch.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = quantized_model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight every token by its attention mask so padding positions do not
    # dilute the average. For a single unpadded string the mask is all ones,
    # so this reduces to the plain mean over tokens.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

In [6]:
# Pool of sample texts: 49 short AI-related topic strings (note: 49, not 50).
# Nothing is embedded or indexed here; this is only the raw text pool.
sample_texts = [
    "Document about artificial intelligence.",
    "Research on machine learning.",
    "Introduction to natural language processing.",
    "The history of computer science.",
    "Advances in deep learning techniques.",
    "Understanding neural networks.",
    "The impact of AI on society.",
    "Future of robotics.",
    "Applications of AI in healthcare.",
    "Ethics in artificial intelligence.",
    "Data science methodologies.",
    "Statistical analysis and AI.",
    "Overview of reinforcement learning.",
    "Basics of supervised learning.",
    "Unsupervised learning algorithms.",
    "AI in autonomous vehicles.",
    "AI-driven personal assistants.",
    "Machine learning in finance.",
    "AI for predictive analytics.",
    "Trends in AI research.",
    "Challenges in AI development.",
    "AI for image recognition.",
    "Natural language understanding.",
    "AI in education.",
    "AI in agriculture.",
    "AI for climate change.",
    "AI in cybersecurity.",
    "AI in marketing.",
    "AI in customer service.",
    "The role of big data in AI.",
    "AI and IoT integration.",
    "AI for supply chain management.",
    "AI in human resources.",
    "AI in entertainment.",
    "AI in legal services.",
    "AI in manufacturing.",
    "AI for energy management.",
    "AI for disaster response.",
    "AI for social good.",
    "AI in transportation.",
    "AI for medical diagnosis.",
    "AI in sports analytics.",
    "AI in retail.",
    "AI for personalized recommendations.",
    "AI and quantum computing.",
    "AI for fraud detection.",
    "AI for natural disaster prediction.",
    "AI in space exploration.",
    "AI in game development."
]

In [7]:
# Draw 50 documents from the pool. sample_texts holds fewer than 50 entries, so
# the pool is doubled before sampling; the same text can therefore be drawn
# twice and show up as a duplicate hit at retrieval time.
# A dedicated seeded generator makes the draw identical on every
# Restart & Run All without touching the global `random` state.
documents = random.Random(42).sample(sample_texts * 2, 50)
# One embedding per sampled document, in the same order as `documents`.
document_embeddings = [get_embeddings(doc) for doc in documents]

In [8]:
# Retrieve Documents
def retrieve_documents(query, document_embeddings, documents, top_k=5):
    """Return the documents most similar to ``query`` by cosine similarity.

    Args:
        query: Text to search for; embedded with ``get_embeddings``.
        document_embeddings: Sequence of embedding vectors, one per document,
            in the same order as ``documents``.
        documents: Sequence of document texts.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(document, similarity)`` tuples sorted from most to least
        similar. The list is empty when ``top_k`` is not positive or when
        there is nothing to search.
    """
    # Guard up front: a slice of the form [-0:] selects the *whole* array, so
    # top_k == 0 used to return every document; negative values produced an
    # arbitrary slice, and an empty corpus failed inside cosine_similarity.
    if top_k <= 0 or len(documents) == 0 or len(document_embeddings) == 0:
        return []
    query_embedding = get_embeddings(query)
    similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
    # argsort is ascending; reverse it and keep the first top_k indices.
    top_indices = similarities.argsort()[::-1][:top_k]
    return [(documents[i], similarities[i]) for i in top_indices]

In [9]:
# Example search against the embedded documents, using the default top_k.
query = "AI applications in healthcare."
results = retrieve_documents(
    query=query,
    document_embeddings=document_embeddings,
    documents=documents,
)

In [10]:
# Show every retrieved (document, similarity) pair, one per line,
# in the order returned by retrieve_documents.
for hit in results:
    doc, similarity = hit
    print(f"Document: {doc}, Similarity: {similarity}")

Document: Applications of AI in healthcare., Similarity: 0.9422410130500793
Document: Challenges in AI development., Similarity: 0.8960748314857483
Document: AI in sports analytics., Similarity: 0.8846433162689209
Document: AI in sports analytics., Similarity: 0.8846433162689209
Document: AI in autonomous vehicles., Similarity: 0.8698293566703796
