In [None]:
import os

def load_documents_from_directory(directory_path):
    """Read every ``.txt`` file located directly inside ``directory_path``.

    The scan is not recursive. Entries are visited in sorted name order so the
    returned dict has a reproducible key order (``os.listdir`` gives no ordering
    guarantee), and entries that merely look like text files but are not
    regular files (e.g. a sub-directory called ``notes.txt``) are skipped
    instead of crashing ``open()``.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        dict mapping each file name (not the full path) to the file's content
        decoded as UTF-8.

    Raises:
        OSError: If ``directory_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = {}
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory may carry a ".txt" suffix; only regular files can be read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[filename] = file.read()
    return documents

# Example usage
# NOTE: the path below is a placeholder -- point it at the directory that holds your .txt files.
directory_path = 'path_to_your_text_files_directory'
# `documents` maps file name -> file text; later cells read it under this name.
documents = load_documents_from_directory(directory_path)
print(f"Loaded {len(documents)} documents.")




In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

def _split_sentences(text):
    """Split ``text`` on '.' and return the stripped, non-empty pieces."""
    # A plain split leaves '' after the final period and whitespace-only pieces
    # between consecutive periods; embedding those adds meaningless vectors.
    return [piece.strip() for piece in text.split('.') if piece.strip()]


# Maps each supported embedding_type to the function that turns one document's
# text into the list of strings handed to the model.
_CHUNK_SPLITTERS = {
    'character': list,
    'word': str.split,
    'sentence': _split_sentences,
    'document': lambda text: [text],
}


# Function to create embeddings
def create_embeddings(documents, model_name='sentence-transformers/all-MiniLM-L6-v2', embedding_type='sentence'):
    """Embed every document at the requested granularity.

    Args:
        documents: dict mapping document name to its text.
        model_name: Name passed to ``SentenceTransformer`` to load the model.
        embedding_type: One of ``'character'``, ``'word'``, ``'sentence'`` or
            ``'document'``; selects how each text is split before encoding.
            ``'sentence'`` is a naive split on ``'.'`` with blank pieces removed.

    Returns:
        dict mapping document name to whatever ``model.encode`` returns for the
        list of pieces of that document (one embedding per piece).

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported values.
            Raised before the model is loaded, even when ``documents`` is empty.
    """
    splitter = _CHUNK_SPLITTERS.get(embedding_type)
    if splitter is None:
        raise ValueError(f"Unknown embedding type: {embedding_type}")

    model = SentenceTransformer(model_name)
    return {
        doc_name: model.encode(splitter(doc_content))
        for doc_name, doc_content in documents.items()
    }

# Example usage
# embedding_type='document' encodes each document's full text as a single input.
embeddings = create_embeddings(documents, embedding_type='document')
print(f"Created embeddings for {len(embeddings)} documents.")

In [None]:
import pinecone

# Initialize Pinecone
# Credentials are read from the environment so real keys never have to be typed
# into (and saved with) the notebook. The placeholder strings are used only when
# PINECONE_API_KEY / PINECONE_ENVIRONMENT are not set.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', 'your_pinecone_api_key'),
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'your_pinecone_environment'),
)

# Create Pinecone index
index_name = 'rag-workshop'
# Must equal the embedding model's output size (384 for all-MiniLM-L6-v2);
# adjust according to your model.
embedding_dimension = 384
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=embedding_dimension)
index = pinecone.Index(index_name)

# Function to add embeddings to Pinecone
def add_embeddings_to_pinecone(embeddings, batch_size=100):
    """Upsert document embeddings into the module-level Pinecone ``index``.

    ``create_embeddings`` calls ``encode`` on a list of pieces, so each value is
    normally a 2-D array with one row per piece. Pinecone expects one flat list
    of floats per vector ID, so rows are unpacked here:

    * an embedding with a single row (or a plain 1-D vector) is stored under the
      document name itself;
    * an embedding with several rows is stored as ``"<doc_name>#<row index>"``,
      one ID per row. Code that maps IDs back to documents must strip that suffix.

    Args:
        embeddings: dict mapping document name to an array-like exposing
            ``tolist()`` (1-D vector or 2-D stack of vectors).
        batch_size: Maximum number of vectors sent per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    vectors = []
    for doc_name, embedding in embeddings.items():
        values = embedding.tolist()
        if not values:
            # Nothing was embedded for this document (e.g. empty text).
            continue
        rows = values if isinstance(values[0], list) else [values]
        if len(rows) == 1:
            vectors.append((doc_name, rows[0]))
        else:
            vectors.extend((f"{doc_name}#{row_idx}", row) for row_idx, row in enumerate(rows))

    # Send vectors in batches instead of one request per document.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])
    
# Example usage
# Sends the vectors in `embeddings` to the Pinecone `index` created earlier in this cell.
add_embeddings_to_pinecone(embeddings)
print("Embeddings added to Pinecone.")

In [None]:
from transformers import pipeline

# Initialize a language model pipeline
# 'gpt-3.5-turbo' is an OpenAI API model name, not a checkpoint that
# transformers.pipeline can load, so the original call fails at load time.
# 'gpt2' is a small open checkpoint that loads out of the box; it has a short
# context window, so swap in a larger model for real use.
llm_model_name = 'gpt2'  # Replace with appropriate model
llm_pipeline = pipeline('text-generation', model=llm_model_name)

# Embedding model used for queries; must match the model used to build the index.
_QUERY_ENCODER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Holds the loaded encoder so the model is instantiated once, not on every query.
_query_encoder_cache = {}


# Function to query the vector database and generate a response
def query_rag_system(query, top_k=5):
    """Answer ``query`` using documents retrieved from the Pinecone ``index``.

    Steps: embed the query, fetch the ``top_k`` nearest vectors, join the text
    of the matching entries of the module-level ``documents`` dict into a
    context string, and pass context plus question to ``llm_pipeline``.

    Args:
        query: The question, as a string.
        top_k: Number of nearest vectors to request from the index.

    Returns:
        The ``'generated_text'`` field of the first result returned by
        ``llm_pipeline``.
    """
    encoder = _query_encoder_cache.get(_QUERY_ENCODER_NAME)
    if encoder is None:
        encoder = SentenceTransformer(_QUERY_ENCODER_NAME)
        _query_encoder_cache[_QUERY_ENCODER_NAME] = encoder

    # Encode the bare string (not a one-element list) so the result is a single
    # flat vector, which is what a single-vector index query expects.
    query_vector = encoder.encode(query).tolist()
    search_results = index.query(vector=query_vector, top_k=top_k)

    # Retrieve documents
    retrieved_docs = []
    for result in search_results['matches']:
        doc_id = result['id']
        if doc_id not in documents:
            # Tolerate chunk-style IDs of the form "<doc name>#<n>".
            doc_id = doc_id.rsplit('#', 1)[0]
        # The index outlives the notebook session, so it can return IDs that are
        # not in the current `documents`; skip those, and skip repeats.
        if doc_id in documents and doc_id not in retrieved_docs:
            retrieved_docs.append(doc_id)

    # Generate response using LLM
    context = " ".join(documents[doc_id] for doc_id in retrieved_docs)
    response = llm_pipeline(f"Context: {context}\nQuestion: {query}\nAnswer: ")

    return response[0]['generated_text']

# Example usage
# "document X" is a placeholder question -- replace it with one about your own documents.
query = "What is the content of document X?"
# Uses the default top_k=5 when retrieving matches from the index.
response = query_rag_system(query)
print(response)
