In [26]:
# Connect to Weaviate Retriever and configure LLM
import dspy
from dspy.retrieve.weaviate_rm import WeaviateRM
import weaviate
import os

llm = dspy.OpenAI(model="gpt-3.5-turbo")
weaviate_client = weaviate.Client("http://localhost:8080",
    additional_headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY"),
    }
)
retriever_model = WeaviateRM("WeaviateBlogChunk", weaviate_client=weaviate_client)
# Assumes the Weaviate collection has a text key `content`
dspy.settings.configure(lm=llm, rm=retriever_model) 

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [29]:
# class_name = "WeaviateBlogChunk"

# # 클래스 삭제 시도
# try:
#     weaviate_client.schema.delete_class(class_name)
#     print(f"클래스 '{class_name}'가 성공적으로 삭제되었습니다.")
# except weaviate.exceptions.SchemaValidationException as e:
#     print(f"클래스 삭제 중 오류 발생: {e}")

클래스 'WeaviateBlogChunk'가 성공적으로 삭제되었습니다.


In [31]:
import weaviate.classes.config as wvcc
schema = {
  "classes": [
    {
      "class": "WeaviateBlogChunk",
      "vectorizer": "text2vec-openai",
      "properties": [
        {
          "name": "content",
          "dataType": ["text"],
        },
        {
          "name": "author",
          "dataType": ["text"],
        }
      ],
      "moduleConfig": {
        "text2vec-transformers": {
          "model": "text-embedding-3-large"
        }
      }
    }
  ]
}

# 스키마 생성
weaviate_client.schema.create(schema)

In [33]:
# ## Connection check and collection config(docker)
# response = weaviate_client.collections.list_all()

# print(weaviate_client.is_connected())
# print(response)

In [34]:
# Chunk Blogs

import os
import re

def chunk_list(lst, chunk_size):
    """Break a list into chunks of the specified size."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def split_into_sentences(text):
    """Split text into sentences using regular expressions."""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

def read_and_chunk_index_files(main_folder_path):
    """Read index.md files from subfolders, split into sentences, and chunk every 5 sentences."""
    blog_chunks = []
    for folder_name in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, folder_name)
        if os.path.isdir(subfolder_path):
            index_file_path = os.path.join(subfolder_path, 'index.mdx')
            if os.path.isfile(index_file_path):
                with open(index_file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    sentences = split_into_sentences(content)
                    sentence_chunks = chunk_list(sentences, 5)
                    sentence_chunks = [' '.join(chunk) for chunk in sentence_chunks]
                    blog_chunks.extend(sentence_chunks)
    return blog_chunks

# Example usage
main_folder_path = './blog'
blog_chunks = read_and_chunk_index_files(main_folder_path)


print(f"There are {len(blog_chunks)} chunks in the main folder path.")

There are 1413 chunks in the main folder path.


In [35]:
print(blog_chunks[0])

---
layout: post
title: What if you could understand your unstructured data? slug: understand-your-unstructured-data
authors: [laura]
date: 2021-01-20
tags: ['concepts']
image: ./img/hero.png
# canonical-url: https://medium.com/semi-technologies/what-if-you-could-understand-your-unstructured-data-92f0861e016
# canonical-name: Medium
description: "Learn how the AI-first vector database Weaviate unlocks the potential of unstructured data and why this is important."
---
![What if you could understand your unstructured data?](./img/hero.png)

<!-- truncate -->

## Intro
These days, more and more organizations are adopting a data-driven culture. Business processes and customer experience benefit from good data collection, management and analysis. But in order to really benefit from available data, it is essential to also understand the unstructured data, like free text in PDF documents, emails, invoices or voice transcriptions. Unstructured data is especially hard to index, manage and under

In [36]:
import weaviate
for blog_chunk in blog_chunks:
    blog_object = {
        "content": blog_chunk,
    }

    try:
        response = weaviate_client.data_object.create(
            data_object=blog_object,
            class_name="WeaviateBlogChunk"
        )
        print(f"삽입 성공, UUID: {response}")
    except Exception as e:
        print(f"데이터 삽입 중 오류 발생: {e}")

삽입 성공, UUID: 8d186bf2-cf71-4ac3-9c64-84688734fee5
삽입 성공, UUID: 34414982-161e-46e9-9472-382b877c2a9e
삽입 성공, UUID: 21aea85b-9fba-464c-8030-cc3178cd6401
삽입 성공, UUID: 87ca403c-b876-456c-b403-b0e437664af3
삽입 성공, UUID: daa9acc8-4b62-4a00-89ec-c38156e837f5
삽입 성공, UUID: fecc5bcb-9ce9-4581-b8f9-9fc6db2f6477
삽입 성공, UUID: 7f92e60b-7da7-4456-b70d-4fd2e8c0718e
삽입 성공, UUID: 9d28d8d1-927a-424b-a235-a7c66d5da565
삽입 성공, UUID: 45c0128d-3166-4251-a778-e81df40da599
삽입 성공, UUID: 50e7680b-bd7e-40bc-b661-a46776c79a0f
삽입 성공, UUID: ea18b8c6-c79b-4625-8ca7-61243e5b8b95
삽입 성공, UUID: 8bfe1c08-8637-46c7-956d-176e7f98b5d7
삽입 성공, UUID: 95dca9fe-b57e-499e-a185-bb96c7688f14
삽입 성공, UUID: ccca58c9-0fa0-4f26-aad9-1dcfd57980b8
삽입 성공, UUID: b0eab62a-74d1-4a3a-986e-b2b8451179e1
삽입 성공, UUID: 5ba92e90-da31-4eca-a6df-89aac19f4983
삽입 성공, UUID: d06d8f97-b1eb-4ab2-8034-adea823235c1
삽입 성공, UUID: b33166c3-6340-48d8-86f8-4ffa063a6b72
삽입 성공, UUID: 2d586a28-7715-4e5c-b690-1bb48ffa4a7b
삽입 성공, UUID: 927e4473-b7fa-4067-8f1d-62586bf1ab62


In [42]:
# BEFORE RAG, check dspy functions first.

print(dspy.settings.lm("Write a 3 line poem about neural networks."))
context_example = dspy.OpenAI(model="gpt-4-turbo-preview")

with dspy.context(llm=context_example):
    print(context_example("Write a 3 line poem about neural networks."))

['Neural networks, a web of connections so vast,\nLearning and adapting, from present to past,\nMimicking the brain, a future so vast.']
['In webs of neurons, thoughts intertwine,\nLearning from chaos, patterns they find.\nMinds of silicon, endlessly refined.']


In [43]:
# Load FAQs
import re

f = open("blog/faq.md")
markdown_content = f.read()

def parse_questions(markdown_content):
    # Regular expression pattern for finding questions
    question_pattern = r'#### Q: (.+?)\n'

    # Finding all questions
    questions = re.findall(question_pattern, markdown_content, re.DOTALL)

    return questions

# Parsing the markdown content to get only questions
questions = parse_questions(markdown_content)

# Displaying the first few extracted questions
questions[:5]  # Displaying only the first few for brevity

  f = open("blog/faq.md")


['Why would I use Weaviate as my vector database?',
 'What is the difference between Weaviate and for example Elasticsearch?',
 'Do you offer Weaviate as a managed service?',
 'How should I configure the size of my instance?',
 'Do I need to know about Docker (Compose) to use Weaviate?']

In [44]:
# ToDo, add random splitting -- maybe wrap this entire thing in a cross-validation loop
trainset = questions[:20] # 20 examples for training
devset = questions[20:30] # 10 examples for development
testset = questions[30:] # 14 examples for testing

trainset = [dspy.Example(question=question).with_inputs("question") for question in trainset]
devset = [dspy.Example(question=question).with_inputs("question") for question in devset]
testset = [dspy.Example(question=question).with_inputs("question") for question in testset]

print(devset[0])

Example({'question': 'Is there support to multiple versions of the query/document embedding models to co-exist at a given time? (helps with live experiments of new model versions)'}) (input_keys={'question'})


In [67]:
metricLM = dspy.OpenAI(model='gpt-4-turbo-preview', max_tokens=2048, model_type='chat')

# Signature for LLM assessments.

class Assess(dspy.Signature):
    """Assess the quality of an answer to a question."""
    
    context = dspy.InputField(desc="The context for answering the question.")
    assessed_question = dspy.InputField(desc="The evaluation criterion.")
    assessed_answer = dspy.InputField(desc="The answer to the question.")
    assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")

def llm_metric(gold, pred, trace=None):
    predicted_answer = pred.answer
    question = gold.question
    
    print(f"Test Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    
    detail = "Is the assessed answer detailed?"
    faithful = "Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
    overall = f"Please rate how well this answer answers the question, `{question}` based on the context.\n `{predicted_answer}`"
    
    with dspy.context(lm=metricLM):
        context = dspy.Retrieve(k=5)(question).passages
        detail = dspy.ChainOfThought(Assess)(context="N/A", assessed_question=detail, assessed_answer=predicted_answer)
        faithful = dspy.ChainOfThought(Assess)(context=context, assessed_question=faithful, assessed_answer=predicted_answer)
        overall = dspy.ChainOfThought(Assess)(context=context, assessed_question=overall, assessed_answer=predicted_answer)
    
    print(f"Faithful: {faithful.assessment_answer}")
    print(f"Detail: {detail.assessment_answer}")
    print(f"Overall: {overall.assessment_answer}")
    
    
    total = float(detail.assessment_answer) + float(faithful.assessment_answer)*2 + float(overall.assessment_answer)
    
    return total / 5.0

In [54]:
test_example = dspy.Example(question="What do cross encoders do?")
test_pred = dspy.Example(answer="They re-rank documents.")

type(llm_metric(test_example, test_pred))

Test Question: What do cross encoders do?
Predicted Answer: They re-rank documents.
Faithful: 5
Detail: 1
Overall: 5


float

In [68]:
test_example = dspy.Example(question="What do cross encoders do?")
test_pred = dspy.Example(answer="They index data.")

type(llm_metric(test_example, test_pred))

Test Question: What do cross encoders do?
Predicted Answer: They index data.
Faithful: 1
Detail: 1
Overall: 1


float

In [69]:
class GenerateAnswer(dspy.Signature):
    """Answer questions based on the context."""
    
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField()

In [74]:
import dspy

class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        # Initialize the retrieve step to get the top `num_passages` passages
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Initialize the generate step to answer the question
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        # Retrieve context based on the question
        context = self.retrieve(question).passages
        # Generate an answer based on the retrieved context and the question
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

my_question = "What are re-rankers in search engines?"
rag = RAG(num_passages=2)
pred = rag(my_question)

print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What are re-rankers in search engines?
Predicted Answer: Re-rankers in search engines are tools or models, such as Cross Encoders and Metadata Rankers, that are used to further refine the relevance of search results without requiring specialized training.
Retrieved Contexts (truncated): ['They offer the advantage of further reasoning about the relevance of results without needing specialized training. Cross Encoders can be interfaced with Weaviate to re-rank search results, trading off...', 'Taken directly from the paper, “Our findings indicate that cross-encoder re-rankers can efficiently be improved without additional computational burden and extra steps in the pipeline by explicitly ad...']


In [75]:
uncompiled_rag = RAG(num_passages=3)
print(uncompiled_rag("What are Cross Encoders?"))

Prediction(
    context=['[Cross Encoders](#cross-encoders) (collapsing the use of Large Language Models for ranking into this category as well)\n1. [Metadata Rankers](#metadata-rankers)\n1. [Score Rankers](#score-rankers)\n\n## Cross Encoders\nCross Encoders are one of the most well known ranking models for content-based re-ranking. There is quite a collection of pre-trained cross encoders available on [sentence transformers](https://www.sbert.net/docs/pretrained_cross-encoders.html). We are currently envisioning interfacing cross encoders with Weaviate using the following syntax.', 'Bi-Encoders are fast, but are not as accurate as the expensive fisherman aka the Cross-Encoder. Cross-Encoders are time-consuming, like the fisherman who would need to limit the number of fishing rounds they could do. So we can chain those two methods behind each other (see Figure 5). First, you use a Bi-Encoder to retrieve a *list of result candidates*, then you use a Cross-Encoder on this list of candid