In [7]:
import os

from llama_index.vector_stores import AstraDBVectorStore
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Document,
)
from llama_index.llama_dataset import download_llama_dataset
from dotenv import load_dotenv, find_dotenv

In [8]:
load_dotenv(find_dotenv())

True

In [9]:
from app.engine.loader import get_documents

docs = get_documents()

In [10]:
from app.engine.context import create_service_context, create_storage_context

service_context = create_service_context()  
storage_context = create_storage_context()

In [11]:
index = VectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
    storage_context=storage_context,
)

In [None]:
engine = index.as_chat_engine(similarity_top_k=5, chat_mode="condense_plus_context")
engine.chat("What's the integration of llama index?")

In [None]:
l = index.as_retriever().retrieve("What's the integration of llama index?")

In [None]:
l

In [9]:
from llama_index.node_parser import SemanticSplitterNodeParser

splitter = SemanticSplitterNodeParser(embed_model=service_context.embed_model)

nodes = splitter.get_nodes_from_documents([Document.example()])

In [11]:
for node in nodes:
    print(node.text)
    print("-" * 50)


Context
LLMs are a phenomenal piece of technology for knowledge generation and reasoning.
They are pre-trained on large amounts of publicly available data.
How do we best augment LLMs with our own private data?
We need a comprehensive toolkit to help perform this data augmentation for LLMs.

Proposed Solution
That's where LlamaIndex comes in. LlamaIndex is a "data framework" to help
you build LLM  apps. It provides the following tools:

Offers data connectors to ingest your existing data sources and data formats
(APIs, PDFs, docs, SQL, etc.)
Provides ways to structure your data (indices, graphs) so that this data can be
easily used with LLMs.
Provides an advanced retrieval/query interface over your data:
Feed in any LLM input prompt, get back retrieved context and knowledge-augmented output.

--------------------------------------------------
Allows easy integrations with your outer application framework
(e.g. with LangChain, Flask, Docker, ChatGPT, anything else).
LlamaIndex provides

In [13]:

from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser

SYSTEM_MESSAGE = """\
You are an expert in describing a sequence of continuous frames in a video. Given
several frames, provide a alternative text corresponding to the content, action, phenomenon, or
event in the video. 

The resulting text will be used as search index for the video chunk; therefore, it should be
concise and descriptive.
"""


def get_frames_to_text_chain():
    vision_llm = ChatOpenAI(
        model_name="gpt-4-vision-preview"
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=SYSTEM_MESSAGE),
            MessagesPlaceholder(variable_name="frames")
        ]
    )

    return prompt | vision_llm | StrOutputParser()

In [14]:
chain = get_frames_to_text_chain()

In [17]:
chain.invoke({
    "frames": [
        HumanMessage(content="No image provided"),
    ]
})

'To provide a description, I would need some information about the content of the video'