# MakerSpace Jam

In [1]:
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

## Primary Dependencies and Context Setting

In [2]:
!pip3 install -U -q openai==0.27.8 llama-index==0.8.6 nltk==3.8.1 python-dotenv

### Load the OPENAI API key

In [3]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]

### Context setting

In [4]:
from llama_index import ServiceContext
from llama_index.node_parser.simple import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.llms import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()
chunk_size = 1000
llm = OpenAI(
    temperature=0, 
    model="gpt-3.5-turbo",
    streaming=True
)

service_context = ServiceContext.from_defaults(
    llm=llm, 
    chunk_size=chunk_size,
    embed_model=embed_model
)

text_splitter = TokenTextSplitter(
    chunk_size=chunk_size
)

node_parser = SimpleNodeParser(
    text_splitter=text_splitter
)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


## Data Loading

### Wikipedia

In [5]:
!pip3 install -U -q wikipedia  

## Setup Vector Store

This is intended to be a global vector store to insert the nodes from all data sources

In [7]:
!pip3 install -U -q chromadb==0.4.6 tiktoken==0.4.0 sentence-transformers==2.2.2 pydantic==1.10.11

In [8]:
from llama_index import VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import chromadb

In [9]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("all_data")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex([], storage_context=storage_context, service_context=service_context)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


### Setup Metadata Filtering

First, we need to create our `VectoreStoreInfo` object which will hold all the relevant metadata we need for each component (in this case title metadata).

Notice that you need to include it in a text list.

In [15]:
from llama_index.tools import FunctionTool
from llama_index.vector_stores.types import (
    VectorStoreInfo,
    MetadataInfo,
    ExactMatchFilter,
    MetadataFilters,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine

from typing import List, Tuple, Any
from pydantic import BaseModel, Field

# top_k = 3

# vector_store_info = VectorStoreInfo(
#     content_info="information about NBA players, teams, games, organizations, etc.",
#     metadata_info=[MetadataInfo(
#         name="title",
#         type="str",
#         description="info of player, team, organization, games, etc.",
#     )]
# )

# INSERTING METADATA WITH EXAMPLES

### Construct Nodes and Insert-to-Store

In [10]:
from llama_index import Document
from llama_index.schema import MetadataMode

In [11]:
docPano = Document(
    text='NBA Player born Nov 1989 in Greece', 
    metadata={
        'type': 'player', 
        'number': '42', 
        'team': 'surfers', 
        'position': 'floating point guard',
        'specialty': 'eurostepping',
        'secret': 'loves pizza'
    }
)
docChristos = Document(
    text='NBA Player born Dec 1994 in the USA', 
    metadata={
        'type': 'player', 
        'number': '3', 
        'team': 'climbers', 
        'position': 'strong forward pass',
        'specialty': 'ally-oop',
        'secret': 'loves cats, candy and cartoons'
    },
    excluded_llm_metadata_keys=['secret'],
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

print("The LLM sees this: \n", docChristos.get_content(metadata_mode=MetadataMode.LLM))
print("The Embedding model sees this: \n", docChristos.get_content(metadata_mode=MetadataMode.EMBED))

The LLM sees this: 
 Metadata: type=>player::number=>3::team=>climbers::position=>strong forward pass::specialty=>ally-oop
-----
Content: NBA Player born Dec 1994 in the USA
The Embedding model sees this: 
 Metadata: type=>player::number=>3::team=>climbers::position=>strong forward pass::specialty=>ally-oop::secret=>loves cats, candy and cartoons
-----
Content: NBA Player born Dec 1994 in the USA


In [12]:
docSample = [docPano, docChristos]

In [13]:
vector_index.insert_nodes(docSample)

In [42]:
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,
    vector_store_query_mode="default",
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(key="team", value="climbers"),
        ]
    ),
    alpha=None,
    doc_ids=None,
)
response = query_engine.query("when was player born?")
print(response.response)
response.metadata

The player was born in December 1994.


{'c391a3b8-334f-4820-80a4-af03e2902704': {'type': 'player',
  'number': '3',
  'team': 'climbers',
  'position': 'strong forward pass',
  'specialty': 'ally-oop',
  'secret': 'loves cats, candy and cartoons'}}

Attempting email doc addition

In [48]:
docEmailSample = Document(
    text="Hey Steph, let's grab dinner after the game on October 24th, KD of the Phoenix Suns", 
    metadata={
        'type': 'email',
        'datetime': "2023-09-01T15:30:00Z",
        'email_from': 'kevin.durant@suns.nba',
        'email_to': 'stephen.curry@warriors.nba',         # idk how to get list of metadata
        'team': 'warriors'
    }
)
docEmailSample2 = Document(
    text="Yo, you were a monster last year, can't wait to play against you in the opener! Bron Bron", 
    metadata={
        'type': 'email',
        'datetime': "2023-09-01T02:30:00Z",
        'email_from': 'lebron.james@lakers.nba',
        'email_to': 'nikola.jokic@nuggets.nba',         # idk how to get list of metadata
        'team': 'nuggets'
    }
)
docScheduleSample = Document(
    text="""
        {
            "Date": "Tue Oct 24 2023",
            "Start (ET)": "7:30p",
            "Visitor/Neutral": "Los Angeles Lakers",
            "PTS": "",
            "Home/Neutral": "Denver Nuggets",
            "PTS": "",
            "Attend.": "",
            "Arena": "Ball Arena",
            "Notes": ""
        }
    """,
    metadata={
        'type': 'schedule',
        'teams': 'lakers' # idk how to get list of metadata
    }
)
docScheduleSample2 = Document(
    text="""
        {
            "Date": "Tue Oct 24 2023",
            "Start (ET)": "10:00p",
            "Visitor/Neutral": "Phoenix Suns",
            "PTS": "",
            "Home/Neutral": "Golden State Warriors",
            "PTS": "",
            "Attend.": "",
            "Arena": "Chase Center",
            "Notes": ""
        }
    """,
    metadata={
        'type': 'schedule',
        'team': 'warriors' # idk how to get list of metadata
    }
)


In [49]:
docAdditionalSamples = [docEmailSample, docEmailSample2, docScheduleSample, docScheduleSample2]
vector_index.insert_nodes(docAdditionalSamples)

In [43]:
# query_engine SAME
response = query_engine.query("Which teams are we playing this month? Do I have any email communication indicating a meeting with any of the players of those teams")
print(response.response)
response.metadata

I'm sorry, but I cannot answer the query as it goes beyond the provided context information.


{'c391a3b8-334f-4820-80a4-af03e2902704': {'type': 'player',
  'number': '3',
  'team': 'climbers',
  'position': 'strong forward pass',
  'specialty': 'ally-oop',
  'secret': 'loves cats, candy and cartoons'}}

In [50]:
query_engine = vector_index.as_query_engine(
    similarity_top_k=1,
    vector_store_query_mode="default",
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(key="team", value="warriors"),
        ]
    ),
    alpha=None,
    doc_ids=None,
)
# query_engine NEW team=warriors
response = query_engine.query("Which teams are we playing this month? Do I have any email communication indicating a meeting with any of the players of those teams")
print(response.response)
response.metadata

You have an email from Kevin Durant of the Phoenix Suns, who is not a player on any of the teams you are playing this month. Therefore, there is no email communication indicating a meeting with any players from the teams you are playing this month.


{'b6cb9b88-60b2-4c13-9301-656ba4e852ab': {'type': 'email',
  'datetime': '2023-09-01T15:30:00Z',
  'email_from': 'kevin.durant@suns.nba',
  'email_to': 'stephen.curry@warriors.nba',
  'team': 'warriors'}}

# gotta fix this somehow using engineering to tag all teams relating to a node

---