# MakerSpace Jam

In [7]:
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

## Primary Dependencies and Context Setting

In [8]:
!pip3 install -U -q openai==0.27.8 llama-index==0.8.6 nltk==3.8.1 python-dotenv

### Load the OPENAI API key

In [9]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]

### Load token list

In [10]:
import pandas as pd

sheet_name = "GSW_token_list"
workbook_id = '1MB1ZsQul4AB262AsaY4fHtGW4HWp2-56zB-E5xTbs2A'
url = f'https://docs.google.com/spreadsheets/d/{workbook_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Staff,Job title,Token
0,Klay Thompson,Player,1bd8c9ee72e314e1dd8b2442e568fc3d
1,Jonathan Kuminga,Player,1bd8c9ee72e314e1dd8b2442e568fc3d
2,Chris Paul,Player,1bd8c9ee72e314e1dd8b2442e568fc3d
3,Stephen Curry,Player,1bd8c9ee72e314e1dd8b2442e568fc3d
4,Moses Moody,Player,1bd8c9ee72e314e1dd8b2442e568fc3d


In [11]:
token_list = df['Token'].tolist()

## Data Loading

### Webpage

Using the Trafilatura Web Reader.

In [12]:
!pip3 install trafilatura



In [13]:
# list of webpages to index
webpages = [ "https://www.basketball-reference.com/" ]

In [14]:
from llama_index import TrafilaturaWebReader

web_docs = TrafilaturaWebReader().load_data([webpages[0]])
web_docs[0].text

"Basketball Stats and History Statistics, scores, and history for the NBA, ABA, WNBA, and top European competition.\nEvery NBA & Every WNBA Player\nPlay Immaculate Grid\nPut your men's basketball knowledge to the test with our daily men's basketball trivia game. Can you complete the grid?\nRecent DebutsChance Comanche (POR), Jacob Gilyard (MEM), RaiQuan Gray (BRK), Justin Minaya (POR), Donovan Williams (ATL) and Jeenathan Williams (POR)\nNBA & ABA, WNBA, NBL, G League, and top International players\nIncludes indexed lists of players. International leagues include top European leagues and EuroLeague and EuroCup competitions, as well as China's CBA, Australia's NBL, and Men's Olympics.\nIn MemoriamHenry Dickerson, Charles Balentine, George Wilson, Henry Logan, Dedric Willoughby and Nikki McCray\nBorn On This DayRoger Jorgensen, Jesse Dark, Sam Mitchell, Red Owens, Marcus Morris and Tiny Archibald\nPlayer pages include basic statistics and links to player's game logs, splits, advanced sta

In [15]:
from llama_index.node_parser.simple import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(
    chunk_size=1000 ### YOUR CODE HERE
)

node_parser = SimpleNodeParser(
    text_splitter=text_splitter ### YOUR CODE HERE
)

# parse nodes from workbooks and insert into vector index
for w_doc in web_docs:
    nodes = node_parser.get_nodes_from_documents([w_doc])
    for node in nodes:
        node.metadata = {'title': 'Basketball Stats and History',
                         'type': 'webpage',
                         'url': 'https://www.basketball-reference.com/',
                         'description': 'Basketball Stats and History',
                         'accessibility': 'public'
                        }

### Online Workbooks

In [16]:
!pip3 install -q -U pandas

In [17]:
import pandas as pd

def get_df_from_workbook(sheet_name,
                         workbook_id = '1MB1ZsQul4AB262AsaY4fHtGW4HWp2-56zB-E5xTbs2A'):
    url = f'https://docs.google.com/spreadsheets/d/{workbook_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    return pd.read_csv(url)

In [18]:
sheet_names = ['Project_Metadata', 'Teams', 'Players_2023-24', 'Schedule_2023-24', 'Player_Stats_2022-23_(Playoffs)', 'Player_Stats_2022-23_(Regular_Season)']
dict_of_dfs = {sheet: get_df_from_workbook(sheet) for sheet in sheet_names}
dict_of_dfs['Project_Metadata'].head()

Unnamed: 0,Sheets,Description
0,Teams,All current NBA teams with historical stat totals
1,Players 2023-24,Current Player Data including #\tPlayer\tPos\t...
2,Schedule 2023-24,All scheduled games in the 2023-24 season with...
3,Player Stats 2022-23 (Playoffs),Player Stats 2022-23 (Playoffs)
4,Player Stats 2022-23 (Regular Season),Player Stats 2022-23 (Regular Season)


### Email Samples

In [19]:
from llama_index import Document
from llama_index.schema import MetadataMode

In [20]:
docEmailSample = Document(
    text="Hey Steph, let's grab dinner after the game on October 24th, KD of the Phoenix Suns", 
    metadata={
        'type': 'email',
        'datetime': "2023-09-01T15:30:00Z",
        'email_from': 'kevin.durant@suns.nba',
        'email_to': 'stephen.curry@warriors.nba',
        "token_list": token_list,
        'team': 'warriors'
    }
)
docEmailSample2 = Document(
    text="Yo, you were a monster last year, can't wait to play against you in the opener! Bron Bron", 
    metadata={
        'type': 'email',
        'datetime': "2023-09-01T02:30:00Z",
        'email_from': 'lebron.james@lakers.nba',
        'email_to': 'nikola.jokic@nuggets.nba',
        "token_list": token_list,
        'team': 'nuggets'
    }
)
docScheduleSample = Document(
    text="""
        {
            "Date": "Tue Oct 24 2023",
            "Start (ET)": "7:30p",
            "Visitor/Neutral": "Los Angeles Lakers",
            "PTS": "",
            "Home/Neutral": "Denver Nuggets",
            "PTS": "",
            "Attend.": "",
            "Arena": "Ball Arena",
            "Notes": ""
        }
    """,
    metadata={
        'type': 'schedule',
        "token_list": token_list,
        'teams': 'lakers' # idk how to get list of metadata
    }
)
docScheduleSample2 = Document(
    text="""
        {
            "Date": "Tue Oct 24 2023",
            "Start (ET)": "10:00p",
            "Visitor/Neutral": "Phoenix Suns",
            "PTS": "",
            "Home/Neutral": "Golden State Warriors",
            "PTS": "",
            "Attend.": "",
            "Arena": "Chase Center",
            "Notes": ""
        }
    """,
    metadata={
        'type': 'schedule',     
        "token_list": token_list,
        'team': 'warriors' # idk how to get list of metadata
############################ YOU ASK(??) FOR SMTH LIKE THIS:
        # 'list': [
        #     {'team': 'warriors'}, 
        #     {'team': 'lakers'},
        #     {'team': 'nuggets'},
        # ]
    }
)

docAdditionalSamples = [docEmailSample, docEmailSample2, docScheduleSample, docScheduleSample2]

### Example to play with

In [21]:
docPano = Document(
    text='NBA Player born Nov 1989 in Greece', 
    metadata={
        'type': 'player', 
        'number': '42', 
        'team': 'surfers', 
        'position': 'floating point guard',
        'specialty': 'eurostepping',
        'dress-style': 'barefoot',
        'secret': 'loves pizza'
    }
)
docChristos = Document(
    text='NBA Player born Dec 1994 in the USA', 
    metadata={
        'type': 'player', 
        'number': '3', 
        'team': 'climbers', 
        'position': 'strong forward pass',
        'specialty': 'ally-oop',
        'dress-style': 'pink-on-Wednesdays',
        'secret': 'loves cats, candy and cartoons'
    },
    excluded_llm_metadata_keys=['secret'],
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

docExamplesToPlayWith = [docPano, docChristos]

print("The LLM sees this: \n", docChristos.get_content(metadata_mode=MetadataMode.LLM))
print("The Embedding model sees this: \n", docChristos.get_content(metadata_mode=MetadataMode.EMBED))

The LLM sees this: 
 Metadata: type=>player::number=>3::team=>climbers::position=>strong forward pass::specialty=>ally-oop::dress-style=>pink-on-Wednesdays
-----
Content: NBA Player born Dec 1994 in the USA
The Embedding model sees this: 
 Metadata: type=>player::number=>3::team=>climbers::position=>strong forward pass::specialty=>ally-oop::dress-style=>pink-on-Wednesdays::secret=>loves cats, candy and cartoons
-----
Content: NBA Player born Dec 1994 in the USA


## Indexing

### Context setting

In [22]:
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()
chunk_size = 1000
llm = OpenAI(
    temperature=0, 
    model="gpt-3.5-turbo",
    streaming=True
)

service_context = ServiceContext.from_defaults(
    llm=llm, 
    chunk_size=chunk_size,
    embed_model=embed_model
)

### Vector Store

This is intended to be a global vector store to insert the nodes from all data sources

In [23]:
!pip3 install -U -q chromadb==0.4.6 tiktoken==0.4.0 sentence-transformers==2.2.2 pydantic==1.10.11

In [24]:
from llama_index import VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import chromadb

In [25]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("all_data")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex([], storage_context=storage_context, service_context=service_context)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


### Populate Vector Store with Nodes

Parse nodes for each loaded data source and insert it to the vector store.

In [26]:
# vector_index.insert_nodes(nodes) 
# vector_index.insert_nodes(docExamplesToPlayWith)
vector_index.insert_nodes(docAdditionalSamples)

ValueError: Value for metadata token_list must be one of (str, int, float, None)

### Setup Metadata Filtering

In [27]:
from llama_index.tools import FunctionTool
from llama_index.vector_stores.types import (
    VectorStoreInfo,
    MetadataInfo,
    ExactMatchFilter,
    MetadataFilters,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine

from typing import List, Tuple, Any
from pydantic import BaseModel, Field

# top_k = 3

vector_store_info = VectorStoreInfo(
    content_info="information about NBA players, teams, games, organizations, etc.",
    metadata_info=[MetadataInfo(
        name="title",
        type="str",
        description="info of player, team, organization, games, etc.",
    )]
)

## Query Engine

In [36]:
query_engine = vector_index.as_query_engine(
    similarity_top_k=2,
    vector_store_query_mode="default",
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(key="team", value="warriors"),
            [ExactMatchFilter(key="token_list", value=v)
             for v in token_list]
        ]
    ),
    alpha=None,
    doc_ids=None,
    # chat_mode='react',
)

In [37]:
# response = query_engine.query("Which teams are we playing this month? Do I have any email communication indicating a meeting with any of the players of those teams")
response = query_engine.query("Which teams are we playing this month?")
print(response.response)
response.metadata

We are playing against the Phoenix Suns this month.


{'92593b92-8af4-46ed-9e13-1fd246f9dda0': {'type': 'email',
  'datetime': '2023-09-01T15:30:00Z',
  'email_from': 'kevin.durant@suns.nba',
  'email_to': 'stephen.curry@warriors.nba',
  'team': 'warriors'},
 '06a69ec8-3ba5-44c1-964f-837f04e4bf5f': {'type': 'schedule',
  'team': 'warriors'}}