# MakerSpace Jam

In [3]:
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

## Primary Dependencies and Context Setting

In [10]:
!pip3 install -U -q openai==0.27.8 llama-index==0.8.6 nltk==3.8.1 python-dotenv

### Load the OPENAI API key

In [4]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]

### Context setting

In [6]:
from llama_index import ServiceContext
from llama_index.node_parser.simple import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.llms import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()
chunk_size = 1000
llm = OpenAI(
    temperature=0, 
    model="gpt-3.5-turbo",
    streaming=True
)

service_context = ServiceContext.from_defaults(
    llm=llm, 
    chunk_size=chunk_size,
    embed_model=embed_model
)

text_splitter = TokenTextSplitter(
    chunk_size=chunk_size
)

node_parser = SimpleNodeParser(
    text_splitter=text_splitter
)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


## Data Loading

### Wikipedia

In [16]:
!pip3 install -U -q wikipedia  

### Web

In [45]:
# list of wikipedia pages to index
webpages = [ "https://www.basketball-reference.com/" ]

##### Simple Web Reader

In [None]:
!pip3 install html2text

In [None]:
from llama_index import SimpleWebPageReader, TrafilaturaWebReader

web_docs = SimpleWebPageReader(html_to_text=True).load_data([webpages[0]])
web_docs[0].text

##### Trafilatura Web Reader (seems better!!)

In [40]:
!pip3 install trafilatura

In [None]:
from llama_index import TrafilaturaWebReader

web_docs = TrafilaturaWebReader().load_data([webpages[0]])
web_docs[0].text

## Setup Vector Store

In [None]:
!pip3 install -U -q chromadb==0.4.6 tiktoken==0.4.0 sentence-transformers==2.2.2 pydantic==1.10.11

In [53]:
from llama_index import VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import chromadb

chroma_client = chromadb.Client()

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


#### Web data store

In [None]:
web_chroma_collection = chroma_client.create_collection("web_data")
web_vector_store = ChromaVectorStore(chroma_collection=web_chroma_collection)
web_storage_context = StorageContext.from_defaults(vector_store=web_vector_store)
web_vector_index = VectorStoreIndex([], storage_context=web_storage_context, service_context=service_context)

In [59]:
for w_doc in web_docs:
    # print(wiki_doc)
    nodes = node_parser.get_nodes_from_documents([w_doc])
    for node in nodes:
        node.metadata = {'title': 'Basketball Stats and History'}
    web_vector_index.insert_nodes(nodes)