## Install Libraries

In [6]:
! pip install --quiet --upgrade chromadb
! pip install --quiet --upgrade pandas
! pip install --quiet --upgrade llama-index
! pip install --quiet --upgrade openai

## Setup Model
#### You must create an `OPENAI_API_KEY` environment variable containing your API key. See https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety

In [None]:
import os
import openai

# Read the key from the environment so it never appears in the notebook itself.
# A missing or empty variable is reported with an explicit message instead of a
# bare KeyError (missing) or a silently accepted empty key (set but blank).
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it as an environment variable "
        "before starting the notebook kernel."
    )

## Create Chroma Client

In [3]:
import chromadb
# Create a Chroma client using the library's default settings; the
# "Create Collection" cell below uses it to create a collection.
# NOTE(review): the default client is presumably in-memory / non-persistent —
# confirm against the Chroma docs if data must survive a kernel restart.
chroma_client = chromadb.Client()


## Create Collection

In [12]:
# get_or_create=True makes this cell safe to re-run in the same kernel:
# without it, create_collection fails when "my_collection" already exists.
collection = chroma_client.create_collection(
    name="my_collection",
    get_or_create=True,
)

## Add Documents to Collection

#### Create Dataframes

In [4]:
import pandas as pd

# Folder holding the CSV exports; the trailing slash is required because the
# file names are appended to it directly.
relativePath = "data/"

# One DataFrame per CSV export.
events = pd.read_csv(relativePath + "events.csv")
ostem = pd.read_csv(relativePath + "ostems.csv")
pathways = pd.read_csv(relativePath + "pathways.csv")
research = pd.read_csv(relativePath + "solicitations.csv")

#### Events

In [None]:
# Pull the columns of interest out of the events table into individual variables.
(
    idEvents,
    titleEvents,
    descriptionEvents,
    urlEvents,
    typeEvents,
) = (events[column] for column in ("ID", "Title", "Description", "URL", "Type"))

# Render the events table below the cell.
display(events)

#### OSTEM

In [None]:
# Pull the columns of interest out of the OSTEM table into individual variables.
(
    idOstem,
    titleOstem,
    descriptionOstem,
    urlOstem,
    typeOstem,
) = (ostem[column] for column in ("ID", "Title", "Description", "URL", "Type"))

# Render the OSTEM table below the cell.
display(ostem)

#### Pathways

In [None]:
# Pull the columns of interest out of the pathways table into individual variables.
idPathways = pathways["ID"]
titlePathways = pathways["Title"]
educationLevel = pathways["Education Level"]
urlPathways = pathways["URL"]
# Column lookup, not a slice: the original `pathways[:"Majors"]` asked pandas for
# a row slice ending at the label "Majors" instead of selecting the "Majors" column.
majorsPathways = pathways["Majors"]

# Render the pathways table below the cell.
display(pathways)

#### Research

In [None]:
# Pull the columns of interest out of the solicitations table into individual variables.
idResearch = research["ID"]
titleResearch = research["Solicitation Title"]
status = research["Status"]
# The original cell stored this column in `idResearch` too, which overwrote the
# "ID" column read above. It now has its own name so both remain available and
# `idResearch` matches idEvents / idOstem / idPathways (all read the "ID" column).
solicitationIdResearch = research["Solicitation ID"]
urlResearch = research["URL"]

# Render the solicitations table below the cell.
display(research)

#### Create Nodes

In [None]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

# Re-read the OSTEM export so this cell does not depend on earlier cells.
ostem = pd.read_csv("data/ostems.csv")

# Create Chroma client
client = chromadb.Client()

# Create embedding function (using default)
embedding_function = embedding_functions.DefaultEmbeddingFunction()

# Create collection; get_or_create=True lets the cell be re-run in the same kernel.
collection = client.create_collection(
    name="nasa_ostem",
    embedding_function=embedding_function,
    get_or_create=True
)

# Prepare data for Chroma.
# Rows without a description have nothing to embed (pandas reads blanks as NaN,
# which is not a valid document), so they are left out.
ostem_nodes = ostem.dropna(subset=["Description"])

documents = ostem_nodes["Description"].astype(str).tolist()
ids = ostem_nodes["ID"].astype(str).tolist()

# One metadata dict per document. Missing metadata cells become empty strings so
# that no NaN value is handed to Chroma. A plain comprehension is used instead of
# DataFrame.apply(axis=1) so an empty table simply yields an empty list.
metadatas = [
    {"id": node_id, "title": title, "url": url, "type": node_type}
    for node_id, title, url, node_type in zip(
        ids,
        ostem_nodes["Title"].fillna(""),
        ostem_nodes["URL"].fillna(""),
        ostem_nodes["Type"].fillna(""),
    )
]

# Write data to the collection.
# upsert (rather than add) keeps the cell idempotent: re-running it refreshes the
# records for IDs that are already stored instead of trying to add them again.
# The guard skips the call when there is nothing to write.
if ids:
    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )


In [5]:
# Querying an empty collection does not fail; it returns empty result lists.
# Surface that case so it is not mistaken for "no matching documents".
if collection.count() == 0:
    print("Collection is empty: add documents before querying.")

# Retrieve the three stored documents closest to the query text.
results = collection.query(
    query_texts=["internship opportunities"],
    n_results=3
)

display(results)


{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}