In [1]:
%%capture
# Install the notebook's dependencies; %%capture hides pip's output for this cell.
# NOTE(review): qdrant-client is requested twice (here and as `qdrant_client` two lines below).
!pip install qdrant-client pymongo
!pip install langchain_community pypdf langchain_text_splitters langchain_experimental
!pip install llama-index llama-index-vector-stores-qdrant llama-index-readers-file llama-index-embeddings-fastembed qdrant_client fastembed
# NOTE(review): openai is the only package with a pinned version; the others install whatever is latest.
!pip install openai==1.14.1

# Qdrant Client (Retriever)

In [2]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Document, Settings
import os
from qdrant_client import QdrantClient
import openai

# Name of the Qdrant collection that backs the index used throughout the notebook.
COLLECTION_NAME = "VCPilot"

# Set up Qdrant client for vector store.
# The endpoint and API key are read from the environment so that credentials
# never have to be written into the notebook; when the variables are unset the
# empty-string fallbacks reproduce the previously hard-coded values.
# NOTE(review): this assignment rebinds the name `qdrant_client`, shadowing the
# module imported at the top of the cell. The name is kept as-is because other
# cells may reference it.
qdrant_client = QdrantClient(
    url=os.environ.get("QDRANT_URL", ''),
    api_key=os.environ.get("QDRANT_API_KEY", ""),
)

# Embedding model for vector insertion; also registered as the global default
# so that code relying on Settings uses the same model.
embed_model = FastEmbedEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5")
Settings.embed_model = embed_model

# Vector store bound to the collection above.
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME
)

# Build the index on top of the existing vector store and expose a retriever
# with the library's default retrieval settings.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model
)
retriever = index.as_retriever()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

1_Pooling/config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/548M [00:00<?, ?B/s]

model_quantized.onnx:   0%|          | 0.00/138M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker

In [38]:
# Smoke test: insert a throwaway document and check that a query returns it.
# NOTE(review): this writes "asdf" into the same index (COLLECTION_NAME) that the
# later ingestion and retrieval cells use — confirm the test document is wanted there.
d1 = Document(text="asdf")
index.insert(d1)

# Indexing nodes[0] assumes the retriever returned at least one result.
nodes = retriever.retrieve("as")
nodes[0].text

'asdf'

In [39]:
# Second smoke test: insert another throwaway document and query for it.
# NOTE(review): like the previous cell, this adds a test document ("123") to the
# shared index — confirm it should remain there.
d2 = Document(text="123")
index.insert(d2)
# Indexing nodes[0] assumes the retriever returned at least one result.
nodes = retriever.retrieve("12")
nodes[0].text

'123'

## Mongo Reader (Ingestor)

In [3]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

class MongoDBQuery:
    """Convenience wrapper around a single MongoDB collection.

    The connection is opened (and verified with a ``ping``) as soon as the
    object is constructed. Call :meth:`close` when done, or use the object as
    a context manager so the client is released automatically::

        with MongoDBQuery("db", "coll", uri=...) as mongo:
            rows = mongo.query({"field": {"$exists": True}})

    Attributes:
        uri: Connection string passed to ``MongoClient``.
        db_name: Name of the database to open.
        collection_name: Name of the collection to query.
        client / db / collection: Live handles, or ``None`` when not connected.
    """

    def __init__(self, db_name, collection_name, uri="mongodb://localhost:27017/"):
        """Store the connection settings and connect immediately.

        Raises:
            ConnectionFailure: if the server cannot be reached.
        """
        self.uri = uri
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = None
        self.db = None
        self.collection = None
        self.connect()

    def connect(self):
        """Open the client, verify it with a ping, and bind db/collection.

        The instance attributes are only updated once the ping has succeeded.
        If the ping fails, the freshly created client is closed before the
        error is re-raised, so a failed attempt does not leave an open client
        behind.

        Raises:
            ConnectionFailure: if the server cannot be reached.
        """
        client = None
        try:
            client = MongoClient(self.uri)
            client.admin.command('ping')
        except ConnectionFailure:
            if client is not None:
                client.close()
            print("Failed to connect to MongoDB")
            raise
        self.client = client
        self.db = client[self.db_name]
        self.collection = self.db[self.collection_name]

    def close(self):
        """Close the client (if any) and reset the handles. Safe to call twice."""
        if self.client is not None:
            self.client.close()
        self.client = None
        self.db = None
        self.collection = None

    def __enter__(self):
        """Return the connected instance for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on exit; exceptions are not suppressed."""
        self.close()

    def query(self, query_filter=None):
        """Return all documents matching ``query_filter`` as a list.

        Args:
            query_filter: Filter passed to ``collection.find``; ``None`` means
                an empty filter (``{}``).

        Returns:
            list: The cursor fully materialised into a list.

        Raises:
            Exception: any error raised by the query is printed and re-raised.
        """
        if query_filter is None:
            query_filter = {}
        try:
            results = self.collection.find(query_filter)
            return list(results)
        except Exception as e:
            print(f"An error occurred during the query: {e}")
            raise


In [4]:
# Connect to the "papers_for_review" collection of the "arxiv" database.
# The connection string is read from the environment so it is never stored in
# the notebook; the empty-string fallback matches the previous literal.
mongo = MongoDBQuery(
    db_name="arxiv",
    collection_name="papers_for_review",
    uri=os.environ.get("MONGODB_URI", ""),
)
# Load every record that has an "abstract" field into memory as a list.
mongo_content = mongo.query(query_filter={"abstract": {"$exists": True}})
len(mongo_content)

5536

## Insert Mongo documents into the vector index

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)
from tqdm.auto import tqdm
from langchain_experimental.text_splitter import SemanticChunker


# Splitter used for full-paper PDFs: chunks of up to 1000 characters with
# 200 characters of overlap, measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker
# text_splitter = SemanticChunker(
#     embeddings=embed_model,
#     breakpoint_threshold_type="percentile"
# )

# Switches selecting what gets indexed for each paper.
use_abstracts = True
use_papers = False

# NOTE(review): every run inserts fresh documents, so re-running this cell
# appears to add the same abstracts again — confirm before re-executing.
for i, paper in tqdm(enumerate(mongo_content), total=len(mongo_content)):
    if i % 100 == 0:
        print(f"inserted {i} documents")
    if use_abstracts:
        text_content = paper["abstract"]
        doc = Document(text=text_content)
        index.insert(doc)
    if use_papers:
        # Records without a PDF link are skipped instead of raising KeyError.
        pdf_url = paper.get("pdf_url")
        if not pdf_url:
            continue
        pdf_loader = PyPDFLoader(pdf_url)
        documents = pdf_loader.load_and_split(
            text_splitter=text_splitter
        )
        # load_and_split returns a list of LangChain documents, while
        # index.insert takes one LlamaIndex Document at a time, so each chunk
        # is converted and inserted individually (text only, matching how the
        # abstracts are stored).
        for chunk in documents:
            index.insert(Document(text=chunk.page_content))

  0%|          | 0/5536 [00:00<?, ?it/s]

inserted 0 documents
inserted 100 documents
inserted 200 documents
inserted 300 documents
inserted 400 documents
inserted 500 documents


In [14]:
# `paper` is the loop variable left over from the ingestion loop, so this prints
# the abstract of whichever record that loop reached last.
print(paper["abstract"])

A major goal in neuroscience is to discover neural data representations that
generalize. This goal is challenged by variability along recording sessions
(e.g. environment), subjects (e.g. varying neural structures), and sensors
(e.g. sensor noise), among others. Recent work has begun to address
generalization across sessions and subjects, but few study robustness to sensor
failure which is highly prevalent in neuroscience experiments. In order to
address these generalizability dimensions we first collect our own
electroencephalography dataset with numerous sessions, subjects, and sensors,
then study two time series models: EEGNet (Lawhern et al., 2018) and TOTEM
(Talukder et al., 2024). EEGNet is a widely used convolutional neural network,
while TOTEM is a discrete time series tokenizer and transformer model. We find
that TOTEM outperforms or matches EEGNet across all generalizability cases.
Finally through analysis of TOTEM's latent codebook we observe that
tokenization enables genera

In [7]:
# Sanity check: query the index and display the raw retrieval results.
retriever.retrieve("neural data")

[NodeWithScore(node=TextNode(id_='b2f55837-d241-4508-88fd-e2ac30dca4dc', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='adccb960-2792-45c1-a34f-a957444963e6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='acea491b80dcc760c457b6eb57fb6a9e13b77506c682cf1a9591821812a93273')}, text="A major goal in neuroscience is to discover neural data representations that\ngeneralize. This goal is challenged by variability along recording sessions\n(e.g. environment), subjects (e.g. varying neural structures), and sensors\n(e.g. sensor noise), among others. Recent work has begun to address\ngeneralization across sessions and subjects, but few study robustness to sensor\nfailure which is highly prevalent in neuroscience experiments. In order to\naddress these generalizability dimensions we first collect our own\nelectroencephalography dataset with numerous sessions, subjects, a

In [12]:
from langchain import PromptTemplate

# Prompt with a single `{context}` placeholder that receives the retrieved documents.
# NOTE(review): "step 1" / "step 2" look like placeholders, and the question being
# asked is not part of the template — confirm this is the intended prompt.
prompt_template = PromptTemplate.from_template(
    template = """
USE TRIZ thinking
- step 1
- step 2

DOCUMENTS
{context}

"""
)

def get_relevant_documents(question):
    """Retrieve documents for ``question`` and join them into one string.

    The module-level ``retriever`` is queried, the results are ordered by
    descending score, newlines inside each document are replaced by spaces,
    and the documents are joined with one newline between them.

    Results whose ``score`` is ``None`` are placed after all scored results
    instead of making the sort fail when ``None`` is compared with a float.

    Args:
        question: Query text passed to ``retriever.retrieve``.

    Returns:
        str: One line per retrieved document; ``""`` when nothing is retrieved.
    """
    def _rank(node):
        # Scored nodes sort ahead of unscored ones; 0.0 is only a stand-in
        # so that two unscored nodes compare equal and keep their order.
        return (node.score is not None, node.score or 0.0)

    ranked_nodes = sorted(retriever.retrieve(question), key=_rank, reverse=True)
    return "\n".join(node.text.replace("\n", " ") for node in ranked_nodes)

# Retrieve context for the query and fill it into the prompt template.
relevant_documents = get_relevant_documents("neural data")

prompt = prompt_template.format(
    context=relevant_documents,
)

# OpenAI-compatible client pointed at the Fireworks inference endpoint.
# The API key is read from the environment so it is never stored in the
# notebook; the empty-string fallback matches the previous literal.
client = openai.OpenAI(
    base_url = "https://api.fireworks.ai/inference/v1",
    api_key=os.environ.get("FIREWORKS_API_KEY", ""),
)
# Single-turn chat completion with temperature 0.
response = client.chat.completions.create(
  model="accounts/fireworks/models/mixtral-8x7b-instruct",
  temperature=0,
  max_tokens=4096,
  messages=[{
    "role": "user",
    "content": prompt,
  }],
)
print(response.choices[0].message.content)


TRIZ (Theory of Inventive Problem Solving) is a problem-solving, analysis, and forecasting tool derived from the study of patterns of invention in the global patent literature. Here's how you can apply TRIZ thinking to the given problem:

Step 1: Identify the problem

The problem is the lack of robustness of neural data representations to sensor failure in neuroscience experiments. While recent work has addressed generalization across sessions and subjects, there is a need to study and improve the robustness to sensor failure.

Step 2: Apply TRIZ tools to solve the problem

1. Contradictions analysis: Identify the conflicting requirements in the system. Here, the need for accurate neural data representations conflicts with the susceptibility to sensor failure.
2. 40 principles of invention: Explore TRIZ's 40 principles to find a suitable solution. Principle 1 (Segmentation) and Principle 15 (Dynamics) could be relevant here.
	* Segmentation: Divide the system into independent parts to 

In [13]:
# Display the fully formatted prompt that was sent to the model.
prompt

"\nUSE TRIZ thinking \n- step 1\n- step 2\n\nDOCUMENTS\nA major goal in neuroscience is to discover neural data representations that generalize. This goal is challenged by variability along recording sessions (e.g. environment), subjects (e.g. varying neural structures), and sensors (e.g. sensor noise), among others. Recent work has begun to address generalization across sessions and subjects, but few study robustness to sensor failure which is highly prevalent in neuroscience experiments. In order to address these generalizability dimensions we first collect our own electroencephalography dataset with numerous sessions, subjects, and sensors, then study two time series models: EEGNet (Lawhern et al., 2018) and TOTEM (Talukder et al., 2024). EEGNet is a widely used convolutional neural network, while TOTEM is a discrete time series tokenizer and transformer model. We find that TOTEM outperforms or matches EEGNet across all generalizability cases. Finally through analysis of TOTEM's lat