In [1]:
# Mount Google Drive (optional if you want to store/save files)
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
!pip install google-cloud-storage pandas sentence-transformers faiss-cpu transformers streamlit

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting streamlit
  Downloading streamlit-1.48.0-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloadin

In [2]:
import sys

# Project folder on the mounted Drive (adjust the path to your own layout).
# Presumably this is where the `utils` module imported by later cells lives.
PROJECT_DIR = '/content/drive/MyDrive/Task3_ArxivChatbot'

# Register the folder only once: without the guard, every re-run of this cell
# would append another duplicate entry to sys.path.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)


In [3]:
import os

import faiss

from utils import load_cs_papers, build_embeddings, build_faiss_index

# Input and output locations (adjust METADATA_PATH to your Drive layout).
METADATA_PATH = "/content/drive/MyDrive/Task3_ArxivChatbot/data/arxiv-metadata-oai-snapshot.json"
INDEX_DIR = "models"
INDEX_PATH = os.path.join(INDEX_DIR, "faiss_index.bin")

# Load data
df = load_cs_papers(METADATA_PATH)
print(f"Loaded {len(df)} CS papers")

# One searchable string per paper: "<title>. <abstract>".
# Missing fields are replaced by empty strings so that no NaN (a float) ends
# up in `texts`. Assumes `df` is a pandas DataFrame -- TODO confirm in utils.
texts = (df['title'].fillna("") + ". " + df['abstract'].fillna("")).tolist()

# Build embeddings
embedder, embeddings = build_embeddings(texts)

# Build the FAISS index and check it before saving. The search cell passes
# both `index` and `texts` to semantic_search, so the two are expected to line
# up one-to-one; fail loudly here rather than return mismatched hits later.
index = build_faiss_index(embeddings)
assert index.ntotal == len(texts), (
    f"index holds {index.ntotal} vectors but there are {len(texts)} texts"
)

os.makedirs(INDEX_DIR, exist_ok=True)
faiss.write_index(index, INDEX_PATH)

print(f"FAISS index saved with {index.ntotal} vectors")


Loaded 13740 CS papers


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/430 [00:00<?, ?it/s]

FAISS index saved with 13740 vectors


In [6]:
# Helpers from the project's `utils` module, grouped by how the next cell
# uses them. The first line repeats names already imported when the index was
# built; re-importing them is harmless.
from utils import load_cs_papers, build_embeddings, build_faiss_index
from utils import semantic_search  # required by the search cell below
from utils import get_summarizer, summarize_text
from utils import get_explainer, generate_explanation

# Free-text query used for the search, summary and explanation that follow.
query = "neural networks"


In [7]:
# How many hits to retrieve, and how many characters of each hit to preview.
TOP_K = 3
PREVIEW_CHARS = 500

summarizer = get_summarizer()
explainer = get_explainer()

results = semantic_search(query, embedder, index, texts, top_k=TOP_K)

print("\nTop results for query:", query)
for rank, res in enumerate(results, start=1):
    # Mark the preview as truncated only when text was actually cut off;
    # shorter abstracts are shown in full without a trailing "...".
    suffix = "..." if len(res) > PREVIEW_CHARS else ""
    print(f"\nResult {rank}:\n{res[:PREVIEW_CHARS]}{suffix}\n")

# Summary and explanation both need at least one hit; guard so that an empty
# result list produces a message instead of an IndexError on results[0].
if len(results) > 0:
    summary = summarize_text(summarizer, results[0])
    print("Summary of first paper:")
    print(summary)

    explanation = generate_explanation(explainer, query, results)
    print("Expert explanation:")
    print(explanation)
else:
    print("No results found; skipping summary and explanation.")

Device set to use cpu
Device set to use cpu
Your max_length is set to 150, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)



Top results for query: neural networks

Result 1:
Computer Model of a "Sense of Humour". II. Realization in Neural
  Networks.   The computer realization of a "sense of humour" requires the creation of an
algorithm for solving the "linguistic problem", i.e. the problem of recognizing
a continuous sequence of polysemantic images. Such algorithm may be realized in
the Hopfield model of a neural network after its proper modification.
...


Result 2:
Option Pricing Using Bayesian Neural Networks.   Options have provided a field of much study because of the complexity
involved in pricing them. The Black-Scholes equations were developed to price
options but they are only valid for European styled options. There is added
complexity when trying to price American styled options and this is why the use
of neural networks has been proposed. Neural Networks are able to predict
outcomes based on past data. The inputs to the networks here are stock
vo...


Result 3:
Hybrid Neural Network Architectu

Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Expert explanation:
The computer realization of a "sense of Humour" requires the creation of an algorithm for solving the "linguistic problem", i.e. the problem of recognizing a continuous sequence of polysemantic images. Such algorithm may be realized in the Hopfield model of a neural network after its proper modification. Option Pricing Using Bayesian Neural Networks. Options have provided a field of much study because of the complexity involved in pricing them. The Black-Scholes equations were developed to price options but they are only valid for European styled options. There is added complexity when trying to price American styled options and this is why the use of neural networks has been proposed. Neural Networks are able to predict outcomes based on past data. The inputs to the networks here are stock volatility, strike price and time to maturity with the output of the network being the call option price. There are two techniques for Bayesian neural networks used. One is Autom