#Streamlit with Retrieval-Augmented Generation (RAG) Using Project Datasets

#Step 1: Environment Setup

In [1]:
!pip install pandas openpyxl numpy matplotlib seaborn scikit-learn




In [2]:
!pip install streamlit wikipedia-api sentence-transformers pyngrok

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylin

In [3]:
!pip install sentence-transformers faiss-cpu pandas openpyxl


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


#Step 2: Implement the Retrieval-Augmented Generation Pipeline

Load and Preprocessing Dataset

In [4]:
import pandas as pd

# Load the dataset
file_path = "/content/online_retail_II.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Display the first few rows
df.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


Load Sentence Transformer & Generate Embeddings

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Small & efficient model

# Select a text column to generate embeddings (modify if needed)
text_data = df["Description"].astype(str).tolist()

# Generate embeddings
embeddings = model.encode(text_data, convert_to_numpy=True)

# Save embeddings for future use
np.save("/content/text_embeddings.npy", embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 Build and Store a FAISS Index

In [6]:
import faiss

# Define the dimensionality of embeddings
dimension = embeddings.shape[1]

# Create a FAISS index (L2 normalized for better accuracy)
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(embeddings)

# Save the FAISS index for reuse
faiss.write_index(index, "/content/faiss_index.bin")


Implementing a Retrieval Function

In [15]:
import faiss
import numpy as np

def retrieve_similar_texts(query, top_k=5):
    # Load the FAISS index
    index = faiss.read_index("/content/faiss_index.bin")

    # Convert query into an embedding
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Increase search space to fetch more diverse results
    search_k = min(len(text_data), top_k * 3)  # Fetch 3x candidates
    distances, indices = index.search(query_embedding, search_k)

    # Retrieve unique results from the dataset
    seen = set()
    results = []

    for i in indices[0]:
        if i < len(text_data) and text_data[i] not in seen:
            results.append(text_data[i])
            seen.add(text_data[i])
        if len(results) == top_k:
            break  # Stop once we collect enough unique results

    # Print results for debugging
    print(f"🔍 Query: {query}")
    print(f"📌 Most Relevant Results: {results}")

    return results

# Example Query
query = "bag"
top_results = retrieve_similar_texts(query)

print("🔍 Most Relevant Results:", top_results)


🔍 Query: bag
📌 Most Relevant Results: ['POSY SHOPPER BAG']
🔍 Most Relevant Results: ['POSY SHOPPER BAG']


#Step3: Build the Streamlit Interface

In [16]:
%%writefile app.py
import streamlit as st
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the dataset
file_path = "/content/online_retail_II.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Remove duplicate descriptions
df = df.drop_duplicates(subset=["Description"])
text_data = df["Description"].astype(str).tolist()

# Load a more powerful Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Generate embeddings
embeddings = model.encode(text_data, convert_to_numpy=True)

# Create and save FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
faiss.write_index(index, "/content/faiss_index.bin")

# Function to retrieve similar texts
def retrieve_similar_texts(query, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for i in indices[0]:
        if i < len(text_data):  # Prevent index errors
            results.append(text_data[i])

    return results

# Streamlit App UI
st.title("🔍 AI-Powered Product Search")

# Text input for user query
user_query = st.text_input("Enter a product name or description:")

# Optional filters
top_k = st.slider("Number of results", 1, 10, 5)
category_filter = st.selectbox("Filter by Category (Optional)", ["All", "Electronics", "Clothing", "Home Essentials"])

# Search button
if st.button("Search"):
    if user_query:
        results = retrieve_similar_texts(user_query, top_k)
        st.write("### Most Relevant Results:")
        for res in results:
            st.write("- " + res)
    else:
        st.warning("Please enter a query before searching!")


Overwriting app.py


In [17]:
!pip install streamlit pyngrok --quiet

In [18]:
!wget -q -O ngrok.zip https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

In [19]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.16.154.239:8501[0m
[0m
[34m  Stopping...[0m
^C


#Step 4: Deploy the Application Using ngrok

In [20]:
!pip install pyngrok




In [21]:
!ngrok authtoken 2svEijc2vzFgz33NDuZwlBEfdjL_7sJudSucFk2B48JkFayyZ


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [22]:
!curl https://loca.lt/mytunnelpassword

34.16.154.239

In [23]:
!streamlit run app.py &>/content/logs.txt &

In [24]:
!npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20Gy

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://clean-rice-obey.loca.lt
^C
