In [167]:
import pandas as pd
import numpy as np
df = pd.read_json("quotes.jsonl", lines=True)

In [168]:
df.head()

Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“So many books, so little time.”",Frank Zappa,"[books, humor]"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"


In [169]:
df.shape

(2508, 3)

In [170]:
df.isnull().sum()

quote     0
author    0
tags      0
dtype: int64

In [171]:
df["quote"] = df["quote"].str.lower()

In [172]:
df["author"] = df["author"].str.lower()

In [173]:
df["tags"] = df["tags"].apply(lambda x: [tag.lower() for tag in x])

In [174]:
df.sample()

Unnamed: 0,quote,author,tags
1752,“those who know do not speak. those who speak ...,"lao tsu,","[philosophy, wayfinding]"


In [175]:
df.to_csv("cleaned_quotes.csv", index=False)

In [176]:
# pip install sentence-transformers

In [177]:
df = pd.read_csv("cleaned_quotes.csv")

In [178]:
def merge_fields(row):
    return f"{row['quote']} - {row['author']} | tags: {', '.join(eval(row['tags']))}"

In [179]:
df['full_text'] = df.apply(merge_fields, axis=1)

In [180]:
df['full_text'].sample()

975    “you pierce my soul. i am half agony, half hop...
Name: full_text, dtype: object

In [181]:
from sentence_transformers import SentenceTransformer

In [182]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [183]:
corpus = df['full_text'].tolist()
embeddings = model.encode(corpus, show_progress_bar=True)

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 79/79 [00:28<00:00,  2.75it/s]


In [185]:
np.save("quote_embeddings.npy", embeddings)
df.to_csv("quote_metadata.csv", index=False)
model.save("quote_embedding_model")

In [186]:
# pip install faiss-cpu

In [187]:
embeddings = np.load("quote_embeddings.npy")
df = pd.read_csv("quote_metadata.csv")
model = SentenceTransformer("quote_embedding_model")

In [188]:
import faiss

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

In [204]:
def retrieve_quotes(query, k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, k)

    results = df.iloc[indices[0]].copy()

    # Optional: Normalize similarity (if you use it somewhere else)
    if np.max(distances[0]) != 0:
        results['similarity_score'] = 1 - distances[0] / np.max(distances[0])
    else:
        results['similarity_score'] = 1.0

    # Ensure 'tags' are Python lists, not strings
    if isinstance(results.iloc[0]["tags"], str):
        results["tags"] = results["tags"].apply(eval)

    return results


In [205]:
# pip install openai

In [206]:
# pip install replicate

In [207]:
import replicate

import os
os.environ["REPLICATE_API_TOKEN"] = "r8_QnpZp5ohlk3ts9s4SvBJAJMkEnr81ic3psh0N"

client = replicate.Client()

def generate_response_replicate(query, retrieved_df):
    context = "\n".join(
        f"Quote: {row['quote']}\nAuthor: {row['author']}\nTags: {row['tags']}"
        for _, row in retrieved_df.iterrows()
    )

    prompt = f"""
You are a strict JSON-generating assistant.

User Query: "{query}"

You are given the following context containing quotes. Return only the quotes that best match the query.

Respond ONLY with a valid JSON array using the following format:
[
  {{
    "quote": "string",
    "author": "string",
    "tags": ["string", ...]
  }},
  ...
]

Do NOT include any explanation or commentary. Return only a JSON array.

Context:
{context}
"""

    output = replicate.run(
        "stability-ai/stablelm-tuned-alpha-7b:943c4afb4d0273cf1cf17c1070e182c903a9fe6b372df36b5447cf45935c42f2",
        # "replicate/flan-t5-xl:eec2f71c986dfa3b7a5d842d22e1130550f015720966bec48beaae059b19ef4c",
        input={
            "prompt": prompt,
            "max_tokens": 500,
            "temperature": 0.7,
            "top_p": 0.9
        }
    )

    return "".join(output)
    



In [208]:
def rag_pipeline(query, k=5):
    retrieved_df = retrieve_quotes(query, k)
    llm_response = generate_response_replicate(query, retrieved_df)
    return llm_response, retrieved_df

In [209]:
query = "Motivational quotes tagged 'accomplishment'"
response_json, source_quotes = rag_pipeline(query) 

print("LLM Response:")
print(response_json)

LLM Response:
Here's an example output from generating a list of motivational quotes based on the input provided: [{"quote":"To succeed without success is meaningless; it signifies nothing.","author":"Albert Einstein","tags":["inspirational"]}, {"quote":"Belief gives directionality toward purpose - when we believe our path has meaning beyond ourselves alone—we’re making progress towards some greater goal.","author":"Buddhism","tags":["inspirational"]}, {"quote":"Be content


In [210]:
print(source_quotes)

                                                  quote               author  \
990   “success is not how high you have climbed, but...      roy t. bennett,   
2421  “the starting point of all achievement is desi...       napoleon hill,   
1118  “be grateful for what you already have while y...      roy t. bennett,   
749   “believe in yourself. you are braver than you ...      roy t. bennett,   
23    “to be yourself in a world that is constantly ...  ralph waldo emerson   

                                                   tags  \
990   [inspiration, inspirational, inspirational-att...   
2421                                    [inspirational]   
1118  [goal, goals, gratitude, happiness, inspiratio...   
749   [achievement, believe-in-yourself, brave, cour...   
23    [accomplishment, be-yourself, conformity, indi...   

                                              full_text  similarity_score  
990   “success is not how high you have climbed, but...          0.103744  
2421  “the s

In [211]:
faiss.write_index(index, "faiss_index.index")