In [1]:
# ! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [85]:
import spacy
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from transformers import pipeline

In [86]:
# pip install sentence-transformers

In [87]:
# spaCy pipeline that supplies the named-entity recognizer
nlp = spacy.load("en_core_web_sm")

# Sentence-embedding model shared by diary entries and search queries
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Read the diary CSV, parsing the "Date" column into datetimes
diary_csv_path = "/Users/pandhari/ai-diary-project/Data/diary_dataset.csv"
df = pd.read_csv(diary_csv_path, parse_dates=["Date"])

def extract_entities(text):
    """Group the named entities spaCy finds in ``text`` by entity label.

    Only the PERSON, ORG, GPE, EVENT and DATE labels are tracked; entities
    carrying any other label are dropped.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        dict mapping each of the five tracked labels to a list of entity
        strings, in the order they appear in ``doc.ents``. A label with no
        matches maps to an empty list.
    """
    tracked_labels = ("PERSON", "ORG", "GPE", "EVENT", "DATE")
    grouped = {label: [] for label in tracked_labels}
    for span in nlp(text).ents:
        bucket = grouped.get(span.label_)
        if bucket is not None:
            bucket.append(span.text)
    return grouped

In [88]:
# Process diary entries: extract entities and embed each entry together with its date
data = []
embeddings = []

for _, row in df.iterrows():
    entry_text = row["Entry"]
    entities = extract_entities(entry_text)
    # Keep a space between the date and the entry text. Plain concatenation
    # fuses the last character of the date string with the first word of the
    # entry, so neither is tokenized cleanly by the embedding model.
    embedding = embed_model.encode(str(row["Date"]) + " " + entry_text).tolist()
    embeddings.append(embedding)

    data.append({
        "date": row["Date"],
        "entry": entry_text,
        "entities": entities
    })

# NOTE: this rebinds `df` from the raw CSV frame ("Date"/"Entry" columns) to the
# processed frame ("date"/"entry"/"entities" columns). Re-running this cell
# without first reloading the CSV therefore fails with KeyError on row["Entry"].
df = pd.DataFrame(data)

In [89]:
# Build a flat L2 (Euclidean) FAISS index over the diary-entry embeddings
embedding_dim = len(embeddings[0])  # dimensionality of a single embedding vector
index = faiss.IndexFlatL2(embedding_dim)
embedding_matrix = np.array(embeddings, dtype=np.float32)  # one row per diary entry
index.add(embedding_matrix)

def search_diary(query, top_k=1):
    """Return the diary entries whose embeddings are closest to ``query``.

    Args:
        query: Free-text search string; embedded with the module-level
            ``embed_model`` and looked up in the module-level FAISS ``index``.
        top_k: Maximum number of entries to return.

    Returns:
        List of entry texts taken from the "entry" column of ``df``, in the
        order FAISS returns them. Only the entry text is returned (no date or
        entities). The list may be shorter than ``top_k`` when the index
        holds fewer vectors.
    """
    query_vec = embed_model.encode([query])
    _distances, indices = index.search(query_vec, top_k)
    # FAISS fills missing neighbours with label -1 (e.g. top_k larger than the
    # number of indexed vectors). A bare `i < len(df)` check lets -1 through,
    # and df.iloc[-1] would then return the LAST entry as a bogus hit, so the
    # lower bound must be checked as well.
    return [df.iloc[i]["entry"] for i in indices[0] if 0 <= i < len(df)]

# LLM backend: LangChain's Ollama wrapper, configured by model name
from langchain_community.llms import Ollama

ollama_model_name = "llama3.2"
llm = Ollama(model=ollama_model_name)

In [90]:
# Prompt text: {context} receives the retrieved diary entries, {question} the user's query
diary_prompt_text = """
    You are an AI assistant that answers questions based on a personal diary.
    Here are some relevant diary entries:
    {context}
    
    Based on these, answer the following question:
    {question}
    """

prompt_template = PromptTemplate(
    template=diary_prompt_text,
    input_variables=["context", "question"],
)

# Chain that fills the prompt template and passes it to the LLM
llm_chain = LLMChain(prompt=prompt_template, llm=llm)


In [91]:
def ask_ai_diary(query, top_k=1):
    """Answer a question about the diary, using retrieved entries as context.

    Args:
        query: The user's question; also used as the retrieval query passed
            to ``search_diary``.
        top_k: Number of diary entries to retrieve as context. Defaults to 1,
            which matches the behaviour before this parameter existed.

    Returns:
        Whatever ``llm_chain.run`` returns for the filled-in prompt.
    """
    retrieved_entries = search_diary(query, top_k=top_k)
    # Use an explicit placeholder so the prompt never carries an empty context.
    context = "\n".join(retrieved_entries) if retrieved_entries else "No relevant diary entries found."
    response = llm_chain.run({"context": context, "question": query})
    return response

In [92]:
# Demo: ask about a specific date and print the model's answer
query = "What activities I did on April 1st, 2024?"
answer = ask_ai_diary(query)
print("🤖 AI Diary Response:", answer)

🤖 AI Diary Response: I don't have any information about specific dates or events from my personal diary. The entries you provided only mention that it was a day off, but they don't specify the date.

However, based on the content of the diary entries, I can tell you that the activities mentioned were:

* Sipping coffee at a cozy café
* Watching people go about their day
* Taking a spontaneous walk in the park

Unfortunately, without more specific information, I couldn't determine which April 1st, 2024, these activities took place on.
