###  Load Data

In [1]:
import pandas as pd

DATA_FILE = "../data/WhatsAppCleaned/WhatsAppCombined.tsv"

def add_context(chat_df, col_to_cat='MESSAGE', new_col_name='CONTEXTUALIZED_MESSAGE', context_len=3, author_col='SENDER'):
  temp_col = col_to_cat+'_TMP'
  chat_df[temp_col] = chat_df[author_col] + ' ~ ' + chat_df[col_to_cat]

  neg_cols_added = [f'{temp_col}_neg_{i}' for i in range(1, 1 + context_len)]
  plus_cols_added = [f'{temp_col}_plus_{i}' for i in range(1, 1 + context_len)]

  for i in range(1, context_len + 1):
    chat_df[f'{temp_col}_plus_{i}'] = chat_df[temp_col].shift(-i)
    chat_df[f'{temp_col}_neg_{i}'] = chat_df[temp_col].shift(i)

  chat_df[new_col_name] = chat_df[[*neg_cols_added, temp_col, *plus_cols_added]].fillna('').agg('\n'.join,
                                                                                                axis=1).str.strip()
  chat_df.drop(columns=[temp_col, *neg_cols_added, *plus_cols_added], inplace=True)
  return chat_df


data = pd.read_csv(DATA_FILE, sep='\t', parse_dates=['DATETIME'])
data = add_context(data, col_to_cat='MESSAGE', new_col_name='CONTEXTUALIZED_MESSAGE', context_len=3)
# save to JSON so it can be read by timestore
data.to_json('../data/WhatsAppCleaned/WhatsAppCombined.json', 'table', index=False)
#preview the csv file
data.head()

  data = pd.read_csv(DATA_FILE, sep='\t', parse_dates=['DATETIME'])
  data.to_json('../data/WhatsAppCleaned/WhatsAppCombined.json', 'table', index=False)


Unnamed: 0,SENDER,MESSAGE,DATETIME,PLATFORM,CHAT,MSG_ID,CONTEXTUALIZED_MESSAGE
0,Arya GJ,Bruv group chat of group chat 😂😂😂,2024-02-24 17:30:00,WhatsApp,Hamza homies,WA Hamza homies 0,Arya GJ ~ Bruv group chat of group chat 😂😂😂\nZ...
1,Zareef Amyeen,since the big chat has lowk been saturated,2024-02-24 17:30:00,WhatsApp,Hamza homies,WA Hamza homies 1,Arya GJ ~ Bruv group chat of group chat 😂😂😂\n\...
2,Soham Raut,Yoo we can talk shit about everyone else letsgooo,2024-02-24 17:32:00,WhatsApp,Hamza homies,WA Hamza homies 2,Zareef Amyeen ~ since the big chat has lowk be...
3,Zareef Amyeen,@14255353544 can you let me in when i get there,2024-02-24 17:32:00,WhatsApp,Hamza homies,WA Hamza homies 3,Soham Raut ~ Yoo we can talk shit about everyo...
4,Soham Raut,Just kidding lmfao,2024-02-24 17:32:00,WhatsApp,Hamza homies,WA Hamza homies 4,Zareef Amyeen ~ @14255353544 can you let me in...


In [2]:
from datetime import datetime, timedelta

from timescale_vector import client

def create_uuid(date_string: str):
  if date_string is None:
    return None
  # 2024-02-24T17:30:00.000
  time_format = "%Y-%m-%dT%H:%M:%S.%f"
  datetime_obj = datetime.strptime(date_string, time_format)
  uuid = client.uuid_from_time(datetime_obj)
  return str(uuid)

def extract_metadata(record, metadata) -> dict:
  # metadata = dict()
  metadata["ID"] = create_uuid(record["DATETIME"])
  metadata["MSG_ID"] = record["MSG_ID"]
  metadata["DATETIME"] = record["DATETIME"] #datetime.strftime(record["DATETIME"], '%Y-%m-%d %H:%M')
  metadata["MESSAGE"] = record["MESSAGE"]
  metadata["SENDER"] = record["SENDER"]
  metadata["PLATFORM"] = record["PLATFORM"]
  metadata["CHAT"] = record["CHAT"]

  del metadata['source']
  del metadata['seq_num']
  return metadata

In [3]:
from langchain_community.document_loaders.json_loader import JSONLoader

# Load data from JSON file and extract metadata
loader = JSONLoader(
  file_path='../data/WhatsAppCleaned/WhatsAppCombined.json',
  jq_schema=".data[]",
  content_key='CONTEXTUALIZED_MESSAGE',
  text_content=True,
  metadata_func=extract_metadata,
)

documents = loader.load()

In [8]:
print(len(documents))
print(documents[0])

6731
page_content='Arya GJ ~ Bruv group chat of group chat 😂😂😂
Zareef Amyeen ~ since the big chat has lowk been saturated
Soham Raut ~ Yoo we can talk shit about everyone else letsgooo
Zareef Amyeen ~ @14255353544 can you let me in when i get there' metadata={'ID': '642b5c00-d37d-11ee-a561-e8861aa0137a', 'MSG_ID': 'WA Hamza homies 0', 'DATETIME': '2024-02-24T17:30:00.000', 'MESSAGE': 'Bruv group chat of group chat 😂😂😂', 'SENDER': 'Arya GJ', 'PLATFORM': 'WhatsApp', 'CHAT': 'Hamza homies'}


In [5]:
# from langchain_text_splitters import CharacterTextSplitter
#
# # Split the documents into chunks for embedding
# CHUNK_SIZE = 10000
# text_splitter = CharacterTextSplitter(
#   chunk_size=CHUNK_SIZE,
#   chunk_overlap=CHUNK_SIZE//5,
# )
# docs = text_splitter.split_documents(documents)

docs = documents

###  Import Libraries and enviornment variables

In [9]:
### LLMs
from langchain_ollama import OllamaEmbeddings, ChatOllama

LLAMA_3B_NAME = 'llama3.2'
DEEPSEEK_1_5B_NAME = 'deepseek-r1:1.5b'

big_llm = ChatOllama(model=LLAMA_3B_NAME)
small_llm = ChatOllama(model=DEEPSEEK_1_5B_NAME)

embed_model = OllamaEmbeddings(model=LLAMA_3B_NAME)

### Create Vectorstore

In [10]:
from langchain_community.vectorstores.timescalevector import TimescaleVector
import os

from dotenv import load_dotenv
load_dotenv()

COLLECTION_NAME = 'timescale_WA_v1'

# Create a Timescale Vector instance from the collection of documents
db = TimescaleVector.from_documents(
  embedding=embed_model,
  ids=[doc.metadata["ID"] for doc in docs],
  documents=docs,
  collection_name=COLLECTION_NAME,
  service_url=os.environ['TIMESCALE_SERVICE_URL'],
  time_partition_interval=timedelta(days=7),
)

### Read Vectorstore

In [21]:
db = TimescaleVector(
    collection_name=COLLECTION_NAME,
    service_url=os.environ['TIMESCALE_SERVICE_URL'],
    embedding=embed_model,
)

# db.create_index(index_type="tsv")

### Question

In [22]:
question = "What were the last 5 ToDos papa gave?"

### Retrieve docs from DB + Add Additional Context

In [23]:
from datetime import datetime
start_dt = datetime(2025, 1, 1)  # Start date = Jan 1, 2025
end_dt = datetime.now() # End date = 30 August 2023, 22:10:35
td = timedelta(days=7)  # Time delta = 7 days

# Set timescale vector as a retriever and specify start and end dates via kwargs
retriever = db.as_retriever(
  search_type="similarity",
  search_kwargs={"start_date": start_dt, "end_date": end_dt, 'k': 10}
)

In [24]:
docs = retriever.invoke(question)

In [25]:
docs[0].metadata['MSG_ID']

'WA Saxena Family 2345'

In [26]:
def retrieve_more_context(msg_id, platform, chat, n_addl_msgs=10):
  """
  Given a message with ID `msg_id`, get the `addl_msgs` preceding and following messages for context

  :param msg_id: the ID of a retrieved message
  :param platform: the platform of the retrieved message
  :param chat: the chat of the retrieved message
  :param n_addl_msgs: number of additional messages before and after msg `msg_id` to retrieve
  :return: a string
  """
  msg_info = data[data['MSG_ID'] == msg_id]

  chat_hist = data[
      (data['PLATFORM'] == platform) &
      (data['CHAT'] == chat)
  ]

  context_lo = max(chat_hist.index[0], msg_info.index[0] - n_addl_msgs)
  context_hi = min(chat_hist.index[-1], msg_info.index[0] + n_addl_msgs)

  within_context_df = data.iloc[context_lo:context_hi, :].copy()
  within_context_df['VERBOSE'] = within_context_df['PLATFORM'] + ' : ' + within_context_df['CHAT'] + '\t' + within_context_df['DATETIME'].dt.strftime('%A %B %d, %Y %H:%M') + '\t' + within_context_df['SENDER'] + ' ~ ' + within_context_df['MESSAGE']

  return within_context_df['VERBOSE'].str.cat(sep='\n')

In [27]:
fuller_context = [(doc.metadata['MSG_ID'], retrieve_more_context(doc.metadata['MSG_ID'], doc.metadata['PLATFORM'], doc.metadata['CHAT'])) for doc in docs]

### Filter Docs w/ LLM

In [29]:
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call
structured_llm_grader = big_llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

In [45]:
docs_to_use = []

for (msg_id, msg_context) in fuller_context:
    print(msg_context, '\n', '-'*50)
    res = retrieval_grader.invoke({"question": question, "document": msg_context})
    print(res,'\n\n\n')
    if res.binary_score == 'yes':
        docs_to_use.append({'MSG_ID' : msg_id, 'FULL_CONTEXT' : msg_context})

WhatsApp : Saxena Family	Friday January 03, 2025 00:26	Ma ~ Signing off
See you all in SFO
WhatsApp : Saxena Family	Friday January 03, 2025 01:17	Papa ~ Landed in Istanbul
Cleared final security at the departure gate for SFO
<Media omitted>
See you both in SFO
WhatsApp : Saxena Family	Friday January 03, 2025 16:26	Ma ~ Nice
Reached sfo
WhatsApp : Saxena Family	Friday January 03, 2025 17:37	Papa ~ Just landed
How are you doing?
How is K Bhai
WhatsApp : Saxena Family	Friday January 03, 2025 17:48	Krishna Saxena ~ Fine. We are waiting inside the airport across from Uber
WhatsApp : Saxena Family	Friday January 03, 2025 17:48	Papa ~ Ok it will be a while
Still taxing
WhatsApp : Saxena Family	Friday January 03, 2025 18:29	Ma ~ Where are you guys now
WhatsApp : Saxena Family	Friday January 03, 2025 18:29	Papa ~ Baggage
Got 1 of 2
WhatsApp : Saxena Family	Friday January 03, 2025 18:48	Aditi ~ what door
we got both bags
WhatsApp : Saxena Family	Friday January 03, 2025 18:48	Krishna Saxena ~ Whe

### Generate Result

In [46]:
from langchain_core.output_parsers import StrOutputParser

# Prompt
system = """You are an assistant for question-answering tasks. Answer the question based upon your knowledge.
Use three-to-five sentences maximum and keep the answer concise."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
    ]
)

# Post-processing
def format_docs(docs):
    return "\n".join(f"<doc{i+1}>:\nSource:{doc['MSG_ID']}\nContent:{doc['FULL_CONTEXT']}\n</doc{i+1}>\n" for i, doc in enumerate(docs))

# Chain
rag_chain = prompt | big_llm | StrOutputParser()

# Run
generation = rag_chain.invoke({"documents":format_docs(docs_to_use), "question": question})
print(generation)

Based on the WhatsApp conversation between Krishna Saxena and his parents, I was able to identify the following 5 To-Do tasks given by Papa:

1. **Focus on coursework**: After completing college applications, focus on course work.
2. **Job hunting with batchmates**: Collaborate with batchmates (and seniors) for preparing job interviews and attempting to land a job.
3. **Follow up on internship possibility in Europe**: Send an email to Deepa's friend who had gotten Krishna an opportunity for an internship in Europe, to see what they have to say.
4. **Ensure paper gets accepted** (likely a reference to Krishna's undergraduate thesis or capstone project).
5. (Implicitly mentioned) Take care of himself and manage stress: Papa expressed concern about Krishna being under stress and suggested that he make a short list of things he has to do to finish his undergrad program, implying that he should prioritize self-care and not put too much pressure on himself.


### Check for Hallucinations

In [47]:
# Data model
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in 'generation' answer."""

    binary_score: str = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

# LLM with function call
structured_llm_grader = big_llm.with_structured_output(GradeHallucinations)

# Prompt
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
    Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n <facts>{documents}</facts> \n\n LLM generation: <generation>{generation}</generation>"),
    ]
)

hallucination_grader = hallucination_prompt | structured_llm_grader

response = hallucination_grader.invoke({"documents": format_docs(docs_to_use), "generation": generation})
print(response)

None


### Highlight Used Docs

In [59]:
from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

# Data model
class HighlightDocuments(BaseModel):
    """Return the specific part of a document used for answering the question."""

    Source: List[str] = Field(
        ...,
        description="List of alphanumeric ID of docs used to answers the question"
    )
    Content: List[str] = Field(
        ...,
        description="List of complete conversation contexts that answers the question"
    )
    Segment: List[str] = Field(
        ...,
        description="List of pointed, direct segments from used documents that answer the question"
    )

# parser
parser = PydanticOutputParser(pydantic_object=HighlightDocuments)

# Prompt
system = """You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to
generate the given answer. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text
in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't used the specific document don't mention it.

Used documents: <docs>{documents}</docs> \n\n User question: <question>{question}</question> \n\n Generated answer: <answer>{generation}</answer>

<format_instruction>
{format_instructions}
</format_instruction>
"""


prompt = PromptTemplate(
    template= system,
    input_variables=["documents", "question", "generation"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Chain
doc_lookup = prompt | big_llm | parser

# Run
lookup_response = doc_lookup.invoke({"documents":format_docs(docs_to_use), "question": question, "generation": generation})

In [60]:
for id, source, segment in zip(lookup_response.Source, lookup_response.Content, lookup_response.Segment):
    print(f"ID: {id}\nSource: {source}\nText Segment: {segment}\n")

ID: doc1
Source: Answer 1
Text Segment: Task 1

ID: doc2
Source: Answer 2
Text Segment: Task 2

