In [1]:
import json
import pandas as pd
from uuid import uuid4
from pprint import pprint
from pymongo import MongoClient, DESCENDING
from transformers import AutoTokenizer
from langchain_text_splitters import CharacterTextSplitter
from config import settings
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

collection_name = 'blocks'

connection_string = settings.MONGO_CONNECTION_STRING
client = MongoClient(connection_string)
print("Connected to MongoDB successfully!")

Connected to MongoDB successfully!


## Original data

In [2]:
db = client.archive
docs = db[collection_name].find().sort({'created_at': -1})

In [3]:
df = pd.DataFrame(list(docs)[::-1])
df

Unnamed: 0,_id,content,created_at,creator,topic,main_thread_id
0,c3b41f3c-692d-4a27-9d98-04b1fa0919d0,Hi <@U066Q9JAU3B>\nNice to see you at our proj...,2024-04-11T07:49:45.714149,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
1,bfa16fbb-2201-4863-abfa-6cc9ec25ce33,"Hi <@UU43NJY8K>,\nThanks for inviting us.",2024-04-11T08:22:09.374109,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
2,21e66f4a-fbfc-4bc7-91d6-cc54ec427d57,Hi <@U066Q9JAU3B> <@U06855K24SE>\nDon't you mi...,2024-04-11T08:59:32.370529,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
3,effa84d1-cb39-4ea5-937d-95ea328afe5c,Works for me.,2024-04-11T09:02:28.000959,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
4,7e92acd1-4103-49d0-97db-1e4ebd891e69,"Hi Yogesh,\nPlease do not hesitate to share al...",2024-04-12T08:04:20.532329,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
...,...,...,...,...,...,...
245,2348069f-592d-4de9-94ee-136cee9b9907,"No, currently there is no overlap (I need to a...",2024-05-16T07:33:00.645309,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
246,6363572d-f176-4851-8592-d46b29c4f6ae,"without an overlapping window, we have no way ...",2024-05-16T11:24:58.207359,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c
247,da42b289-45b7-4e06-88e6-7b469a362188,"<@U066Q9JAU3B> Indeed, my recent experiments s...",2024-05-16T11:27:03.332109,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
248,ce942fb1-0179-4655-8d38-f9f41475af6d,"with the overlap, we can even increase the bat...",2024-05-16T11:30:49.711009,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c


## Group by topics,

In [4]:
grouped_df = df.groupby(['topic', 'main_thread_id'])['creator'].count().reset_index(name='nmessages_in_topic').sort_values(['nmessages_in_topic'],ascending=False)


stage_grouping = {'$group': {
    '_id': '$main_thread_id',
    'topic': {'$addToSet': '$topic'},
    'topic_messages': {'$push': '$content'},
}}

stage_sorting = {
    "$sort": { "main_thread_id": DESCENDING}
}

pipeline = [
    # stage_matching_ids,
    stage_sorting,
    # stage_projecting
    stage_grouping,
]

result = list(db[collection_name].aggregate(pipeline))
# result = db[collection_name].aggregate(pipeline)

In [28]:
topics = []
for topic_json in result:
    messages = '\n- '.join((message[:380].replace('/n', ' ') for message in topic_json['topic_messages']))
    topic = f'Topic: {topic_json["topic"][0]}\n\n- {messages}'
    topics.append(topic)

topic_messages = '\n\n\n'.join(topics)
# topic_messages = '~`~`~`~`'.join(topics)

In [29]:
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=1900, chunk_overlap=0, separator='\n\n\n'
    # tokenizer, chunk_size=1900, chunk_overlap=0, separator='~`~`~`~`'
)
texts = text_splitter.split_text(topic_messages)
list(map(len, list(map(tokenizer.encode, texts))))

[1873, 1870, 1769, 1383, 1656, 1793, 1764, 1491, 1108]

## Ensure all chunks begin with topic

In [30]:
for text in texts:
    print(text.split('\n')[0])

Topic: Go programming and hybrid search
Topic: Discussion on models and server options
Topic: Greetings and well wishes
Topic: None
Topic: Project Update  REST API and development progress
Topic: MongoDB integration and server api
Topic: Project Kickoff and Preparation
Topic: Meeting preparation and clustering project
Topic: Discussion about Googles Meet link and code sharing
