In [1]:
import json
import pandas as pd
from uuid import uuid4
from pprint import pprint
from pymongo import MongoClient, DESCENDING
from transformers import AutoTokenizer
from config import settings
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

collection_name = 'blocks'

connection_string = settings.MONGO_CONNECTION_STRING
client = MongoClient(connection_string)
print("Connected to MongoDB successfully!")

Connected to MongoDB successfully!


In [2]:
db = client.archive
docs = db[collection_name].find().sort({'created_at': -1})

In [3]:
df = pd.DataFrame(list(docs)[::-1])
df

Unnamed: 0,_id,content,created_at,creator,topic,main_thread_id
0,c3b41f3c-692d-4a27-9d98-04b1fa0919d0,Hi <@U066Q9JAU3B>\nNice to see you at our proj...,2024-04-11T07:49:45.714149,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
1,bfa16fbb-2201-4863-abfa-6cc9ec25ce33,"Hi <@UU43NJY8K>,\nThanks for inviting us.",2024-04-11T08:22:09.374109,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
2,21e66f4a-fbfc-4bc7-91d6-cc54ec427d57,Hi <@U066Q9JAU3B> <@U06855K24SE>\nDon't you mi...,2024-04-11T08:59:32.370529,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
3,effa84d1-cb39-4ea5-937d-95ea328afe5c,Works for me.,2024-04-11T09:02:28.000959,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
4,7e92acd1-4103-49d0-97db-1e4ebd891e69,"Hi Yogesh,\nPlease do not hesitate to share al...",2024-04-12T08:04:20.532329,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
...,...,...,...,...,...,...
245,2348069f-592d-4de9-94ee-136cee9b9907,"No, currently there is no overlap (I need to a...",2024-05-16T07:33:00.645309,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
246,6363572d-f176-4851-8592-d46b29c4f6ae,"without an overlapping window, we have no way ...",2024-05-16T11:24:58.207359,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c
247,da42b289-45b7-4e06-88e6-7b469a362188,"<@U066Q9JAU3B> Indeed, my recent experiments s...",2024-05-16T11:27:03.332109,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
248,ce942fb1-0179-4655-8d38-f9f41475af6d,"with the overlap, we can even increase the bat...",2024-05-16T11:30:49.711009,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c


In [5]:
grouped_df = df.groupby(['topic', 'main_thread_id'])['creator'].count().reset_index(name='nmessages_in_topic').sort_values(['nmessages_in_topic'],ascending=False)


stage_grouping = {'$group': {
    '_id': '$main_thread_id',
    'topic': {'$addToSet': '$topic'},
    'topic_messages': {'$push': '$content'},
}}

stage_sorting = {
    "$sort": { "main_thread_id": DESCENDING}
}

pipeline = [
    # stage_matching_ids,
    stage_sorting,
    # stage_projecting
    stage_grouping,
]

result = list(db[collection_name].aggregate(pipeline))
grouped_df = pd.DataFrame(list(result))
grouped_df['topic'] = [topic[0] for topic in grouped_df['topic']]
grouped_df['nmessages'] = grouped_df['topic_messages'].apply(lambda lst: len(lst))
grouped_df['topic_messages'] = grouped_df['topic_messages'].apply(lambda lst: '-' + '\n- '.join(lst))
grouped_df['nchars'] = grouped_df['topic_messages'].apply(lambda string: len(string))
grouped_df['ntokens'] = grouped_df['topic_messages'].apply(lambda string: len(tokenizer.encode(string)))
print(f"Average nchars per token is {grouped_df['nchars'].sum() / grouped_df['ntokens'].sum():.2f}")
print(f"Average ntokens per message is {grouped_df['ntokens'].sum() / grouped_df['nmessages'].sum():.2f}")

grouped_df.sort_values(['nmessages'],ascending=False, inplace=True)
grouped_df



# grouped_df['messages'] = []

Average nchars per token is 3.26
Average ntokens per message is 92.19


Unnamed: 0,_id,topic,topic_messages,nmessages,nchars,ntokens
37,074eb056-4,Testing and Integration of REST API,-Hi <@U066Q9JAU3B>. I sent you access to the G...,16,4817,1365
35,0bae991d-c,Discussion about project results and feedback,-hmm.. I am trying to wrap my head around this...,14,3581,920
4,1ca8322f-8,Googledocs and model deployment,"-Hello, here are some instructions: <https://d...",12,2748,757
18,acc7439c-b,Project Kickoff and Preparation,-Hi <@U066Q9JAU3B>\nNice to see you at our pro...,12,1749,582
32,3fcfc0ff-7,Discussion on models and server options,-ok. I will start with mistral first. If neede...,11,3394,988
3,e0c25915-0,Discussion on clustering algorithm and features,-<@U05B5FXLP62> you also talked about 'mention...,10,1692,455
17,48c47841-b,Discussion about Googles Meet link and code sh...,-in 10 minutes yes.\n- Hi <@U066Q9JAU3B> and <...,10,1324,524
21,66634bc9-a,Discussion about experimenting with AI models ...,-Hi <@U06RU2JKMDG> <@U05B5FXLP62> I did some e...,9,3045,860
1,cd486d1b-6,Project Development,"-Let me know if you face any issues.\n- Sure, ...",9,3653,1112
19,14c19771-4,Project Update REST API and development progress,"-Hi <@U066Q9JAU3B>, today we're redoing the RE...",9,10129,4107


In [25]:
df.groupby(['topic']).groups
# .count().reset_index(name='nmessages_in_topic').sort_values(['nmessages_in_topic'],ascending=False)

{'Discussion about Googles Meet link and code sharing': [90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'Discussion about additional metrics for evaluating similarity of messages': [103], 'Discussion about experimenting with AI models and scheduling a meeting': [51, 52, 54, 56, 59, 60, 67, 68, 69], 'Discussion about project results and feedback': [20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 33, 34, 36], 'Discussion on LLM and BERTopic performance': [80, 81, 82, 84, 85, 87, 88, 89], 'Discussion on Model Deployment and Evaluation': [101, 102, 104, 105, 106, 107, 108, 109], 'Discussion on clustering algorithm and features': [40, 41, 42, 43, 44, 45, 47, 48, 49, 50], 'Discussion on models and server options': [227, 228, 229, 230, 234, 235, 236, 237, 238, 239, 249], 'GitHub and MongoDB access': [189, 192, 198, 202, 203], 'Go programming and hybrid search': [220, 223, 224, 225], 'Going back and forth on the best approach for handling machine learning models': [240, 242, 243, 244, 245, 246, 247, 248

In [11]:
df

Unnamed: 0,_id,content,created_at,creator,topic,main_thread_id
0,c3b41f3c-692d-4a27-9d98-04b1fa0919d0,Hi <@U066Q9JAU3B>\nNice to see you at our proj...,2024-04-11T07:49:45.714149,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
1,bfa16fbb-2201-4863-abfa-6cc9ec25ce33,"Hi <@UU43NJY8K>,\nThanks for inviting us.",2024-04-11T08:22:09.374109,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
2,21e66f4a-fbfc-4bc7-91d6-cc54ec427d57,Hi <@U066Q9JAU3B> <@U06855K24SE>\nDon't you mi...,2024-04-11T08:59:32.370529,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
3,effa84d1-cb39-4ea5-937d-95ea328afe5c,Works for me.,2024-04-11T09:02:28.000959,U066Q9JAU3B,Project Kickoff and Preparation,acc7439c-b
4,7e92acd1-4103-49d0-97db-1e4ebd891e69,"Hi Yogesh,\nPlease do not hesitate to share al...",2024-04-12T08:04:20.532329,UU43NJY8K,Project Kickoff and Preparation,acc7439c-b
...,...,...,...,...,...,...
245,2348069f-592d-4de9-94ee-136cee9b9907,"No, currently there is no overlap (I need to a...",2024-05-16T07:33:00.645309,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
246,6363572d-f176-4851-8592-d46b29c4f6ae,"without an overlapping window, we have no way ...",2024-05-16T11:24:58.207359,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c
247,da42b289-45b7-4e06-88e6-7b469a362188,"<@U066Q9JAU3B> Indeed, my recent experiments s...",2024-05-16T11:27:03.332109,U05R0KHPU8N,Going back and forth on the best approach for ...,0a76a334-c
248,ce942fb1-0179-4655-8d38-f9f41475af6d,"with the overlap, we can even increase the bat...",2024-05-16T11:30:49.711009,U066Q9JAU3B,Going back and forth on the best approach for ...,0a76a334-c
