In [1]:
import re
import pandas as pd

def process_chat(file_path):
    """Parse a WhatsApp chat export and gather each sender's text into one string.

    A line of the form ``<date>, <time> AM/PM - <sender>: <message>`` starts a
    new message. Lines that do not start with a timestamp are treated as
    continuations of the previous message (multi-line messages), so their text
    is kept rather than silently dropped. Messages whose text is exactly
    ``<Media omitted>`` are skipped, together with any continuation lines.

    Parameters
    ----------
    file_path : str or path-like
        Path to the UTF-8 encoded ``.txt`` chat export.

    Returns
    -------
    pandas.DataFrame
        One row per sender, in order of first appearance, with the columns
        ``'Name'`` and ``'Spoken Text'`` (all of that sender's message text
        joined by single spaces).
    """
    stamp = r"\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\s?[APMapm]{2}"
    message_re = re.compile(rf"({stamp}) - ([^:]+): (.+)")
    # Any timestamped entry, including ones without a "sender: text" part.
    entry_re = re.compile(rf"{stamp} - ")

    parts_by_sender = {}
    # List that continuation lines are appended to; None while the previous
    # entry was skipped or was not a sender message.
    current_parts = None

    # 'utf-8-sig' decodes plain UTF-8 identically but also strips a leading
    # BOM, which would otherwise stop the first line from matching.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            match = message_re.match(line)
            if match:
                _timestamp, sender, message = match.groups()
                if message == "<Media omitted>":
                    current_parts = None
                    continue
                current_parts = parts_by_sender.setdefault(sender, [])
                current_parts.append(message)
            elif entry_re.match(line):
                # Timestamped line that is not a sender message: do not
                # attribute the lines that follow it to the previous sender.
                current_parts = None
            elif current_parts is not None:
                continuation = line.strip()
                if continuation:
                    current_parts.append(continuation)

    # Join once per sender instead of growing a string message by message.
    rows = [(sender, " ".join(parts)) for sender, parts in parts_by_sender.items()]
    chat_df = pd.DataFrame(rows, columns=['Name', 'Spoken Text'])

    return chat_df


In [3]:
# Parse the "Seniors" group export into per-sender text and display it.
seniors_chat = process_chat('/content/WhatsApp Chat with Seniors 🔱⚜️.txt')
seniors_chat

Unnamed: 0,Name,Spoken Text
0,Gautam IIT Patna,Hlo ? Dm aaya h 😅💀 Pakka ? Yrr ye sb touch nhi...
1,Anurag IITP,Link open kr Ya right Ha pooch ye kya hai Haa ...
2,Shubh,Abe madrchodod @919326760945 Wo glitch ho gya ...
3,AI,This message was deleted Here are various mean...
4,saurav BHAI IITP,Ka huwa ??? Naye bata nhi hai iske liye Bhaut ...


In [4]:
# Parse the "D6-004" group export into per-sender text and display it.
d6_chat = process_chat('/content/WhatsApp Chat with D6-004.txt')
d6_chat

Unnamed: 0,Name,Spoken Text
0,Gautam IIT Patna,Kon kon jinda h 💀 Haa lakin maira anurag name ...
1,Anurag IITP,Abhi aapke papa jinda hai To tension na le bet...
2,+91 63536 45277,betichod sab jinda hai .. @916202236461 bhai i...
3,Shubh,🥵 Bol very chod Mai maru ? 🫦 Video call kre ? ...
4,Ritu IITP,Madharchod gaand maar lenge Tu randi tohar kan...
5,+91 72092 87481,Gud morning sir 👀🌝 Bewda 🙃 Tuuu Randibaaz 😂 To...
6,Akshay IITP,Jiohhhhh bhai Dil khus krdiya Reply kya kiya b...
7,Shashi IITP,RRR Means Ritu Raj Randi Iss level ke madharch...
8,Akash IITP,Rituraj madharchod Shashi This message was del...
9,saurav BHAI IITP,RR randi Recorded dekho ga Ha 1.5x Pe Kp sir h...


In [8]:
# Parse the third group export into per-sender text and display it.
hoore_chat = process_chat('/content/WhatsApp Chat with 72 hoore 2.0.txt')
hoore_chat

Unnamed: 0,Name,Spoken Text
0,Harry IITP,ladki patwa do yaar koi @919919900178 vai ladk...
1,+91 89870 80950,👀 Halwa hai kya bhai Maine kya Kiya bhai Ye as...
2,Aryan Vimro IITP,Paisa kamao vai Ek raand Teeno rakh lo Bandi k...
3,Shubh,😂 Kaha hai ? 😈 🙈 ise kahi to dekha hai .. Wo a...
4,Saaapaaaaraaaash IITP,Vai tu itta bada kaise ho gaya ✂️ @91879723431...
5,Meta AI,"Mujhe khed hai, par main aapko campus immersio..."
6,Ritu IITP,@13135550002 haan anuvad de? Beta gaand maar l...
7,Chandan IIT Patna,@13135550002 @916202236461 ke kitne baap Tmhar...
8,Akshay IITP,@918002007238 bhai tum bch ke rho I'd deta hu ...
9,Prachi Bro IITP,+1 Guys link bhejna us video ki 🫠 Bencho ye pd...


In [9]:
# Parse the "Area 51" group export into per-sender text and display it.
area_51_chat = process_chat('/content/WhatsApp Chat with Area 51 👽(only8).txt')
area_51_chat

Unnamed: 0,Name,Spoken Text
0,Rajeev IITP,Discussion mai aao Usmai bakchodi kartey hai S...
1,Alok Chaudhary iitp,Naa Woo khud bakchod group hai aur uska member...
2,Alok Bro IITP,Main group me pucho na class kb se start hoga ...
3,Aryan Aman IITP,Gaanja pukha hua h <This message was edited> H...
4,+91 62034 51610,https://youtube.com/shorts/vuiJmIA5CqA?si=N0Wl...
5,Omjee Bhai IITP,4 month kafi hai yaha 🙃 G@n.de do ushko <This ...
6,Gautam kumar Jha IITP,Saala null @917545968228 bhai Gandi maidan to ...
7,Shubh,🥺 He is no more Subah accident se death ho gai...
8,Meta AI,Here's a birthday message for the person: This...


In [10]:
def combine_chats(dfs):
    """Merge several per-chat frames into one row per participant.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each have ``'Name'`` and ``'Spoken Text'`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``'Name'``; the ``'Spoken Text'`` values of rows
        sharing a name are joined with single spaces.
    """
    return (
        pd.concat(dfs, ignore_index=True)
        .groupby('Name', as_index=False)
        .agg({'Spoken Text': ' '.join})
    )


In [11]:
# Merge the four parsed chats so each participant appears exactly once.
dfs = [
    seniors_chat,
    d6_chat,
    hoore_chat,
    area_51_chat,
]
combined_df = combine_chats(dfs)
combined_df

Unnamed: 0,Name,Spoken Text
0,+91 62034 51610,https://youtube.com/shorts/vuiJmIA5CqA?si=N0Wl...
1,+91 63536 45277,betichod sab jinda hai .. @916202236461 bhai i...
2,+91 72092 87481,Gud morning sir 👀🌝 Bewda 🙃 Tuuu Randibaaz 😂 To...
3,+91 73929 96322,Hey bhagwan 🤦 Bhai kisi bhi session me bachodi...
4,+91 87576 31065,Ritu tere sath ye hona chahiye Ye XNXX HOTA PO...
5,+91 89870 80950,👀 Halwa hai kya bhai Maine kya Kiya bhai Ye as...
6,AI,This message was deleted Here are various mean...
7,Akash IITP,Rituraj madharchod Shashi This message was del...
8,Akshay IITP,Jiohhhhh bhai Dil khus krdiya Reply kya kiya b...
9,Alok Bro IITP,Main group me pucho na class kb se start hoga ...


In [12]:
# Readable labels for the senders that show up only as phone numbers.
new_names = {
    '+91 62034 51610': 'alok_area_51_wala',
    '+91 63536 45277': 'manish_iitp',
    '+91 72092 87481': 'samir_iitp',
    '+91 73929 96322': 'hridyanand',
    '+91 87576 31065': 'ayush_iitp',
    '+91 89870 80950': 'aryan_d6_wala',
}

# Names without an entry in the mapping are left as they are.
relabelled_names = combined_df['Name'].replace(new_names)
combined_df['Name'] = relabelled_names

combined_df

Unnamed: 0,Name,Spoken Text
0,alok_area_51_wala,https://youtube.com/shorts/vuiJmIA5CqA?si=N0Wl...
1,manish_iitp,betichod sab jinda hai .. @916202236461 bhai i...
2,samir_iitp,Gud morning sir 👀🌝 Bewda 🙃 Tuuu Randibaaz 😂 To...
3,hridyanand,Hey bhagwan 🤦 Bhai kisi bhi session me bachodi...
4,ayush_iitp,Ritu tere sath ye hona chahiye Ye XNXX HOTA PO...
5,aryan_d6_wala,👀 Halwa hai kya bhai Maine kya Kiya bhai Ye as...
6,AI,This message was deleted Here are various mean...
7,Akash IITP,Rituraj madharchod Shashi This message was del...
8,Akshay IITP,Jiohhhhh bhai Dil khus krdiya Reply kya kiya b...
9,Alok Bro IITP,Main group me pucho na class kb se start hoga ...


In [13]:

# Save the combined per-participant text without the index column.
# NOTE(review): this writes to a path relative to the working directory, while
# the following cell reads '/content/chat_data.csv' — presumably the notebook
# runs with /content as its working directory; confirm.
combined_df.to_csv('chat_data.csv', index=False)

In [14]:
# Load the saved per-participant text back from disk and display it.
chat_data = pd.read_csv('/content/chat_data.csv')
chat_data

Unnamed: 0,Name,Spoken Text
0,alok_area_51_wala,https://youtube.com/shorts/vuiJmIA5CqA?si=N0Wl...
1,manish_iitp,betichod sab jinda hai .. @916202236461 bhai i...
2,samir_iitp,Gud morning sir 👀🌝 Bewda 🙃 Tuuu Randibaaz 😂 To...
3,hridyanand,Hey bhagwan 🤦 Bhai kisi bhi session me bachodi...
4,ayush_iitp,Ritu tere sath ye hona chahiye Ye XNXX HOTA PO...
5,aryan_d6_wala,👀 Halwa hai kya bhai Maine kya Kiya bhai Ye as...
6,AI,This message was deleted Here are various mean...
7,Akash IITP,Rituraj madharchod Shashi This message was del...
8,Akshay IITP,Jiohhhhh bhai Dil khus krdiya Reply kya kiya b...
9,Alok Bro IITP,Main group me pucho na class kb se start hoga ...


In [15]:

import pandas as pd

# Participants with less text than MIN_TEXT_CHARS are dropped; the text that
# is kept is cut to at most MAX_TEXT_CHARS characters.
MIN_TEXT_CHARS = 5000
MAX_TEXT_CHARS = 40000

chat_data = pd.read_csv('/content/chat_data.csv')

# Take an explicit copy of the filtered rows: assigning a column on a
# boolean-filtered slice is the chained-assignment pattern that triggers
# SettingWithCopyWarning.
has_enough_text = chat_data['Spoken Text'].str.len() >= MIN_TEXT_CHARS
chat_data = chat_data.loc[has_enough_text].copy()

chat_data['Spoken Text'] = chat_data['Spoken Text'].str[:MAX_TEXT_CHARS]

chat_data.to_csv('updated_chat_data.csv', index=False)

print("Updated chat data saved to 'updated_chat_data.csv'")

Updated chat data saved to 'updated_chat_data.csv'


In [17]:
# Load the filtered and truncated text back from disk and display it.
chat_data = pd.read_csv('/content/updated_chat_data.csv')
chat_data


Unnamed: 0,Name,Spoken Text
0,alok_area_51_wala,https://youtube.com/shorts/vuiJmIA5CqA?si=N0Wl...
1,manish_iitp,betichod sab jinda hai .. @916202236461 bhai i...
2,samir_iitp,Gud morning sir 👀🌝 Bewda 🙃 Tuuu Randibaaz 😂 To...
3,Akshay IITP,Jiohhhhh bhai Dil khus krdiya Reply kya kiya b...
4,Alok Bro IITP,Main group me pucho na class kb se start hoga ...
5,Alok Chaudhary iitp,Naa Woo khud bakchod group hai aur uska member...
6,Anurag IITP,Link open kr Ya right Ha pooch ye kya hai Haa ...
7,Aryan Aman IITP,Gaanja pukha hua h <This message was edited> H...
8,Aryan Vimro IITP,Paisa kamao vai Ek raand Teeno rakh lo Bandi k...
9,Gautam IIT Patna,Hlo ? Dm aaya h 😅💀 Pakka ? Yrr ye sb touch nhi...


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Word-count vectors for each participant's text, with the vocabulary capped
# at 17 terms and English stop words removed.
cv = CountVectorizer(max_features=17, stop_words='english')
term_counts = cv.fit_transform(chat_data['Spoken Text'])
vectors = term_counts.toarray()

# Pairwise cosine similarity between the participants' count vectors.
similarity = cosine_similarity(vectors)


In [55]:
# Index label of the first row whose Name is 'Shubh'.
chat_data.loc[chat_data['Name'] == 'Shubh'].index[0]

16

In [73]:
def check(name, top_n=4):
    """Print the participants whose similarity score to ``name`` is highest.

    Reads the module-level ``chat_data`` frame and ``similarity`` matrix.
    Row ``i`` of ``similarity`` is assumed to describe the ``i``-th row of
    ``chat_data`` (positional order).

    Parameters
    ----------
    name : str
        Value of the ``'Name'`` column to look up; the first matching row is
        used.
    top_n : int, default 4
        Number of other participants to print.

    Raises
    ------
    IndexError
        If no row of ``chat_data`` has this name.
    """
    # Look the row up by position, not by index label: ``similarity`` is
    # addressed positionally, so a label is only correct while ``chat_data``
    # carries a default 0..n-1 index.
    hits = (chat_data['Name'] == name).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"no participant named {name!r} in chat_data")
    position = int(hits[0])

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # a tie at the top score could otherwise hide a real match.
    others = [
        (other, score)
        for other, score in enumerate(similarity[position])
        if other != position
    ]
    ranked = sorted(others, key=lambda pair: pair[1], reverse=True)

    for other, similarity_score in ranked[:top_n]:
        matched_name = chat_data['Name'].iloc[other]
        print(f"{matched_name}: {similarity_score:.2f}")


In [75]:
# Print the participants ranked most similar to 'samir_iitp'.
check('samir_iitp')

Shubh: 0.97
Anurag IITP: 0.95
saurav BHAI IITP: 0.95
Omjee Bhai IITP: 0.94


In [77]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def loader(chat):
    """Parse one chat export and compute pairwise similarity between senders.

    Parameters
    ----------
    chat : str or path-like
        Path to the chat export, passed straight to ``process_chat``.

    Returns
    -------
    tuple
        ``(data, similarity)``: ``data`` is the frame returned by
        ``process_chat`` and ``similarity`` is the cosine-similarity matrix of
        the senders' word-count vectors, in the same row order as ``data``.
        Earlier versions computed both and returned nothing, so the results
        were unusable.

    Raises
    ------
    ValueError
        If the chat yields no senders, so there is nothing to vectorise.
    """
    data = process_chat(chat)
    if len(data) == 0:
        raise ValueError(f"no messages could be parsed from {chat!r}")

    # The vocabulary is capped at as many terms as there are senders.
    cv = CountVectorizer(max_features=len(data), stop_words='english')
    vectors = cv.fit_transform(data['Spoken Text']).toarray()
    similarity = cosine_similarity(vectors)
    return data, similarity

def check(name, top_n=4):
    """Print the participants whose similarity score to ``name`` is highest.

    Reads the module-level ``chat_data`` frame and ``similarity`` matrix.
    Row ``i`` of ``similarity`` is assumed to describe the ``i``-th row of
    ``chat_data`` (positional order).

    Parameters
    ----------
    name : str
        Value of the ``'Name'`` column to look up; the first matching row is
        used.
    top_n : int, default 4
        Number of other participants to print.

    Raises
    ------
    IndexError
        If no row of ``chat_data`` has this name.
    """
    # Look the row up by position, not by index label: ``similarity`` is
    # addressed positionally, so a label is only correct while ``chat_data``
    # carries a default 0..n-1 index.
    hits = (chat_data['Name'] == name).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"no participant named {name!r} in chat_data")
    position = int(hits[0])

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # a tie at the top score could otherwise hide a real match.
    others = [
        (other, score)
        for other, score in enumerate(similarity[position])
        if other != position
    ]
    ranked = sorted(others, key=lambda pair: pair[1], reverse=True)

    for other, similarity_score in ranked[:top_n]:
        matched_name = chat_data['Name'].iloc[other]
        print(f"{matched_name}: {similarity_score:.2f}")
