In [2]:
import re   # regular expression
import pandas as pd     # data processing, CSV file I/O (e.g. pd.read_csv)

In [4]:
data = open('chats/Chat1.txt', 'r', encoding='utf-8').read()    # read the data
data    # print the data

"05/10/23, 12:34\u202fpm - Rhea Sharma: Hey Neha! How's it going?\n\n05/10/23, 12:35\u202fpm - Neha Gupta: Hey Rhea! I'm good. Just working today. How about you? Any plans for the long weekend?\n\n05/10/23, 12:36\u202fpm - Rhea Sharma: Nothing much planned yet. We should do something fun though! Are you free?\n\n05/10/23, 12:37\u202fpm - Neha Gupta: Yes I'm free! We definitely should plan a little getaway trip somewhere. Any ideas?\n\n05/10/23, 12:39\u202fpm - Rhea Sharma: Hmm maybe we could go to Goa? The beaches would be nice this time of year.\n\n05/10/23, 12:41\u202fpm - Neha Gupta: Goa sounds great! We could relax on the beach and party a little at night haha. When should we go?\n\n05/10/23, 12:42\u202fpm - Rhea Sharma: Let's go this weekend! I can look for flights and hotels tonight.\n\n05/10/23, 12:44\u202fpm - Neha Gupta: Amazing, book it! Can't wait for a girls trip, we need a vacation!\n\n05/10/23, 5:21\u202fpm - Rhea Sharma: Hey, I booked our flights and hotel! We're all set

In [5]:
pattern = '\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m - '       # pattern to find the date and time
messages = re.split(pattern, data)[1:]      # split the data based on the pattern
dates = re.findall(pattern, data)       # find all the dates and time

df = pd.DataFrame({'user_message':messages, 'message_date': dates})     # create a dataframe with messages and dates
# converting the message_date to date time format
df['message_date'] = pd.to_datetime(df['message_date'], format="%d/%m/%y, %I:%M %p - ")         
df.rename(columns={'message_date': 'date'}, inplace=True)       # rename the column

# separate users and messages
users = []      
messages = []       
for message in df['user_message']:      # loop through the messages
    entry = re.split('([\w\W]+?):\s', message)      # split the message based on the user
    if entry[1:]:       # if the message is not empty
        users.append(entry[1])      # append the user
        messages.append(entry[2])       # append the message
    else:
        users.append('whatsapp notification')       # else append the notification
        messages.append(entry[0])       # append the message

df['user'] = users      # create a column with users
df['message'] = messages        # create a column with messages
df.drop(columns=['user_message'], inplace=True)     # drop the user_message column

df['year'] = df['date'].dt.year     # create a column with year
df['month_num'] = df['date'].dt.month       # create a column with month number
df['only_date'] = df['date'].dt.date        # create a column with date
df['day_name'] = df['date'].dt.day_name()    # create a column with day name
df['month'] = df['date'].dt.month_name()    # create a column with month name
df['day'] = df['date'].dt.day     # create a column with day
df['hour'] = df['date'].dt.hour     # create a column with hour
df['minute'] = df['date'].dt.minute    # create a column with minute

period = []     # create an empty list
for hour in df[['day_name', 'hour']]['hour']:   # loop through the hour
    if hour == 23:      # if hour is 23
        period.append(str(hour) + "-" + str('00'))      # append 23-00
    elif hour == 0:     # if hour is 0
        period.append(str('00') + "-" + str(hour + 1))      # append 00-1
    else:       # else
        period.append(str(hour) + "-" + str(hour + 1))      # append hour-hour+1

df['period'] = period       # create a column with period

In [6]:
df

Unnamed: 0,date,user,message,year,month_num,only_date,day_name,month,day,hour,minute,period
0,2023-10-05 12:34:00,Rhea Sharma,Hey Neha! How's it going?\n\n,2023,10,2023-10-05,Thursday,October,5,12,34,12-13
1,2023-10-05 12:35:00,Neha Gupta,Hey Rhea! I'm good. Just working today. How ab...,2023,10,2023-10-05,Thursday,October,5,12,35,12-13
2,2023-10-05 12:36:00,Rhea Sharma,Nothing much planned yet. We should do somethi...,2023,10,2023-10-05,Thursday,October,5,12,36,12-13
3,2023-10-05 12:37:00,Neha Gupta,Yes I'm free! We definitely should plan a litt...,2023,10,2023-10-05,Thursday,October,5,12,37,12-13
4,2023-10-05 12:39:00,Rhea Sharma,Hmm maybe we could go to Goa? The beaches woul...,2023,10,2023-10-05,Thursday,October,5,12,39,12-13
...,...,...,...,...,...,...,...,...,...,...,...,...
137,2023-10-27 17:36:00,Rhea Sharma,"Yes! Final checks on Sunday, can't wait! Europ...",2023,10,2023-10-27,Friday,October,27,17,36,17-18
138,2023-10-29 11:21:00,Rhea Sharma,"ONE more day!! See you tonight, ready for the ...",2023,10,2023-10-29,Sunday,October,29,11,21,11-12
139,2023-10-29 11:23:00,Neha Gupta,AHHH it's actually happening!! Counting down t...,2023,10,2023-10-29,Sunday,October,29,11,23,11-12
140,2023-10-30 09:34:00,Rhea Sharma,WE MADE IT! Bonjour Paris! First day here and ...,2023,10,2023-10-30,Monday,October,30,9,34,9-10


In [7]:
import torch

In [8]:
# summarization of chats using bert
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM   # import the tokenizer and model

checkpoint = "sshleifer/distilbart-cnn-12-6"    # checkpoint

tokenizer = AutoTokenizer.from_pretrained(checkpoint)   # load the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)       # load the model

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
tokenizer.model_max_length      # print the model max length

1024

In [10]:
tokenizer.max_len_single_sentence       # print the max length of single sentence

1022

In [11]:
#convert series to string
text = df['message'].str.cat(sep=' ')

In [12]:
import nltk    # import nltk
nltk.download('punkt')
sentences = nltk.sent_tokenize(text)

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [13]:
len(sentences)

364

In [14]:
length = 0     # initialize length
chunk = ""   # initialize chunk
chunks = []    # initialize chunks
count = -1   # initialize count

for sentence in sentences:  # loop through the sentences
    length += len(sentence)    # add the length of the sentence
    if length < 1024:   # if length is less than 1024
        chunk += sentence   # add the sentence to chunk
    else:
        chunks.append(chunk)    # append the chunk to chunks
        chunk = ""  # initialize chunk
        length = 0  # initialize length
        count += 1  # increment count
        chunk += sentence   # add the sentence to chunk

chunks.append(chunk)    # append the chunk to chunks

len(chunks)     # print the length of chunks

chunks

["Hey Neha!How's it going?Hey Rhea!I'm good.Just working today.How about you?Any plans for the long weekend?Nothing much planned yet.We should do something fun though!Are you free?Yes I'm free!We definitely should plan a little getaway trip somewhere.Any ideas?Hmm maybe we could go to Goa?The beaches would be nice this time of year.Goa sounds great!We could relax on the beach and party a little at night haha.When should we go?Let's go this weekend!I can look for flights and hotels tonight.Amazing, book it!Can't wait for a girls trip, we need a vacation!Hey, I booked our flights and hotel!We're all set for Goa this weekend.Yay!Thanks for booking everything.What time is our flight?And which hotel?Our flight is at 9am on Saturday morning.And the hotel is the Grand Hyatt in North Goa.Has great reviews!Sounds good!What should we pack?Beach clothes and party outfits I'm guessing?Yes, bikinis, cover ups, sundresses!And heels and cute tops for clubbing.Don't forget sunscreen!Got it.",
 "I'm so

In [15]:
# summarize the chunks
summaries = []
for chunk in chunks:
    input_ids = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True)   # encode the chunk
    output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)    # generate the summary
    summary = tokenizer.decode(output[0], skip_special_tokens=True)    # decode the summary
    summaries.append(summary)   # append the summary

# join the summaries
summary = ' '.join(summaries)   # join the summaries

summary

" Neha and Rhea are planning a girls trip to Goa this weekend. Neha booked their flights and hotel and booked their flight to North Goa. Rhea is looking forward to a long weekend away with her and Neha. She says she and her friends need a girls' getaway somewhere.  Goa 2023 is going to be amazing! I'm so excited, we're going to have the best time! We could book a boat cruise one day to see the islands and maybe parasailing. We'll plan the activities when we get there. Let's meet Friday evening to pack?Sounds good! I'll come to your place after work.  The Goan fish curry and vindaloo was the highlight of our beach day. We can try a bunch of local dishes. Let's take it easy in the morning after all that food. Maybe a boat cruise at noon? Maybe we can pack a picnic lunch from the hotel.  What's left on our list before we leave?Hm I think we've done everything! Maybe a chill beach evening today?Cocktails and sunset?That sounds so nice! I'll meet you on the beach around 5?Perfect! Can't wai

### Training Basic Bert

In [16]:
from transformers import BertTokenizer  # import the tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # load the tokenizer

def load_dataset(file_path):
    with open(file_path, 'r') as file:  # open the file
        lines = file.readlines()    # read the lines
    tokenized_lines = [tokenizer.encode(line) for line in lines]        # tokenize the lines
    return tokenized_lines    # return the tokenized lines

In [19]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(    # create a dataset
    tokenizer=tokenizer,    # tokenizer
    file_path="D:\Documents\ML Projects\WhatsApp Chat Analyzer\chats\Chat1.txt",    # file path
    block_size=128,    # block size
)

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(    # create a data collator
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15   # tokenizer, mlm, mlm probability
)

In [21]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    vocab_size=tokenizer.vocab_size,    # vocab size
)

model = BertForMaskedLM(config=config)  # create a model

In [22]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(      # create training arguments
    output_dir="./bert_from_scratch",       # output directory
    overwrite_output_dir=True,      # overwrite the content of the output directory
    num_train_epochs=1,     # number of training epochs
    per_device_train_batch_size=64,     # batch size for training
    save_steps=10_000,      # number of steps before saving          
    save_total_limit=2,     # limit the total amount of checkpoints
)

trainer = Trainer(      # create a trainer
    model=model,        # the instantiated 🤗 Transformers model to be trained
    args=training_args,     # training arguments, defined above
    data_collator=data_collator,    # data collator
    train_dataset=dataset,      # training dataset
)

In [None]:
# from accelerate import Accelerator    # import the accelerator
# trainer.train()   # train the model
# trainer.save_model('./bert_from_scratch')  # save the model

100%|██████████| 114/114 [2:05:33<00:00, 66.08s/it] 


{'train_runtime': 7533.5675, 'train_samples_per_second': 0.964, 'train_steps_per_second': 0.015, 'train_loss': 5.8988486842105265, 'epoch': 1.0}


In [23]:
from transformers import pipeline

model = BertForMaskedLM.from_pretrained('./bert_from_scratch')      # load the model


fill_mask = pipeline(       # create a pipeline
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

In [24]:
fill_mask('What are we [MASK] to do?')      # predict the masked word

[{'score': 0.09875480830669403,
  'token': 1013,
  'token_str': '/',
  'sequence': 'what are we / to do?'},
 {'score': 0.011641875840723515,
  'token': 1024,
  'token_str': ':',
  'sequence': 'what are we : to do?'},
 {'score': 0.008726079948246479,
  'token': 1012,
  'token_str': '.',
  'sequence': 'what are we. to do?'},
 {'score': 0.005796629935503006,
  'token': 1010,
  'token_str': ',',
  'sequence': 'what are we, to do?'},
 {'score': 0.00354193733073771,
  'token': 2340,
  'token_str': '11',
  'sequence': 'what are we 11 to do?'}]

### Using Bart from Facebook

In [25]:
# Load a different model and tokenizer
new_checkpoint = "facebook/bart-large-cnn"

new_tokenizer = AutoTokenizer.from_pretrained(new_checkpoint)
new_model = AutoModelForSeq2SeqLM.from_pretrained(new_checkpoint)


Downloading pytorch_model.bin: 100%|██████████| 1.63G/1.63G [07:41<00:00, 3.52MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)neration_config.json: 100%|██████████| 363/363 [00:00<00:00, 97.8kB/s]


Neha and Rhea are planning a girls trip to Goa for the long weekend. Neha booked their flights and hotel and they are all set for Goa this weekend. The hotel is the Grand Hyatt in North Goa and the flight is at 9am on Saturday. I'm so excited, we're going to have the best time!Should we book any activities?Hm we could book a boat cruise one day to see the islands.And maybe parasailing!We'll plan the activities when we get there.Let's meet Friday evening to pack?Sounds good!I'll come to your place after work around 6.Woohoo Goa 2023 here we come!!See you Friday :) Omg that street food was incredible!I ate way too much haha Me too, that was such a fun experience!The live music was so good as well.What's the plan for tomorrow?Let's take it easy in the morning after all that food.Maybe a boat cruise at noon?Boat cruise sounds perfect!Maybe we can pack a picnic lunch from the hotel.I'm in!Let's do it!We have to try the Goan fish curry and vind Last night was wild!My feet are killing me from

In [26]:
# Tokenize and generate summaries using the new model
new_summaries = []

for chunk in chunks:
    input_ids = new_tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True)
    output = new_model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
    summary = new_tokenizer.decode(output[0], skip_special_tokens=True)
    new_summaries.append(summary)

# Join the summaries
new_summary = ' '.join(new_summaries)

In [27]:
print(new_summary)

Neha and Rhea are planning a girls trip to Goa for the long weekend. Neha booked their flights and hotel and they are all set for Goa this weekend. The hotel is the Grand Hyatt in North Goa and the flight is at 9am on Saturday. I'm so excited, we're going to have the best time!Should we book any activities?Hm we could book a boat cruise one day to see the islands.And maybe parasailing!We'll plan the activities when we get there.Let's meet Friday evening to pack?Sounds good!I'll come to your place after work around 6.Woohoo Goa 2023 here we come!!See you Friday :) Omg that street food was incredible!I ate way too much haha Me too, that was such a fun experience!The live music was so good as well.What's the plan for tomorrow?Let's take it easy in the morning after all that food.Maybe a boat cruise at noon?Boat cruise sounds perfect!Maybe we can pack a picnic lunch from the hotel.I'm in!Let's do it!We have to try the Goan fish curry and vind Last night was wild!My feet are killing me from