In [99]:
import pandas as pd
import re   

In [101]:
chat_file = "WhatsApp Chat with Haddu.txt"

# A chat entry looks like "dd/mm/yy, hh:mm - Sender: text".
message_pattern = re.compile(r"(\d{2}/\d{2}/\d{2}, \d{2}:\d{2}) - (.+?): (.+)")
# Any line that merely starts with the timestamp prefix begins a new entry,
# even when it is not a "Sender: text" message.
timestamp_pattern = re.compile(r"\d{2}/\d{2}/\d{2}, \d{2}:\d{2} - ")

chat_messages = []
users = []  # not used in the visible cells; kept in case a later cell reads it

with open(chat_file, "r", encoding="utf-8") as file:
    chat_data = file.readlines()

# Entry that continuation lines should be appended to (None = nothing to extend).
last_entry = None
for line in chat_data:
    match = message_pattern.match(line)
    if match:
        last_entry = {'Sender': match.group(2), 'Message': match.group(3)}
        chat_messages.append(last_entry)
    elif timestamp_pattern.match(line):
        # Timestamped line without "Sender: text" (presumably a system notice):
        # skipped as before, and it must not receive continuation lines.
        last_entry = None
    elif last_entry is not None:
        # Line without a timestamp: the rest of a multi-line message. Previously
        # these lines were dropped, truncating the message to its first line.
        continuation = line.rstrip("\r\n")
        if continuation:
            last_entry['Message'] += "\n" + continuation

# Explicit columns keep 'Sender'/'Message' present even if nothing was parsed.
data = pd.DataFrame(chat_messages, columns=['Sender', 'Message'])
data.head(10)

Unnamed: 0,Sender,Message
0,Haddu,Time vasthe cheppesthanuu
1,SINHA,Ha anthey
2,Haddu,Tq so much na Sodhi antha opika ga vinnanduku
3,Haddu,Suggestions kosam kuda
4,Haddu,Chala rojulu iyyindi ila evaritho iyna matladii
5,SINHA,😅 tnx to my playlist
6,Haddu,Evarina listener unte bagunnu anipisthundi e m...
7,Haddu,Nv dorikavv
8,Haddu,Balii
9,Haddu,🤣


Data Preprocessing

In [None]:
# Lower-case every message.
lowercased_messages = data["Message"].str.lower()
data["Message"] = lowercased_messages

# Write the frame to disk without the positional index column.
data.to_csv("chat_data.csv", index=False)

In [None]:
# Reload the saved chat from disk.
# dtype=str together with keep_default_na=False stops pandas from converting
# messages whose entire text is e.g. "nan", "null" or "n/a" into missing values.
data = pd.read_csv("chat_data.csv", dtype=str, keep_default_na=False)
data

Unnamed: 0,Sender,Message
0,Sowmya,<media omitted>
1,Sowmya,😂🤣🤣🤣😂 idhe
2,SINHA,oh ok ardam aihindi
3,Sowmya,hmm
4,Sowmya,hlo
...,...,...
7548,SINHA,..
7549,SINHA,hi
7550,SINHA,you deleted this message
7551,SINHA,<media omitted>


In [None]:
# Make sure every message is a Python string before it is tokenized.
data = data.astype({'Message': str})

Tokenizing Messages

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Build the word index from all messages (the tokenizer is set to lower-case text).
messages = data['Message']
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(messages)

# Map each message to its sequence of word ids, then pad the sequences.
X = pad_sequences(tokenizer.texts_to_sequences(messages))



Convert sender labels to numerical values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the sender names, then encode them as integer class ids.
senders = data['Sender']
label_encoder = LabelEncoder()
y = label_encoder.fit(senders).transform(senders)

Splitting the Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

LSTM Model

In [None]:
# Width of the padded sequence matrix.
max_length = X.shape[1]
# One more than the number of distinct words known to the tokenizer
# (presumably because Keras word ids start at 1, leaving 0 for padding).
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Embedding -> LSTM -> single sigmoid unit.
# NOTE(review): one sigmoid output can only separate two classes, so this
# presumably assumes the chat has exactly two senders - confirm.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Train the Model

In [None]:
# Train for 10 epochs in batches of 64.
# NOTE(review): the test split is also used as validation data here.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25cdb03b130>

Model Evaluation

In [None]:
# evaluate() yields the loss followed by the metric requested at compile time.
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print("Model Accuracy:", accuracy)

Model Accuracy: 0.8093977570533752


Saving the Model

In [None]:
import pickle

# Serialise the trained model to disk. The context manager closes (and thereby
# flushes) the file handle even if pickling raises; the previous bare open()
# left the handle open.
# NOTE(review): Keras provides model.save(...) for persisting models; pickle is
# kept so anything that already loads './model1.sav' with pickle keeps working.
with open('./model1.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...layers\lstm
......vars
...layers\lstm\cell
......vars
.........0
.........1
.........2
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-12-25 23:00:58         2210
metadata.json                                  2023-12-25 23:00:58           64
variables.h5                                   2023-12-25 23:00:58      7271480


Model Testing

In [None]:
import numpy as np

# One-line message whose sender should be predicted.
new_message = "time vasthe cheppesthanu"

# Encode the message with the fitted tokenizer and pad it to the training width.
new_seq = tokenizer.texts_to_sequences([new_message])
new_pad = pad_sequences(new_seq, maxlen=max_length)

# The model emits one score per row; round it to a 0/1 class id.
prediction = model.predict(new_pad)
pred_label = int(np.round(prediction[0][0]))

# Map the class id back to the sender's name.
(predicted_sender,) = label_encoder.inverse_transform([pred_label])

print(f"Predicted Sender: {predicted_sender}")


Predicted Sender: Sowmya
