**Implementation**

Importing the required libraries

In [None]:
!pip install emoji
!pip install wordcloud
import re
import pandas as pd
import numpy as np
import emoji
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns


Define the file path, then open and read the chat file

In [None]:
# Path to the exported WhatsApp chat, relative to the notebook's working directory.
conversation = r"whatspp group chat txt file.txt"

# "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also discards a
# leading byte-order mark. Left in place, the mark would sit in front of the
# first timestamp, and a start-of-line timestamp check would not match that line.
with open(conversation, "r", encoding="utf-8-sig") as file:
    lines = file.readlines()

print(f"Total lines in chat file: {len(lines)}")
print("\nFirst 10 lines from the file:")
# Slicing stops at the end of the list, so short files need no length check.
for preview_line in lines[:10]:
    print(preview_line.strip())

Identify whether a line from the WhatsApp chat file starts with a timestamp, i.e. whether it begins a new message.

In [None]:
import re

# Timestamp prefix of an exported chat line: "D/M/Y, H:MM", an optional AM/PM
# marker, then " -". The separator before AM/PM may be a plain space, a
# no-break space (U+00A0) or a narrow no-break space (U+202F); some exports
# use the latter two instead of an ASCII space.
_TIMESTAMP_PATTERN = re.compile(
    r'^(\d{1,2})/(\d{1,2})/(\d{2,4}), (\d{1,2}):(\d{2})'
    r'[ \u00a0\u202f]?(AM|PM|am|pm)? -'
)


def date_time(s):
    """Tell whether ``s`` begins with a chat timestamp.

    Args:
        s: One line of text from the exported chat.

    Returns:
        True when the line starts with ``D/M/Y, H:MM`` (optionally followed
        by AM/PM) and then `` -``; False otherwise. Only the start of the
        line is inspected.
    """
    return _TIMESTAMP_PATTERN.match(s) is not None

# Preview: print each of the first ten raw lines next to the timestamp verdict.
for sample in lines[:10]:
    starts_message = date_time(sample)
    print(f"{sample.strip()} → {starts_message}")

In [None]:
def getMessage(line):
    """Break one chat line into ``(date, time, author, message)``.

    The line is cut at the first " - " into a timestamp and a remainder.
    The timestamp is cut at the first ", " into date and time, and the
    remainder at the first ": " into author and message.

    Args:
        line: A single line of chat text.

    Returns:
        A 4-tuple ``(date, time, author, message)``. ``author`` is None when
        the remainder has no ": ". All four items are None when the line has
        no " - " or the timestamp has no ", ".
    """
    no_match = (None, None, None, None)

    stamp, dash, remainder = line.partition(" - ")
    if not dash:
        return no_match  # no " - " separator anywhere in the line

    date, comma, time = stamp.partition(", ")
    if not comma:
        return no_match  # timestamp part is not "date, time"

    sender, colon, text = remainder.partition(": ")
    if colon:
        return date, time, sender, text

    # No "author: " prefix: the whole remainder is the message.
    return date, time, None, remainder

In [None]:
# Preview: parse the first ten raw lines and show the resulting tuples.
for sample in lines[:10]:
    parsed_sample = getMessage(sample)
    print(parsed_sample)

Extract structured message data from the WhatsApp chat file and store it in a list. Multiline messages are handled by appending each continuation line to the message whose timestamp and author precede it.

In [None]:
data = []            # one [date, time, author, message] row per chat message
messageBuffer = []   # text fragments of the message currently being assembled
date, time, author = None, None, None

for line in lines:
    line = line.strip()
    if not line:
        continue  # Skip empty lines

    # Only lines that start with a timestamp are parsed as a new message.
    parsed = getMessage(line) if date_time(line) else None

    if parsed is not None and parsed[3] is not None:
        # A new message begins: store the one collected so far, then start over.
        if messageBuffer:
            data.append([date, time, author, ' '.join(messageBuffer)])
        date, time, author, message = parsed
        messageBuffer = [message]
    elif date is not None:
        # Continuation of a multiline message. This branch also takes lines
        # that look like a timestamp but could not be split by getMessage,
        # so a None never reaches the join below.
        messageBuffer.append(line)
    # else: text that appears before the first timestamped message belongs to
    # no message and is skipped rather than stored with an empty date/author.

# Store the last message, which has no following timestamp line to trigger it.
if messageBuffer:
    data.append([date, time, author, ' '.join(messageBuffer)])

print(f"Total messages extracted: {len(data)}")
print(data[:5])  # Show first 5 extracted messages


Score the sentiment of the WhatsApp chat messages using NLTK's VADER sentiment analyzer.

In [None]:
# Convert the extracted rows into a pandas DataFrame
df = pd.DataFrame(data, columns=["Date", "Time", "Contact", "Message"])

# Ensure the data is clean
if df.empty:
    print("No messages extracted. Fix chat parsing first.")
else:
    # NOTE(review): no explicit format is given, so pandas infers the
    # day/month order of the date strings; confirm it matches the export's locale.
    df['Date'] = pd.to_datetime(df['Date'])
    # Drops every row with a missing field, including rows whose Contact is None.
    df.dropna(inplace=True)

# Initialize the sentiment analyzer. It needs the "vader_lexicon" resource;
# when that is not installed NLTK raises LookupError, so fetch it once and retry.
try:
    sentiments = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    sentiments = SentimentIntensityAnalyzer()

# Score every message once and reuse the result for all three columns.
scores = [sentiments.polarity_scores(text) for text in df["Message"].astype(str)]
for column, key in (("Positive", "pos"), ("Negative", "neg"), ("Neutral", "neu")):
    # Explicit index and dtype keep the assignment aligned after dropna and
    # keep the column numeric even when there are no rows.
    df[column] = pd.Series([score[key] for score in scores], index=df.index, dtype=float)

# Display the first 25 messages
pd.set_option('display.width', 200)  # Adjust width for better formatting
print(df.head(25).to_string(index=False))



In [None]:
# Sentiment visualization: mean Positive / Negative / Neutral score over all messages
fig, ax = plt.subplots(figsize=(10, 5))
sentiment_counts = df[["Positive", "Negative", "Neutral"]].mean()
# rot=0 keeps the category labels horizontal.
sentiment_counts.plot(kind="bar", color=["green", "red", "blue"], ax=ax, rot=0)
ax.set_title("Sentiment Analysis of Chat Messages")
ax.set_ylabel("Average Sentiment Score")
plt.show()