diff --git a/Whatsapp Chats Sentiment Analysis/README.md b/Whatsapp Chats Sentiment Analysis/README.md new file mode 100644 index 00000000..fc34ee8c --- /dev/null +++ b/Whatsapp Chats Sentiment Analysis/README.md @@ -0,0 +1,55 @@ +# WhatsApp Chats Sentiment Analysis + +## Overview +This project performs sentiment analysis on WhatsApp chat data using Natural Language Processing (NLP) techniques. The implementation extracts chat messages from a WhatsApp export file, processes the text data, and analyzes sentiment patterns using VADER (Valence Aware Dictionary and sEntiment Reasoner). + +## Features +- WhatsApp chat file parsing and message extraction +- Multiline message handling +- Sentiment analysis using NLTK's VADER +- Visualization of sentiment distribution (Positive, Negative, Neutral) + + +## Technologies Used +- Python 3 +- Pandas & NumPy for data manipulation +- NLTK for sentiment analysis +- Matplotlib & Seaborn for visualization +- Emoji library for special character handling +- Regular expressions for text parsing + +## Installation +1. Clone this repository +2. Install required packages: `pip install pandas numpy nltk matplotlib seaborn emoji wordcloud pillow` + + +## Usage +1. Export your WhatsApp chat as a .txt file (without media) +2. Place the file in the project directory +3. Update the file path in the notebook: +```python +conversation = r"your_whatsapp_chat.txt" +``` +4. Run the Jupyter notebook cells sequentially + +## Code Structure +1. **Data Extraction**: Parses WhatsApp chat format and extracts messages with metadata +2. **Data Cleaning**: Handles multiline messages and missing values +3. **Sentiment Analysis**: Uses VADER to compute positive, negative, and neutral scores +4. 
**Visualization**: Generates bar charts showing sentiment distribution + +## Results +The analysis provides: +- Average sentiment scores for the conversation +- Visual representation of sentiment distribution +- Message-level sentiment scoring + +## Note +- The implementation handles WhatsApp's specific date-time format and message structure +- Supports both 12-hour (AM/PM) and 24-hour time formats +- Properly processes messages with emojis and special characters +- Maintains message context across line breaks + +## Privacy +This tool processes chat data locally. No data is sent to external servers, ensuring your conversations remain private. + diff --git a/Whatsapp Chats Sentiment Analysis/Screenshot 2025-08-24 195928.png b/Whatsapp Chats Sentiment Analysis/Screenshot 2025-08-24 195928.png new file mode 100644 index 00000000..c74faf88 Binary files /dev/null and b/Whatsapp Chats Sentiment Analysis/Screenshot 2025-08-24 195928.png differ diff --git a/Whatsapp Chats Sentiment Analysis/Whatsapp Chats Sentiment Analysis.ipynb b/Whatsapp Chats Sentiment Analysis/Whatsapp Chats Sentiment Analysis.ipynb new file mode 100644 index 00000000..bb8c5086 --- /dev/null +++ b/Whatsapp Chats Sentiment Analysis/Whatsapp Chats Sentiment Analysis.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "571ba0f0-de49-4f1d-b0c5-2627d9575bc1", + "metadata": {}, + "source": [ + "**Implementation**" + ] + }, + { + "cell_type": "markdown", + "id": "4a14af81-8abc-41d1-8253-a0d4cebf8025", + "metadata": {}, + "source": [ + "Importing the required libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a3c082-7476-4b12-baf5-6172999ef0af", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install emoji\n", + "!pip install wordcloud\n", + "import re\n", + "import pandas as pd\n", + "import numpy as np\n", + "import emoji\n", + "from collections import Counter\n", + "import matplotlib.pyplot as plt\n", + "from PIL import Image\n", + "from wordcloud import 
# Timestamp prefix of an exported chat line: "<d>/<m>/<y>, <h>:<mm>[ AM|PM] -".
# Compiled once instead of on every call.  ``\s?`` (rather than a literal
# space) also accepts the narrow no-break space (U+202F) that some exports
# place between the minutes and AM/PM, so those lines are no longer mistaken
# for continuation lines.
_TIMESTAMP_RE = re.compile(
    r'^(\d{1,2})/(\d{1,2})/(\d{2,4}), (\d{1,2}):(\d{2})\s?(AM|PM|am|pm)? -'
)


def date_time(s):
    """Return True if ``s`` starts with a WhatsApp-style timestamp prefix.

    Both 12-hour ("9:05 PM") and 24-hour ("21:05") times are accepted; the
    prefix must be followed by " -".

    Parameters
    ----------
    s : str
        One line of the exported chat.

    Returns
    -------
    bool
    """
    return _TIMESTAMP_RE.match(s) is not None


def getMessage(line):
    """Split one timestamped chat line into its four fields.

    Parameters
    ----------
    line : str
        A line of the form "<date>, <time> - <author>: <message>" or
        "<date>, <time> - <message>" (no author).

    Returns
    -------
    tuple
        ``(date, time, author, message)``.  ``author`` is ``None`` when the
        text after the dash contains no ": " separator.  All four values are
        ``None`` when ``line`` does not start with a timestamp, so that
        continuation lines are never parsed into bogus fields.
    """
    match = _TIMESTAMP_RE.match(line)
    if match is None:
        return None, None, None, None  # Not a message header line

    # The pattern ends with " -": everything before those two characters is
    # "<date>, <time>", everything after is the (possibly empty) message part.
    datetime_part = line[:match.end() - 2]
    message_part = line[match.end():]
    if message_part.startswith(" "):
        message_part = message_part[1:]
    # A stripped line that ends right after the dash has an empty message;
    # returning "" (not None) keeps downstream ' '.join(...) calls safe.

    date, time = datetime_part.split(", ", 1)
    # Collapse any Unicode whitespace (e.g. U+202F before AM/PM) to a single
    # plain space and drop stray padding.
    time = " ".join(time.split())

    author, separator, message = message_part.partition(": ")
    if not separator:
        author, message = None, message_part  # No contact name found

    return date, time, author, message
# ---------------------------------------------------------------------------
# Message extraction
# Uses `lines`, `date_time` and `getMessage` defined in earlier cells.
# A line that starts with a timestamp opens a new message; any other
# non-empty line is a continuation of the message currently being buffered.
# ---------------------------------------------------------------------------
data = []
messageBuffer = []
date, time, author = None, None, None

for line in lines:
    line = line.strip()
    if not line:
        continue  # Skip empty lines

    if date_time(line):  # If it's a new message
        if messageBuffer:
            # Flush the previous message (header text + continuation lines).
            data.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
        date, time, author, message = getMessage(line)
        # getMessage may hand back None for the text; ' '.join() would raise
        # TypeError on it, so buffer an empty string instead.
        messageBuffer.append(message if message is not None else "")
    else:
        # Continuation line.  Lines seen before the first timestamp end up in
        # a row whose Date/Time/Contact are None and are dropped by dropna().
        messageBuffer.append(line)

if messageBuffer:
    data.append([date, time, author, ' '.join(messageBuffer)])

print(f"Total messages extracted: {len(data)}")
print(data[:5])  # Show first 5 extracted messages

# ---------------------------------------------------------------------------
# Build the DataFrame
# ---------------------------------------------------------------------------
df = pd.DataFrame(data, columns=["Date", "Time", "Contact", "Message"])

if df.empty:
    print("No messages extracted. Fix chat parsing first.")
else:
    # NOTE(review): the day/month order of the export depends on the phone's
    # locale; pass dayfirst=True here if the chat uses dd/mm/yy - confirm
    # against the actual file.
    df["Date"] = pd.to_datetime(df["Date"])
    # Rows with any missing field (e.g. no author) are removed.  Reassigning
    # a copy instead of inplace=True keeps the cell safe to re-run.
    df = df.dropna().copy()

# ---------------------------------------------------------------------------
# Sentiment scoring with VADER
# ---------------------------------------------------------------------------
# SentimentIntensityAnalyzer raises LookupError when the lexicon has not been
# downloaded yet, which breaks a fresh "Restart & Run All".
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon", quiet=True)

sentiments = SentimentIntensityAnalyzer()

# Score every message once and fan the result out into three columns
# (previously polarity_scores ran three times per message).
scores = df["Message"].astype(str).apply(sentiments.polarity_scores)
for column, key in (("Positive", "pos"), ("Negative", "neg"), ("Neutral", "neu")):
    df[column] = scores.map(lambda score, key=key: score[key])

# Display the first 25 messages
pd.set_option('display.width', 200)  # Adjust width for better formatting
print(df.head(25).to_string(index=False))

# ---------------------------------------------------------------------------
# Sentiment visualization (mean Positive / Negative / Neutral score)
# ---------------------------------------------------------------------------
sentiment_counts = df[["Positive", "Negative", "Neutral"]].mean()

if df.empty:
    print("No messages to plot.")
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    sentiment_counts.plot(kind="bar", color=["green", "red", "blue"], ax=ax)
    ax.set_title("Sentiment Analysis of Chat Messages")
    ax.set_ylabel("Average Sentiment Score")
    ax.tick_params(axis="x", rotation=0)
    plt.show()