In [1]:
import pandas as pd

In [2]:
# Path to the exported WhatsApp chat. The export is plain text, not CSV;
# the variable name is kept unchanged for compatibility with other cells.
csv_file_path = r"whatsapp.txt"

# Parallel lists: index i of each list describes the i-th parsed message
dates = []
times = []
users = []
messages = []

# Each message line is expected to have the shape
#     <date>, <time> - <user>: <message>
# Lines that do not match this shape are skipped.
with open(csv_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split on the FIRST ' - ' only, so a message whose text itself
        # contains ' - ' is no longer discarded.
        parts = line.strip().split(' - ', 1)
        if len(parts) != 2:
            continue

        date_time = parts[0].split(', ')
        # Split on the FIRST ': ' only, so a message whose text contains
        # ': ' (e.g. "note: call me") keeps its full body.
        user_message = parts[1].split(': ', 1)

        # Keep the line only when both halves have the expected two fields
        if len(date_time) == 2 and len(user_message) == 2:
            dates.append(date_time[0])
            times.append(date_time[1])
            users.append(user_message[0])
            messages.append(user_message[1])

FileNotFoundError: [Errno 2] No such file or directory: 'whatsapp.txt'

In [None]:
# Assemble the parsed fields into one table, one row per message
column_names = ('Date', 'Time', 'User', 'Message')
column_values = (dates, times, users, messages)
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Preview the first rows
print(df.head())

In [None]:
# Total number of messages: the frame holds one row per message
len(df)

In [None]:
# Flatten every message into its whitespace-separated tokens
words = [token for text in df['Message'] for token in text.split()]

# Total number of words across all messages
print(len(words))


In [None]:
# Count messages whose whole text (ignoring surrounding whitespace)
# is exactly the media placeholder
is_media_placeholder = df['Message'].str.strip().eq('<Media omitted>')
int(is_media_placeholder.sum())


In [None]:
#Number of Links Shared
from urlextract import URLExtract

In [None]:
# Collect every URL found in any message into one flat list
extract = URLExtract()
links = [url for text in df['Message'] for url in extract.find_urls(text)]

# Number of links shared
print(len(links))

In [None]:
import matplotlib.pyplot as plt

# The five users with the most messages (value_counts is sorted descending)
top_users = df['User'].value_counts().head(5)

# Labels and sizes for the pie slices
user_names = top_users.index
msg_count = top_users.values

# Draw the pie chart
plt.figure(figsize=(4, 4))  # Adjust figure size as needed
patches, texts, autotexts = plt.pie(
    msg_count,
    labels=user_names,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
plt.title('Message Distribution by Users')
plt.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle

# Prefix each percentage label with the raw message count
for count, pct_label in zip(msg_count, autotexts):
    pct_label.set_text(f'{count} ({pct_label.get_text()})')

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

# Monthly message-count timeline: line chart plus a text report of the
# ten busiest and ten quietest months.
# Assuming df is the DataFrame containing the chat data

# Convert 'Date' column to datetime format.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# day/month order -- confirm this matches the locale of the chat export.
df['Date'] = pd.to_datetime(df['Date'])

# Keep only messages dated up to and including the current month
current_date = datetime.now()
current_year = current_date.year
current_month = current_date.month
in_earlier_year = df['Date'].dt.year < current_year
in_current_year_so_far = (df['Date'].dt.year == current_year) & (df['Date'].dt.month <= current_month)
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning on a filtered slice.
df = df[in_earlier_year | in_current_year_so_far].copy()

# Extract year, month number, and abbreviated month name (e.g., Jan, Feb)
df['year'] = df['Date'].dt.year
df['month_num'] = df['Date'].dt.month
df['month'] = df['Date'].dt.strftime('%b')

# Count messages per (year, month); month_num keeps the groups in calendar order
timeline = df.groupby(['year', 'month_num', 'month'])['Message'].count().reset_index()

# Label for the x-axis, e.g. "Jan-2023"
timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)

# Generate the line chart
plt.figure(figsize=(12, 6))
plt.plot(timeline['time'], timeline['Message'])
plt.title('Monthly Message Count Timeline')
plt.xlabel('Month')
plt.ylabel('Message Count')
plt.xticks(rotation='vertical')

# Annotate every point with its message count, slightly above the marker
for label, count in zip(timeline['time'], timeline['Message']):
    plt.annotate(f"{count}", xy=(label, count), xytext=(0, 5), textcoords='offset points', ha='center')

# Write the report
report = 'Line Chart Report\n\n'
report += 'Top 10 High Counts\n'
top_10_high = timeline.nlargest(10, 'Message')
for label, count in zip(top_10_high['time'], top_10_high['Message']):
    report += f"{label}\t{count} - Peak Messages\n"

report += '\nTop 10 Low Counts\n'
top_10_low = timeline.nsmallest(10, 'Message')
for label, count in zip(top_10_low['time'], top_10_low['Message']):
    report += f"{label}\t{count} - Low Messages\n"

# Add additional inferences or insights if needed
report += '\nInference:\n'
report += 'The line chart shows the trend of message counts over time. Peaks indicate periods of high message activity.\n'

# Print the report
print(report)

# Show the plot
plt.tight_layout()
plt.show()



In [None]:
# Bar chart of message counts per weekday, busiest first.

# Extract day of the week (e.g. "Monday") from the 'Date' column
df['day_of_week'] = df['Date'].dt.day_name()

# value_counts() already returns the counts sorted in descending order
busy_day = df['day_of_week'].value_counts()
plt.figure(figsize=(12, 6))
bars = plt.bar(busy_day.index, busy_day.values, color='purple')
plt.title("Busiest Day")
plt.xlabel("Day of the Week")
plt.ylabel("Message Count")
plt.xticks(rotation='vertical')

# Annotate the count on top of each bar. ha='center' centres the label on
# the bar; with the default left alignment the text starts at the bar's
# midpoint and appears shifted to the right.
for bar, count in zip(bars, busy_day.values):
    plt.text(bar.get_x() + bar.get_width() / 2, count, str(count), ha='center', va='bottom')

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Daily message-count timeline and top/bottom-10 reports.
# Requires the 'Date' column to already hold datetime values.
df['only_date'] = df['Date'].dt.date

# Number of messages per calendar day
daily_timeline = df.groupby('only_date')['Message'].count().reset_index()

# Line graph of the daily counts
plt.figure(figsize=(12, 6))
plt.plot(daily_timeline['only_date'], daily_timeline['Message'])
plt.title('Daily Message Count Timeline')
plt.xlabel('Date')
plt.ylabel('Message Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Report: the ten days with the most messages
top_peak = daily_timeline.nlargest(10, 'Message')
peak_lines = [
    f"{day.strftime('%d/%m/%Y')}\t{count} (Messages)\n"
    for day, count in zip(top_peak['only_date'], top_peak['Message'])
]
report_peak = 'Top 10 Peak Message Counts\n\n' + 'Date\t\tPeak Message Count\n' + ''.join(peak_lines)

# Report: the ten days with the fewest messages
top_low = daily_timeline.nsmallest(10, 'Message')
low_lines = [
    f"{day.strftime('%d/%m/%Y')}\t{count} (Messages)\n"
    for day, count in zip(top_low['only_date'], top_low['Message'])
]
report_low = 'Top 10 Lowest Message Counts\n\n' + 'Date\t\tLowest Message Count\n' + ''.join(low_lines)

# Print both reports
print(report_peak)
print(report_low)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Message activity by hour of the day.
# Assuming 'df' contains the DataFrame with WhatsApp chat data

# Parse the 'Time' column as a 12-hour clock time with an AM/PM marker
df['Time'] = pd.to_datetime(df['Time'], format='%I:%M %p')

# Extract hour (0-23) from the 'Time' column
df['Hour'] = df['Time'].dt.hour

# Count messages per hour. Reindex to all 24 hours so that hours with no
# messages are plotted as 0 instead of being skipped, which would make the
# line interpolate across them.
hourly_activity = (
    df.groupby('Hour')
    .size()
    .reindex(pd.RangeIndex(24, name='Hour'), fill_value=0)
)

# Plot the trend of user activity by hour
plt.figure(figsize=(12, 6))
hourly_activity.plot(marker='o', color='blue')
plt.title('User Activity by Hour of the Day')
plt.xlabel('Hour of the Day (24-hour format)')
plt.ylabel('Number of Messages')
plt.xticks(range(0, 24))
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of message counts per month name, busiest first.

# Full month name (e.g., January, February) from the 'Date' column.
# Note: this overwrites any existing 'month' column on df.
df['month'] = df['Date'].dt.strftime('%B')

# value_counts() already returns the counts sorted in descending order
busy_month = df['month'].value_counts()
plt.figure(figsize=(12, 6))
bars = plt.bar(busy_month.index, busy_month.values, color='blue')
plt.title("Busiest Month")
plt.xlabel("Month")
plt.ylabel("Message Count")
plt.xticks(rotation='vertical')

# Annotate the count on top of each bar. ha='center' centres the label on
# the bar; with the default left alignment the text starts at the bar's
# midpoint and appears shifted to the right.
for bar, count in zip(bars, busy_month.values):
    plt.text(bar.get_x() + bar.get_width() / 2, count, str(count), ha='center', va='bottom')

plt.show()


In [None]:
# Search the chat for messages containing a user-supplied word.

# Take input word from the user
input_word = input("Enter the word to search: ")

# Case-insensitive substring match.
# regex=False treats the input literally, so text such as "c++", "(" or "?"
# no longer raises or mis-matches by being interpreted as a regular expression.
# na=False treats missing messages as non-matching.
filtered_df = df[df['Message'].str.contains(input_word, case=False, regex=False, na=False)]

# Display the results
if filtered_df.empty:
    print("No messages found containing the input word.")
else:
    print("Messages containing the input word:")
    # Relies on 'Date' and 'Time' already being datetime values (strftime is called on both)
    for index, row in filtered_df.iterrows():
        print(f"Date: {row['Date'].strftime('%d/%m/%Y')}, Time: {row['Time'].strftime('%H:%M:%S')}, User: {row['User']}, Message: {row['Message']}")

    # Count total messages of each user containing the input word
    total_messages = filtered_df.groupby('User').size().reset_index(name='Total Messages')
    print("\nTotal messages of each user containing the input word:")
    print(total_messages)
