In [None]:
import pandas as pd
import re
# Visualization
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio
from collections import Counter
from wordcloud import WordCloud
init_notebook_mode(True)
from wordcloud import WordCloud

In [None]:
def parse_file(text_file, output_path="../data/SS_chat.csv"):
    '''Convert the text of a WhatsApp chat export to a pandas DataFrame.

    Parameters
    ----------
    text_file : str
        Full contents of the exported chat log.
    output_path : str or None, optional
        Where to write a CSV copy of the parsed frame. Pass None to skip
        writing. Defaults to the path this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Columns 'timestamp' (datetime), 'users' (text before the first colon
        of an entry) and 'chats' (text after that colon, '' when the entry
        has no colon). Empty when no supported timestamp layout is found.
    '''
    # Each entry pairs the regex that splits the log on a timestamp with the
    # strptime directives for that layout's time part. `(?:^|\n)` lets the very
    # first line of the log count as a message too.
    split_formats = [
        # MM/DD/YY, HH:MM AM/PM
        (r'(?:^|\n)(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} [APap][mM]) -', '%I:%M %p'),
        # MM/DD/YY, HH:MM AM/PM separated by the unicode character \u202f
        (r'(?:^|\n)(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\u202f[APap][mM]) -', '%I:%M\u202f%p'),
        # MM/DD/YY, HH:MM (24-hour clock)
        (r'(?:^|\n)(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}) -', '%H:%M'),
    ]

    # Use the first layout that actually splits the text, and remember which
    # time format goes with it so the timestamps can be parsed afterwards.
    parsed_data, time_format = [text_file], None
    for pattern, candidate_format in split_formats:
        pieces = re.split(pattern, text_file)
        if len(pieces) != 1:
            parsed_data, time_format = pieces, candidate_format
            break

    # re.split yields [preamble, timestamp, entry, timestamp, entry, ...]
    parsed_data = [piece.strip() for piece in parsed_data]
    timestamps = parsed_data[1::2]
    entries = parsed_data[2::2]

    # Split each entry at the first colon only, so colons inside the message
    # text (URLs, times, ...) are kept. Entries without a colon get chats == ''.
    users, chats = [], []
    for entry in entries:
        user, _, chat = entry.partition(':')
        users.append(user)
        chats.append(chat)

    message_df = pd.DataFrame({'timestamp': timestamps, 'users': users, 'chats': chats})

    if time_format is None:
        # Nothing matched: there are no timestamps, just give the column a datetime dtype.
        message_df['timestamp'] = pd.to_datetime(message_df['timestamp'])
    else:
        # The split regex accepts 2- to 4-digit years; choose the directive from the first timestamp.
        four_digit_year = re.match(r'\d{1,2}/\d{1,2}/\d{4},', timestamps[0]) is not None
        date_format = '%m/%d/%Y' if four_digit_year else '%m/%d/%y'
        message_df['timestamp'] = pd.to_datetime(
            message_df['timestamp'], format=f'{date_format}, {time_format}'
        )

    if output_path is not None:
        message_df.to_csv(output_path, index=False)
    return message_df

In [None]:
# Load your file here.
# The encoding is given explicitly because the export contains non-ASCII text
# (e.g. the U+202F separator handled by parse_file), which the platform's
# default encoding may not be able to decode.
with open('../data/SS.txt', 'r', encoding='utf-8') as file:
    file_data = file.read()

# Parse the raw text into one row per chat entry and preview the result
df = parse_file(file_data)
df.head()

#### EDA

In [None]:
# Rows whose 'users' text matches one of these patterns are excluded from the
# analysis: "<name> added ...", "<name> created group" and "Your security code with ...".
regex_pattern = r'\w*\s*added .*|\w*\s*created group|Your security code with.*'

# Keep only the rows that do NOT match. `.copy()` makes viz_df an independent
# frame, so adding columns to it later neither touches `df` nor triggers
# pandas' SettingWithCopyWarning.
viz_df = df[~df['users'].str.contains(regex_pattern, regex=True, case=False)].copy()
viz_df.to_csv("../data/SS_viz_data.csv",index=False)

In [None]:
# Regular expression to match URLs that start with http://, https:// or "www.".
# The dot after "www" is escaped so it only matches a literal '.'; unescaped it
# matched any character, which made text such as "awwww so cute" count as a URL.
# Inside the character classes `$-_` is a range (ASCII 0x24-0x5F), which is what
# lets '/', ':', '?', '=' and similar characters be part of a URL.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Compiled once so the pattern is not re-parsed for every message
_URL_REGEX = re.compile(url_pattern)

def count_urls(text):
    '''Return the number of non-overlapping URL matches in `text`.

    Non-string input (for example a missing value) counts as zero URLs.
    '''
    if not isinstance(text, str):
        return 0
    return len(_URL_REGEX.findall(text))


In [None]:
import plotly.express as px

def _save_bar_chart(data, x, y, title, image_path):
    '''Draw one bar chart with the shared styling, write it to `image_path` and show it.'''
    fig = px.bar(data, x=x, y=y, title=title)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', title={"font": {"size": 15}})
    fig.update_yaxes(showgrid=True, gridcolor='#EFEFEF')
    pio.write_image(fig, image_path)
    fig.show()


def visualize_data(df):
    '''Plot, save and show the summary charts for a parsed chat frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'users', 'chats' (strings) and 'timestamp' (datetime).
        The frame is only read; the per-message URL and media flags are
        computed on a derived frame instead of being added to `df`.

    Each chart is written as a PNG under ../data/ and displayed with fig.show().
    '''
    # 1. Which author has the highest message count?
    message_count = df['users'].value_counts().reset_index()
    message_count.columns = ['users', 'Count']
    _save_bar_chart(message_count, 'users', 'Count',
                    'Who chatted the most', '../data/message_count.png')

    # Per-message helper columns: number of URLs, and a 0/1 flag for media placeholders
    flagged = df.assign(
        URL_Count=df['chats'].apply(count_urls),
        Media_Count=df['chats'].str.contains('<Media omitted>', regex=False, na=False).astype(int),
    )

    # 2. Which author sent the most URLs?
    url_count = flagged.groupby('users')['URL_Count'].sum().reset_index()
    _save_bar_chart(url_count, 'users', 'URL_Count',
                    'Who sent more web links', '../data/message_url_count.png')

    # 3. Which author has the most <Media omitted> messages?
    media_count = flagged.groupby('users')['Media_Count'].sum().reset_index()
    _save_bar_chart(media_count, 'users', 'Media_Count',
                    'Who sent more media files', '../data/media_count.png')

    # 4. Ratio of <Media omitted> to other text
    total_messages = flagged['chats'].count()
    media_messages = flagged['Media_Count'].sum()
    data = {'Type': ['<Media omitted>', 'Other Text'], 'Count': [media_messages, total_messages - media_messages]}
    df_ratio = pd.DataFrame(data)
    fig = px.pie(df_ratio, values='Count', names='Type', title='How did we chat')
    fig.update_traces(textposition='outside', textinfo='percent',)
    pio.write_image(fig, '../data/ratio_pie_chart.png')
    fig.show()

    # 5. On which date did we chat the most?
    # Uses the frame passed in (not a notebook-level global) so the chart
    # always describes the same data as the charts above.
    date_counts = df['timestamp'].dt.strftime('%Y-%m-%d').value_counts()

    # Sort by the date
    date_counts = date_counts.sort_index()

    # Plot the time series using plotly with smooth curves
    fig = go.Figure(data=go.Scatter(x=date_counts.index, y=date_counts.values, mode='lines+markers', line=dict(shape='spline')))

    fig.update_yaxes(showgrid=True, gridcolor='#EFEFEF')
    fig.update_layout(title='Which date did we chat the most', xaxis_title='Date',
                      plot_bgcolor='rgba(0,0,0,0)', yaxis_title='Count',
                      autosize=False,
                      width=1635,  # width in pixels
                      height=514,  # height in pixels
                      yaxis=dict(range=[0, 50], tickmode='linear', tick0=0, dtick=10),
                      # one tick per date that has messages, labels rotated to stay readable
                      xaxis=dict(tickmode='array', tickvals=date_counts.index, ticktext=date_counts.index, tickangle=90))
    pio.write_image(fig, '../data/ts_chat_highest.png')
    fig.show()

# Call the function with your dataframe.
# viz_df is the parsed chat with the rows matching the exclusion regex removed.
visualize_data(viz_df)

In [None]:
# Count messages by the minute component (minute of the hour) of their
# timestamp, ordered by minute
time_counts = viz_df['timestamp'].dt.minute.value_counts().sort_index()

# Smoothed line-and-marker trace of the per-minute message counts
minute_trace = go.Scatter(
    x=time_counts.index,
    y=time_counts.values,
    mode='lines+markers',
    line=dict(shape='spline'),
)
fig = go.Figure(data=minute_trace)

fig.update_yaxes(showgrid=True, gridcolor='#EFEFEF')
fig.update_layout(
    title='How did we chat everyday',
    xaxis_title='Minute of the Hour',
    yaxis_title='Count',
    plot_bgcolor='rgba(0,0,0,0)',
    autosize=True,
    height=350,
    # fixed axis window and tick spacing for both axes
    yaxis=dict(range=[0, 14], tickmode='linear', tick0=0, dtick=2),
    xaxis=dict(tickmode='linear', tick0=0, dtick=5),
)

# Save a static copy, then render the interactive figure
pio.write_image(fig, '../data/ts_chat_everyday.png')
fig.show()

In [None]:
# Dates (YYYY-MM-DD) that have at least one message, in chronological order.
# Computed here because the `date_counts` used for the chart is local to
# visualize_data() and therefore not available at notebook level.
date_counts = viz_df['timestamp'].dt.strftime('%Y-%m-%d').value_counts().sort_index()
date_counts.index

1. Which word occurred the most in the messages (`chats` column)?
2. From the authors (`users` column), who sent that message?
3. Use a word cloud to represent item 1.
4. Search for all occurrences of item 1 and save the message and author in a dataframe.

##### Which word occurred the most in the messages column?

In [None]:

# Break every message into whitespace-separated tokens, keeping chat order
words = [token for message in viz_df['chats'] for token in message.split()]

# Tally each distinct token and tabulate the (word, count) pairs
word_tally = Counter(words)
word_count_data = list(word_tally.items())
word_count_df = pd.DataFrame(word_count_data, columns=['Word', 'Count'])
word_count_df

In [None]:
# Row label of the largest value in the 'Count' column
count_column = word_count_df['Count']
max_count_index = count_column.idxmax()

# The five largest counts, shown as a quick look at the top of the distribution
lg_count = count_column.nlargest(5)
lg_count

In [None]:
# Point at the row with the largest count. A hard-coded 0 would select the
# first row of word_count_df regardless of its count, so the "highest count"
# reported below would be wrong.
max_count_index = word_count_df['Count'].idxmax()

In [None]:
# Look up the word and its count in the row selected by max_count_index
word_with_highest_count = word_count_df.at[max_count_index, 'Word']
count_with_highest_count = word_count_df.at[max_count_index, 'Count']

# Report the selected word together with its count
print(f"Word with the highest count: {word_with_highest_count}, Count: {count_with_highest_count}")

##### from authors column, who sent that message?

In [None]:
# Find the first author (in chat order) whose message contains the most frequent word.
# - re.escape makes the word match literally; chat tokens such as "?", ":)" or
#   "(lol" are otherwise read as regex syntax and either fail or match the wrong text.
# - The lookup runs on viz_df, the same frame the word counts were built from.
contains_top_word = viz_df['chats'].str.contains(
    re.escape(word_with_highest_count), case=False, regex=True
)
author_of_highest_count_word = viz_df.loc[contains_top_word, 'users'].iloc[0]
author_of_highest_count_word

##### Use a word cloud to represent item 1

In [None]:
# Build the cloud from all message tokens joined back into one string
wordcloud = WordCloud(width=1000, height=500).generate(' '.join(words))

# Draw the cloud on its own figure without axes, then save it as a PNG
cloud_fig, cloud_ax = plt.subplots(figsize=(15, 8))
cloud_ax.imshow(wordcloud)
cloud_ax.axis("off")
cloud_fig.savefig("../data/wordcloud.png")


In [None]:
# All messages (with their authors) that contain the most frequent word.
# regex=False searches for the word as plain text (case-sensitive), so words
# containing regex metacharacters such as "?" or "(" neither raise nor mis-match.
df_common_word = viz_df[
    viz_df['chats'].str.contains(word_with_highest_count, regex=False)
][['chats', 'users']]
df_common_word