#### Imports

In [5]:
import os, re, string, emoji, yaml
import pandas as pd
import numpy as np
from urlextract import URLExtract
from langdetect import detect, detect_langs
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

from googletrans import Translator
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk.download('stopwords')
# nltk.download('punkt')

#### Variables

In [6]:
cwd = os.getcwd()
project_dir = os.path.join(cwd, "..")
data_filepath = os.path.join(project_dir, "WhatsApp Chat with Ai Discussion Group.txt")
chat = ""
extracted_data_dir = insights_filepath = os.path.join(project_dir, "src", 'extracted-data')
insights_filepath = os.path.join(extracted_data_dir, 'insights.yaml')

urlextract = URLExtract()
translator = Translator()

#### Functions

In [18]:
class Utilities:
    def get_stop_words_list(self, ):
        stop_words = stopwords.words('english')
        roman_urdu = self.read_txtfile(os.path.join(project_dir, "data", "roman-urdu-stopwords.txt")).split('\n')
        # urdu = read_txtfile(os.path.join(project_dir, "data", "urdu-stopwords.txt")).split('\n')  # we're not using these words for now.
        stop_words.extend(roman_urdu)
        # stop_words.extend(urdu)
        return stop_words
utitlies = Utilities()

class WordsProcessor:
    def translate_urdu_words(self, words: list):
        for index, word in enumerate(words):
            print(f"processing word: {word}")
            if self.detect_urdu_word(word):
                translation = self.translate_word(word)
                words[index] = translation
        return words

    def detect_urdu_word(self, word: str):
        try:
            lang = self.detect(word)
            if lang=='ur':
                return True
        except:
            lang = False

    def translate_word(self, urdu_text: str):
        for entry_no in range(3):
            try:
                translation = translator.translate(urdu_text, src='ur', dest='en').text
                return translation
            except:
                continue
        return urdu_text

    def separate_urdu_nonurdu_words(self, words: list):
        urdu_words = []
        nonurdu_words = []
        for word in words:
            print(word)
            try:
                lang = self.detect(word)
            except:
                lang = "null"
            
            if lang == 'ur':
                urdu_words.append(word)
            elif lang != 'ur' and lang != 'null':
                nonurdu_words.append(word)

        return (urdu_words, nonurdu_words)

    def map_urdu_word_to_na(self, word: str):
        if self.detect_urdu_word(word):
            return pd.NA
        else:
            return word
words_processor = WordsProcessor()

class DetailsProvider:
    def get_month_year_str(self, datetime_val):
        month_name = datetime_val.month_name()
        year = datetime_val.year
        return f"{month_name}-{year}"

    def get_day_month_year_str(self, datetime_val):
        day = datetime_val.day
        month_name =datetime_val.month_name()
        year = datetime_val.year
        return f'{day}-{month_name}-{year}'
    
    def get_all_member_names(self, df):
        return list(df['username'].unique())
    
    def get_total_messages_count(self, df):
        return len(df['message'])
    
    def get_total_words_count(self, df):
        num_words = 0
        for message in df['message']:
            message_num_words = len(word_tokenize(message))
            num_words += message_num_words

        return num_words    

    def get_media_messages_count(self, df):
        total_media_count = 0
        for message in df['message']:
            if message.__contains__('<Media omitted>\n'):
                total_media_count+=1
        return total_media_count

    def get_urls_count(self, df):
        urls_count = 0
        for message in df['message']:
            urls = urlextract.find_urls(message)
            num_urls = len(urls)
            urls_count+=num_urls
        return urls_count
    
    def get_monthly_activity_df(self, df):
        monthly_activity = pd.DataFrame(df.groupby('month-year')['message'].count().reset_index())
        return monthly_activity.rename(columns={'month-year': 'month', 'message': 'num_messages'})
    
    def get_daily_activity_df(self, df):
        daily_activity = pd.DataFrame(df.groupby('day-month-year')['message'].count().reset_index())
        return daily_activity.rename(columns={'month-year': 'month', 'message': 'num_messages'})
    
    def get_most_busy_months_activity_df(self, df):
        u_months_activity = pd.DataFrame(df.groupby('month')['message'].count().reset_index())
        return u_months_activity.rename(columns={'month': 'month', 'message': 'num_messages'})
    
    def get_most_busy_days_activity_df(self, df):
        u_days_activity = pd.DataFrame(df.groupby('day-name')['message'].count().reset_index())
        return u_days_activity.rename(columns={'day-name': 'day', 'message': 'num_messages'})
    
    def get_hourwise_activity_df(self, df):
        hourwise_activity = pd.DataFrame(df.groupby(['day-name', 'time'])['message'].count().reset_index())
        return hourwise_activity.rename(columns={'day-name': 'day', 'message': 'num_messages'})
    
    def get_most_busy_users_activity_df(self, df):
        users_activity = pd.DataFrame(df.groupby('username')['message'].count().sort_values(ascending=False).reset_index())
        return users_activity.rename(columns={'message': 'num_messages'})
    
    def get_top_words_dict(self, words: list, limit: int):
        # perform words transformation here
        words = self.perform_occurence_count(words)
        return dict(words[:limit])

    def perform_occurence_count(self, words: list):
        occurence_count = dict(FreqDist(words))
        occurence_count = sorted(occurence_count.items(), key = lambda item: item[1], reverse=True)
        return occurence_count
    
    def get_wordcloud(self, df):
        # dict of most common words will be provided and return wordcloud
        return []
details_provider = DetailsProvider()

class MessageProcessor:
    def __init__(self):
        self.punctuations = string.punctuation
    def preprocess_messages(self, text: str):
        text = self.remove_url(text)
        text = self.replace_emojies(text)
        text = self.remove_punctuations(text)
        text = self.tokenize(text)
        text = self.remove_stopwords(text)
        text = self.remove_numbers(text)
        text = self.remove_1_char_words(text)
        return text

    def remove_url(self, text: str):
        url_pattern = r'https?://\S+|www\.\S+'
        return re.sub(url_pattern, "", text)

    def replace_emojies(self, text: str):
        return emoji.demojize(text)

    def remove_punctuations(self, text: str):
        for mark in self.punctuations:
            if mark in text:
                text = text.replace(mark, "")
        return text

    def tokenize(self, text: str):
        return word_tokenize(text)

    def remove_stopwords(text: list):
        stop_words = words_processor.get_stop_words_list()
        text = [word for word in text if word not in stop_words]
        return text

    def remove_numbers(self, text: list):
        return [word for word in text if not str(word).isnumeric()]

    def remove_1_char_words(self, text: str):
        return [word for word in text if not len(word) == 1]    
message_processor = MessageProcessor()

class DFPreparor:
    def __init__(self):
        self.df = pd.DataFrame()
    
    def prepare_df(self, chat_filepath):
        self.load_chat(chat_filepath)
        self.create_datetime_message_cols_from_chat()
        self.create_message_username_cols()
        self.create_insights_cols()
        return self.df
    
    def load_chat(self, chat_filepath):
        self.chat = self.read_txtfile(chat_filepath)

    def read_txtfile(self, filepath):
        if self.path_exists(filepath):
            with open(filepath, 'r') as file:
                contents = file.read()
            return contents
        else:
            raise Exception("File path doesn't exist.")

    def path_exists(self, path: str):
        return True if os.path.exists(path) else False

    def create_datetime_message_cols_from_chat(self):
        regex_for_datetime = "\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}\s[AP]M"
        messages = re.split(regex_for_datetime, self.chat)[1:]
        datetime = re.findall(regex_for_datetime, self.chat)
        self.df = pd.DataFrame({'datetime': datetime, 'message': messages})

    def create_message_username_cols(self):
        regex_for_username = '([\w\W]+?):\s'
        usernames = []
        messages = []

        for message in self.df['message']:
            message = re.split(regex_for_username, message)
            if len(message) > 1:
                username = message[1]
                message = message[2]
            else:
                username = 'group notification'
                message = message[0]
            username = self.remove_dash_strip(username)
            usernames.append(username)
            messages.append(message)

        self.df['username'] = usernames
        self.df['message'] = messages

    def remove_dash_strip(self, text: str):
        return text.replace("-", "").strip()    

    def create_insights_cols(self):
        self.df['datetime'] = pd.to_datetime(self.df['datetime'], format="%m/%d/%y, %I:%M %p")
        self.df['year'] = self.df['datetime'].dt.year
        self.df['month'] = self.df['datetime'].dt.month_name()
        self.df['day'] = self.df['datetime'].dt.day
        self.df['day-name'] = self.df['datetime'].dt.day_name()
        self.df['time'] = self.df['datetime'].dt.time
        self.df['month-year'] = self.df['datetime'].apply(details_provider.get_month_year_str)
        self.df['day-month-year'] = self.df['datetime'].apply(details_provider.get_day_month_year_str)
df_preparor = DFPreparor()

class InsightsProvider:
    def __init__(self, chat_filepath):
        self.gather_insights(chat_filepath)

    def gather_insights(self, chat_filepath):
        self.df = df_preparor.prepare_df()
        insights = {
        'all_members' : details_provider.get_all_member_names(self.df),
        'total_messages' : details_provider.get_total_messages_count(self.df),
        'total_words' : details_provider.get_total_words_count(self.df),
        'total_media_shared' : details_provider.get_media_messages_count(self.df),
        'total_links_shared' : details_provider.get_urls_count(self.df),
        'monthly_activity' : details_provider.get_monthly_activity_df(self.df),
        'daily_activity' : details_provider.get_daily_activity_df(self.df),
        'most_busy_days' : details_provider.get_most_busy_days_activity_df(self.df),
        'most_busy_months' : details_provider.get_most_busy_months_activity_df(self.df),
        'hourwise_activity' : details_provider.get_hourwise_activity_df(self.df),
        'most_busy_users' : details_provider.get_most_busy_users_activity_df(self.df),
        'most_common_words' : details_provider.get_top_words_dict(self.df)
        }
        return insights
    
    def get_topwords_dict_and_wordcloud_fig(self):
        all_words = []
        for message in self.df['message']:
            print(message)
            all_words.extend(message_processor.preprocess_messages(message))

        translated_words = words_processor.translate_urdu_words(all_words)

        words = pd.DataFrame()
        words['all_words'] = all_words
        words['translated_words'] = translated_words

        top_100_words_with_dict = words_processor.get_top_words(words['translated_words'], 100)
        top_100_words = " ".join(list(top_100_words_with_dict.keys()))

        wordcloud = WordCloud(height=800, width=1000, background_color='white', colormap='viridis')
        wordcloud = wordcloud.generate(top_100_words)
        plt.axis('off')
        
        return (top_100_words_with_dict, wordcloud)

SyntaxError: invalid syntax (1134224846.py, line 228)

#### Analysis

##### Main Header details

In [11]:
df = pd.read_csv("dataset.csv")
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [35]:
file = open(insights_filepath, "a", encoding='utf-8')

In [12]:
# main header details
all_members = list(df['username'].unique())
total_messages_count = len(df['message'])
total_words_count = details_provider.get_total_words_count(df)
total_media_count = get_media_messages_count(df)
total_urls_count = get_urls_count(df)

##### Activity-wise

In [44]:
# monthly activity
monthly_activity = pd.DataFrame(df.groupby('month-year')['message'].count().reset_index())
monthly_activity = monthly_activity.rename(columns={'month-year': 'month', 'message': 'num_messages'})
save_extracted_df('monthly_activity', monthly_activity)
monthly_activity.head()

Unnamed: 0,month,num_messages
0,April-2024,39
1,December-2023,69
2,February-2024,27
3,January-2024,119
4,July-2024,13


In [45]:
# daily activity
daily_activity = pd.DataFrame(df.groupby('day-month-year')['message'].count().reset_index())
daily_activity = daily_activity.rename(columns={'month-year': 'month', 'message': 'num_messages'})
save_extracted_df('daily_activity', daily_activity)
daily_activity.head()

Unnamed: 0,day-month-year,num_messages
0,1-December-2023,19
1,1-March-2024,4
2,10-January-2024,2
3,11-December-2023,1
4,11-January-2024,11


In [46]:
# unique months activity
u_months_activity = pd.DataFrame(df.groupby('month')['message'].count().reset_index())
u_months_activity = u_months_activity.rename(columns={'month': 'month', 'message': 'num_messages'})
save_extracted_df('u_months_activity', u_months_activity)
u_months_activity.head()

Unnamed: 0,month,num_messages
0,April,39
1,December,69
2,February,27
3,January,119
4,July,13


In [47]:
# unique days activity
u_days_activity = pd.DataFrame(df.groupby('day-name')['message'].count().reset_index())
u_days_activity = u_days_activity.rename(columns={'day-name': 'day', 'message': 'num_messages'})
save_extracted_df('u_days_activity', u_days_activity)
u_days_activity.head()

Unnamed: 0,day,num_messages
0,Friday,131
1,Monday,57
2,Saturday,240
3,Sunday,109
4,Thursday,114


In [48]:
# unique hours activity
hourwise_activity = pd.DataFrame(df.groupby(['day-name', 'time'])['message'].count().reset_index())
hourwise_activity = hourwise_activity.rename(columns={'day-name': 'day', 'message': 'num_messages'})
save_extracted_df('hourwise_activity', hourwise_activity)
hourwise_activity.head()

Unnamed: 0,day,time,num_messages
0,Friday,00:14:00,1
1,Friday,01:42:00,1
2,Friday,02:52:00,1
3,Friday,03:36:00,1
4,Friday,03:37:00,1


In [49]:
# users activity
users_activity = pd.DataFrame(df.groupby('username')['message'].count().sort_values(ascending=False).reset_index())
users_activity = users_activity.rename(columns={'message': 'num_messages'})
users_activity['username'] = users_activity['username'].apply(remove_dash_space_chars)
save_extracted_df('users_activity', users_activity)
users_activity.head()

Unnamed: 0,username,num_messages
0,Asif Urdu Call Bot,149
1,+92 301 7973326,100
2,group notification,41
3,+92 300 9884227,30
4,+966 54 318 6726,25


In [50]:
# chat % of each user
users_chat_percentage = users_activity.copy()
users_chat_percentage['num_messages'] = np.round(users_chat_percentage['num_messages'] / total_messages_count, 2)
save_extracted_df('users_chat_percentage', users_chat_percentage)
users_chat_percentage.head()

Unnamed: 0,username,num_messages
0,Asif Urdu Call Bot,0.19
1,+92 301 7973326,0.12
2,group notification,0.05
3,+92 300 9884227,0.04
4,+966 54 318 6726,0.03


##### Word cloud

In [13]:
# reloading data
df = pd.read_csv("dataset.csv")
df.dropna(inplace=True)
df.head(2)

Unnamed: 0.1,Unnamed: 0,datetime,message,username,year,month,day,day-name,time,month-year,day-month-year
0,0,2023-11-16 16:57:00,- Messages and calls are end-to-end encrypted...,group notification,2023,November,16,Thursday,16:57:00,November-2023,16-November-2023
1,1,2023-11-04 18:50:00,"- ~ Abdul Rehman zahid created group ""Ai Disc...",group notification,2023,November,4,Saturday,18:50:00,November-2023,4-November-2023


In [None]:
# # filtering words, saving them, generating and saving wordcloud
# all_words = []
# for message in df['message']:
#     print(message)
#     all_words.extend(preprocess_messages(message))

# translated_words = translate_urdu_words(all_words)

# words = pd.DataFrame()
# words['all_words'] = all_words
# words['translated_words'] = translated_words
# words.head()
# words.to_csv("words.csv", index=False)

# top_100_words_with_dict = get_top_words(words['translated_words'], 100)
# top_100_words = " ".join(list(top_100_words_with_dict.keys()))
# top_100_words_with_dict

# wordcloud = WordCloud(height=800, width=1000, background_color='white', colormap='viridis')
# wordcloud = wordcloud.generate(top_100_words)
# plt.axis('off')
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.savefig(create_path(extracted_data_dir, "wordcloud-of-top-100-words.png"), bbox_inches='tight')
# plt.show()