In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import emoji
import emojis
from collections import defaultdict

#export as html without code -> jupyter nbconvert Untitled.ipynb --no-input

In [None]:
def read_file(file):
    '''Read a WhatsApp text export into a list of strings, one per line.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded .txt file to read.

    Returns
    -------
    list of str
        The file content split on line boundaries, without line terminators.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, the path does not exist).
    '''
    # The context manager closes the handle even when reading fails,
    # which the previous open()/read() pair never did.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read().splitlines()
file_path = "" #complete here with your .txt path
chat = read_file(file_path)

In [None]:
#chat

In [None]:
#len(chat)

In [None]:
import datetime
# The patterns expect a line header shaped like
# "<d>/<d>/<d>, <h>:<m> <two letters> - <name>: <text>".
date_patt = re.compile(r'(\d+\/\d+\/\d+)\,')
# [a-zA-Z] rather than [a-zA-z]: the latter also accepts "[", "\", "]", "^", "_" and "`"
time_patt = re.compile(r'\s(\d+\:\d+\s[a-zA-Z]{2})\s\-')
name_patt = re.compile(r'\-\s([a-zA-Z0-9]+\s?[a-zA-Z0-9]+\s?[a-zA-Z0-9]+\s?)\:\s')
def split_line(line):
    '''Split one exported chat line into its date, time, sender and message.

    Parameters
    ----------
    line : str
        A single line of the exported chat.

    Returns
    -------
    dict
        ``{"date": datetime, "time": str, "name": str, "message": str}`` when
        the line carries a full header, otherwise an empty dict (the caller
        is expected to drop those).

    Raises
    ------
    ValueError
        If a date is found but does not follow the ``%m/%d/%y`` layout.
    '''
    if not isinstance(line, str):
        return {}

    date_match = date_patt.search(line)
    time_match = time_patt.search(line)
    name_match = name_patt.search(line)
    if not (date_match and time_match and name_match):
        # some data is missing so... ignore that message
        return {}

    # Only the first two colons belong to the header (time and sender);
    # limiting the split keeps colons inside the message (URLs, times) intact.
    parts = line.split(":", 2)
    if len(parts) < 3:
        return {}

    format_ = "%m/%d/%y"
    return {
        "date": datetime.datetime.strptime(date_match.group(1), format_),
        "time": time_match.group(1),
        "name": name_match.group(1),
        "message": parts[2].strip()
    }

In [None]:
data = list()

In [None]:
# Rebuild the list from scratch so that re-running this cell cannot append the
# same messages a second time. Lines that split_line could not parse come back
# as empty dicts and are dropped here.
data = [entry for entry in map(split_line, chat) if entry]

In [None]:
#len(data)

In [None]:
df = pd.DataFrame(data)

In [None]:
#df.head(60)

In [None]:
df = df.sort_values(by="date")

In [None]:
senders = df.name.unique()

# WhatsApp chat analysis

### Messages sent per sender

In [None]:
df.groupby(by="name")["name"].count()

### Deleted messages per sender

In [None]:
df[df['message'].str.contains("deleted")].groupby(by="name")["name"].count()

### Media sent per sender

In [None]:
df[df['message'].str.contains("<Media omitted>")].groupby(by="name")["name"].count()

### Messages over time

In [None]:
# Number of messages per (date, sender) pair, ordered chronologically
messages_count_over_time = (
    df.groupby(["date", "name"])["name"]
    .count()
    .to_frame("count_")
    .reset_index()
    .sort_values("date")
)

In [None]:
#messages_count_over_time.head(60)

In [None]:
import matplotlib.pyplot as plt
#plt.figure(figsize=(60,10))
freq = 3  # label every freq-th date on the x axis
fig, ax = plt.subplots(figsize=(30,10))
for sender in senders:
    tmp_df = messages_count_over_time[messages_count_over_time['name'] == sender]

    # Spot max count_ date
    max_data = tmp_df.loc[tmp_df['count_'].idxmax()]
    print(max_data.date, max_data.count_)
    ax.annotate('Date: {}\nCount: {}'.format(max_data.date, max_data.count_), xy=(max_data.date, max_data.count_))

    ax.plot(tmp_df.date, tmp_df.count_, label = sender)

# Figure-wide settings are applied once, after every line has been drawn.
# Ticks come from the dates of all senders; setting them inside the loop left
# only the last sender's dates on the axis.
all_dates = messages_count_over_time['date'].drop_duplicates().sort_values()
ax.set_xticks(all_dates[::freq])
# Rotate after the ticks are in place so the rotation applies to the final labels
ax.tick_params(axis='x', rotation=90)
ax.legend()
ax.grid(True)
ax.set_title("Messages sent over time")

fig.savefig("over_time.png")

### Top N emojis per sender

In [None]:
def extract_emojis(columnname, my_df):
    '''Collect every emoji character found in one column of a DataFrame.

    Parameters
    ----------
    columnname : str
        Name of the column holding the text to scan.
    my_df : pandas.DataFrame
        Frame containing that column; each value is converted with ``str()``.

    Returns
    -------
    list of str
        One entry per emoji character, in order of appearance. Text is scanned
        character by character, so an emoji made of several code points is
        reported as its individual parts.
    '''
    # Newer releases of the emoji package expose is_emoji() and no longer ship
    # UNICODE_EMOJI; intermediate releases keep UNICODE_EMOJI but nest it by
    # language code, which made a plain membership test always fail.
    is_emoji = getattr(emoji, "is_emoji", None)
    if is_emoji is None:
        table = emoji.UNICODE_EMOJI
        if isinstance(table.get("en"), dict):
            table = table["en"]
        is_emoji = table.__contains__

    return [
        char
        for text in my_df[columnname]
        for char in str(text)
        if is_emoji(char)
    ]

In [None]:
# Map each sender to a {emoji: occurrences} tally of the emojis they used
emojis_per_sender = {}
for sender in senders:
    sender_rows = df.loc[df['name'] == sender, ['name', 'message']]
    tally = defaultdict(int)
    for symbol in extract_emojis("message", sender_rows):
        tally[symbol] += 1
    emojis_per_sender[sender] = tally

In [None]:
N = 10
def pretty_print(dict_, n):
    '''Print at most the first ``n`` entries of a dict as ``key ---> value``.

    Parameters
    ----------
    dict_ : dict
        Mapping to print; entries are shown in the dict's iteration order.
    n : int
        Maximum number of entries to print. Zero or a negative value prints
        nothing.
    '''
    for position, (key, value) in enumerate(dict_.items()):
        # Stop after exactly n entries (the previous n>=0 test printed n+1)
        if position >= n:
            break
        print(key, "--->", value)

# Show each sender's emojis ordered from most to least used
for sender, emoji_counts in emojis_per_sender.items():
    print("Top {} emojis sent by {}".format(N, sender))
    ranked = sorted(emoji_counts.items(), key=lambda pair: pair[1], reverse=True)
    pretty_print(dict(ranked), N)

In [None]:
#df.head()

In [None]:
## From deleted messages section and media sent section, we notice that we need to remove those expressions before building a corpus
df = df[(df['message'] != "<Media omitted>") & (~df['message'].str.contains("deleted"))]

In [None]:
#! pip install sklearn

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_string(string):
    '''Return the first run of word characters in ``string`` without accents.

    Only the lowercase vowels á, é, í, ó, ú and ü are replaced by their plain
    counterparts. When ``string`` holds no word character at all, an empty
    string is returned.
    '''
    first_word = re.search(r'(\w+)', string)
    if first_word is None:
        return ""
    accent_map = str.maketrans('áéíóúü', 'aeiouu')
    return first_word.group(1).translate(accent_map)

### From some analysis, you can notice some expressions that might be removed (e.g. laughter expressions).
### In this step, I'm removing those expressions and also meaningless words (length <= 2)

# Stop-word sets already loaded, keyed by language, so the corpus file is not
# read again for every single message.
_STOP_WORDS_CACHE = {}

def _normalized_stop_words(language="spanish"):
    '''Return the NLTK stop words for ``language`` plus their accent-free forms.'''
    if language not in _STOP_WORDS_CACHE:
        words = set(stopwords.words(language))
        # Tokens are compared after clean_string() has stripped their accents,
        # so the stop words need the same treatment to still match.
        words.update(clean_string(word) for word in list(words))
        words.discard("")
        _STOP_WORDS_CACHE[language] = words
    return _STOP_WORDS_CACHE[language]

def _normalize_tokens(tokens):
    '''Lowercase and clean tokens, dropping stop words, laughter and short words.'''
    stop_words = _normalized_stop_words()
    vocab = []
    for token in tokens:
        # To lowercase, remove punctuation and replace accented chars
        word = clean_string(token.lower())
        if not word or word in stop_words:
            continue
        # Remove meaningless expressions
        if "jaja" in word or len(word) <= 2:
            continue
        vocab.append(word)
    return vocab

def build_vocab(corpus):
    '''Turn an iterable of messages into one flat list of normalized words.

    Parameters
    ----------
    corpus : iterable of str
        Messages to tokenize; each one is split on single spaces.

    Returns
    -------
    list of str
        Cleaned words of all messages, in order, without Spanish stop words,
        words containing "jaja" or words of two characters or fewer.
    '''
    ## Tokenize and merge
    tokens = []
    for line in corpus:
        tokens.extend(line.split(" "))
    return _normalize_tokens(tokens)

def prepare_text(text):
    '''Normalize a single message and return it as a space-joined string.

    Parameters
    ----------
    text : str
        Message to tokenize on single spaces.

    Returns
    -------
    str
        The words kept by the same filters build_vocab applies, joined with
        single spaces (an empty string when nothing is kept).
    '''
    return " ".join(_normalize_tokens(text.split(" ")))

def generate_wordcloud(list_, sender, mask=False, mask_path=""):
    '''Draw a word cloud, save it as ``wordcloud_<sender>.png`` and show it.

    Parameters
    ----------
    list_ : list of str
        Words to plot; they are joined with spaces before being rendered.
    sender : object
        Label formatted into the output file name.
    mask : bool, optional
        When true, the image at ``mask_path`` is used as the cloud's mask.
    mask_path : str, optional
        Path of the mask image; only read when ``mask`` is true.

    Returns
    -------
    None
    '''
    if not list_:
        # WordCloud.generate() raises ValueError when it receives no words,
        # so report the situation instead of aborting the notebook.
        print("No words to plot for {}; skipping word cloud".format(sender))
        return

    from wordcloud import WordCloud
    #convert list to string
    unique_string = " ".join(list_)
    if mask:
        # Imported here so the function does not rely on another cell having
        # brought these names into scope first.
        import numpy as np
        from PIL import Image
        mask_array = np.array(Image.open(mask_path))
        wordcloud = WordCloud(width = 2000, mask=mask_array, height = 300, background_color="white").generate(unique_string)
    else:
        wordcloud = WordCloud(width = 2000, height = 500, background_color="white").generate(unique_string)

    plt.figure(figsize=(15,8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig("wordcloud_{}.png".format(sender))
    plt.show()
    plt.close()
    return

def word_freq(vocab):
    '''Count how often each word occurs in ``vocab``.

    Returns a plain dict ordered from the most to the least frequent word;
    words with equal counts keep the order in which they first appeared.
    '''
    tally = {}
    for token in vocab:
        tally[token] = tally.get(token, 0) + 1
    # Sorting on the negated count is stable, so ties stay in first-seen order
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    return dict(ranked)

In [None]:
#! python -m spacy download es_core_news_sm

In [None]:
#! pip install wordcloud

### Top N words by sender

In [None]:
N = 20
# For every sender: build the vocabulary of their messages and list the
# most frequent words.
for sender in senders:
    sender_messages = df.loc[df['name'] == sender, 'message']
    word_frequencies = word_freq(build_vocab(sender_messages))
    print("Top {} words written by {}".format(N, sender))
    pretty_print(word_frequencies, N)

In [None]:
### Wordcloud per sender
from PIL import Image
import numpy as np
# Image used as the shape of every sender's cloud
mask = './wordcloud_shapes/mariposa.jpg'
for sender in senders:
    print("WordCloud for {}".format(sender))
    sender_messages = df.loc[df['name'] == sender, 'message']
    vocab = [word for word in build_vocab(sender_messages) if "jaja" not in word]
    generate_wordcloud(vocab, sender, mask=True, mask_path=mask)
    

### WordCloud by date: What were you talking about on...?

In [None]:
date = "2020-05-04"
date_df = df[df['date'] == date][['date', 'name','message']]

In [None]:
print("Date: {}".format(date))

In [None]:
### Prepare messages
# assign() builds a new frame instead of writing into date_df, which is a
# slice of df and would otherwise trigger SettingWithCopyWarning.
date_df = date_df.assign(message=date_df['message'].apply(prepare_text))
tmp_corpus = date_df.message
vocab = build_vocab(tmp_corpus)
# Label the image with the selected date; passing False as the label saved
# the figure as "wordcloud_False.png".
generate_wordcloud(vocab, date)

### Topic modelling

In [None]:
### Prepare messages
df['message'] = df['message'].apply(lambda x: prepare_text(x))

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def print_top_words(model, feature_names, n_top_words):
    '''Print the highest-weighted words of every topic, one line per topic.

    ``model.components_`` is iterated row by row; for each row the
    ``n_top_words`` largest entries are looked up in ``feature_names`` and
    printed in descending weight order. A blank line follows the last topic.
    '''
    # Credits: https://github.com/MaartenGr/soan/blob/master/whatsapp/topic.py
    for topic_idx, topic in enumerate(model.components_):
        strongest_first = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in strongest_first]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()

def topics(df):
    '''Fit an LDA topic model per sender and print each model's top words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column (sender) and a ``message`` column (text).

    Returns
    -------
    None
        Results are printed. A sender whose messages cannot be vectorized is
        reported and skipped.
    '''
    # Credits: https://github.com/MaartenGr/soan/blob/master/whatsapp/topic.py
    # Create Topics
    for user in df.name.unique():
        print("#" * len(user) + "########")
        print("### " + user + " ###")
        print("#" * len(user) + "########\n")

        data_samples = df[df.name == user].message.tolist()

        # Extracting Features
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        try:
            tf = tf_vectorizer.fit_transform(data_samples)
        except ValueError as err:
            # Raised when the sender's messages leave no usable vocabulary;
            # keep going with the remaining senders.
            print("Skipping {}: could not build a vocabulary ({})\n".format(user, err))
            continue

        # Fitting LDA
        topic_model = LatentDirichletAllocation(n_components=5, max_iter=15,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        topic_model.fit(tf)

        # get_feature_names() is absent from recent scikit-learn releases;
        # prefer its replacement and fall back for older installations.
        if hasattr(tf_vectorizer, "get_feature_names_out"):
            feature_names = tf_vectorizer.get_feature_names_out()
        else:
            feature_names = tf_vectorizer.get_feature_names()

        print("\nTopics in LDA model:")
        print_top_words(topic_model, feature_names, 7)

In [None]:
topics(df)