<a href="https://colab.research.google.com/github/SriSatyaLokesh/some-app-for-whatsapp/blob/master/notebooks/analytics/Emoji_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from google.colab import files
uploaded = files.upload()

Saving WhatsApp Chat with CSE-B.txt to WhatsApp Chat with CSE-B.txt


In [17]:
import pandas as pd
from itertools import chain, repeat
from string import digits
import re

chainer = chain.from_iterable
class Cleaner:
  def __init__(self,filename):
    self.filename = filename

  def _clean_data(self):
    """ 
    What it does?
    ----------------
      remove action messages and append new line without date format to previous message
      
      EXAMPLE : 
      17/07/19, 8:33 pm - +91 90633 88499: Sir,
      *We have a chance to write Cocubes exam later...?*
      CHANGED TO:
      17/07/19, 8:33 pm - +91 90633 88499: Sir, *We have a chance to write Cocubes exam later...?*
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      raw_texts : list
                  list consists of all raw messages ignoring actions from the chat file
      
    """
    chat_file = self.filename
    with open(chat_file) as file:
      all_msgs = file.readlines()
      raw_texts = []
      for line in all_msgs:
        if re.match("^\d{2}/\d{2}/\d{2,4},\s\d{1,2}:\d{2}\s[ap]*[m]*",line):
          date_split = line.split(' - ') 
          user_split = date_split[1].split(':')
          if len(user_split) >= 2:
            raw_texts.append(line)
        else:
          try:
            raw_texts[-1] += line
          except Exception as err:
            print(line,raw_texts)
            print(err)

    return raw_texts 
      
  def _get_data(self):
    """ 
    What it does?
    ----------------
      Import whatsapp data and transform it to a dataframe
      
      Parameters:
      -----------
      -
          
      Returns:
      --------
      df : dataframe
          Dataframe of all messages with columns - ["users", "raw_text"]
      
    """
    raw_texts = self._clean_data()
    raw_messages = dict()
    for line in raw_texts:
      date_split = line.split(' - ')
      user_split = date_split[1].split(':')
      user = user_split[0]
      if user not in raw_messages:
        raw_messages[user] = [line]
      else:
        raw_messages[user].append(line)
    df = pd.DataFrame({'user': list(chainer(repeat(k, len(v)) for k,v in raw_messages.items())),
                   'raw_text': list(chainer(raw_messages.values()))}) 
    return df

  def _get_message(self,df):
    """ 
    What it does?
    ----------------
      new column is added to existing dataframe i.e clean_message which ignores datetime and user from the raw_message
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      df : dataframe
          Dataframe of all messages with columns - ["users", "raw_text", "raw_message",]
      
    """
    df["raw_message"] = df.apply(lambda df : "".join(df['raw_text'].split(":")[2:]),axis=1)
    df["raw_message"] = df.apply(lambda df : df['raw_message'][:-1],axis=1)
    return df

  def _get_text_only_message(self,df):
    """ 
    What it does?
    ----------------
      new column is added to existing dataframe i.e text_only_message which ignores all emojis and numbers from the clean_message
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      df : dataframe
          Dataframe of all messages with columns - ["users", "raw_text", "raw_message", "text_only_message"]
      
    """
    remove_digits = str.maketrans('', '', digits)
    df["text_only_message"] = df.apply(lambda df : df["raw_message"].encode('ascii', 'ignore').decode('ascii'),axis=1)     # removing emoji s from cleam_message
    df["text_only_message"] = df.apply(lambda df : df["text_only_message"].translate(remove_digits),axis=1)          # removing digits from clean_message

    df.loc[(df['text_only_message'] == " This message was deleted") | (df['text_only_message'] == " <Media omitted>"),'text_only_message'] = "" # updating "This message was deleted" & "media" to ""(empty_string)

    return df

  def _remove_inactive_users(self,df):
    """ 
    What it does?
    ----------------
      removes inactive users i.e user with no of messages < 10 from dataframe
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      df : dataframe
          Dataframe of all messages with columns - ["users", "raw_text", "raw_message", "text_only_message"]
          removing inactive users
      
    """
    df = df.groupby('user').filter(lambda x : len(x) > 10)
    return df

  def _get_user_media_counts(self,df):
    """ 
    What it does?
    ----------------
      it counts individual user media that is sent in the chat
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      user_media_counts : dictionary
                          key as 'user' : value as 'media_count'
      
    """
    media_count = df[df['raw_message'] == " <Media omitted>"].groupby('user').size()
    user_media_counts = media_count.to_dict()
    return user_media_counts

  def _get_datetime(self,df):
    """ 
    What it does?
    ----------------
      3 columns are added i.e 
      
      1. "date" : datetime from raw_text
      2. "hour" : Hour from date column
      3. "weekday" : day of the week from date column 
      
      Parameters:
      -----------
      df : dataframe
          
      Returns:
      --------
      df : dataframe
          Dataframe with only active user messages with columns - ["users", "raw_text", "raw_message", "text_only_message", "date", "hour", "weekday"]
      
    """

    df['date'] =  df.apply(lambda df : df['raw_text'].split(" - ")[0],axis=1)

    temp = ["%d/%m/%Y, %I:%M %p" , "%d/%m/%y, %I:%M %p" , "%d/%m/%Y, %H:%M" , "%d/%m/%y, %H:%M" ,
        "%d/%Y/%m, %I:%M %p" , "%d/%y/%m, %I:%M %p" , "%d/%Y/%m, %H:%M" , "%d/%y/%m, %H:%M" ,
        "%Y/%m/%d, %I:%M %p" , "%y/%m/%d, %I:%M %p" , "%Y/%m/%d, %H:%M" , "%y/%m/%d, %H:%M" ,
        "%Y/%d/%m, %I:%M %p" , "%y/%d/%m, %I:%M %p" , "%Y/%d/%m, %H:%M" , "%y/%d/%m, %H:%M" ,
        "%m/%Y/%d, %I:%M %p" , "%m/%y/%d, %I:%M %p" , "%m/%Y/%d, %H:%M" , "%m/%y/%d, %H:%M" ,
        "%m/%d/%Y, %I:%M %p" , "%m/%d/%y, %I:%M %p" , "%m/%d/%Y, %H:%M" , "%m/%d/%y, %H:%M"]

    for formats in temp:       
      try:
        df['date'] = pd.to_datetime(df['date'], format=formats)
      except:
        continue
    df['hour'] = df['date'].dt.hour
    df['weekday'] = df['date'].dt.weekday
    
    return df

chat_file = 'WhatsApp Chat with CSE-B.txt'
clean = Cleaner(chat_file)
df = clean._get_data()
df = clean._get_message(df)
df = clean._get_text_only_message(df)
df = clean._remove_inactive_users(df)
df = clean._get_datetime(df)
df

Unnamed: 0,user,raw_text,raw_message,text_only_message,date,hour,weekday
0,Pavan,"23/10/19, 9:22 pm - Pavan: <Media omitted>\n",<Media omitted>,,2019-10-23 21:22:00,21,2
1,Pavan,"23/10/19, 9:22 pm - Pavan: <Media omitted>\n",<Media omitted>,,2019-10-23 21:22:00,21,2
2,Pavan,"23/10/19, 9:22 pm - Pavan: <Media omitted>\n",<Media omitted>,,2019-10-23 21:22:00,21,2
3,Pavan,"23/10/19, 9:22 pm - Pavan: <Media omitted>\n",<Media omitted>,,2019-10-23 21:22:00,21,2
4,Pavan,"23/10/19, 9:22 pm - Pavan: <Media omitted>\n",<Media omitted>,,2019-10-23 21:22:00,21,2
...,...,...,...,...,...,...,...
1037,+91 95732 28032,"25/04/20, 2:37 pm - +91 95732 28032: We should...",We should tell them that the next time they w...,We should tell them that the next time they w...,2020-04-25 14:37:00,14,5
1038,+91 95732 28032,"25/04/20, 2:40 pm - +91 95732 28032: This mess...",This message was deleted,,2020-04-25 14:40:00,14,5
1039,+91 95732 28032,"25/04/20, 2:41 pm - +91 95732 28032: This mess...",This message was deleted,,2020-04-25 14:41:00,14,5
1040,+91 95732 28032,"07/05/20, 11:00 am - +91 95732 28032: Bro were...",Bro weren't they telling that they'll track t...,Bro weren't they telling that they'll track t...,2020-05-07 11:00:00,11,3


In [18]:
!pip install emoji



In [19]:
import emoji

def extract_emojis(str):
  l = "".join(c for c in str if c in emoji.UNICODE_EMOJI)
  return l

emoji_df = pd.DataFrame({'user':df['user'],'emoji':df['raw_message'].apply(extract_emojis)})
emoji_df.drop(emoji_df[emoji_df['emoji'] == ''].index, inplace = True) 
emoji_df

Unnamed: 0,user,emoji
24,Pavan,🍾🍾
25,Pavan,🍾
56,BlueBrain 🧠 => 💻,🙂
57,BlueBrain 🧠 => 💻,🙂
58,BlueBrain 🧠 => 💻,👍🏻
...,...,...
959,+91 99636 58876,😅
964,+91 99636 58876,😂😂🙏
965,+91 99636 58876,😶
1010,Mayuka,😹😹


In [20]:
emoji = pd.DataFrame({'user': emoji_df['user'],'emoji':emoji_df.groupby('user')['emoji'].transform(lambda x: ''.join(x))})
emoji = emoji[['user','emoji']].drop_duplicates()
emoji = emoji.set_index('user')
emoji

Unnamed: 0_level_0,emoji
user,Unnamed: 1_level_1
Pavan,🍾🍾🍾
BlueBrain 🧠 => 💻,🙂🙂👍🏻
+91 97052 34094,🎂🎂🎂🎂
Hemanth_VMEG,🤣🤣🤣🤣😧😧🤦♂🤣🤣🤣🤣🤣
+91 73963 22930,🥳🥳😍🥳🍻🥳🥰😎😍🎂😍🥳😎🤔🤔😂😂
Bharath_VMEG,😍🖤😍🖤
+91 73307 33799,🎊🎉🎊🎉🎉🎊🎉🎊🎂🎉🎊🎊🎉🎊🎉🎉🎊🎊🎉🤣😅😂🤣🤣🤣😂😤
+91 96420 00771,🎂🎂🎉🎂🎂🎂🎂🎂🎂🎉🎂🎂🎂🎂🎂🎉🎂🎂🎂🎂
Ajith,🎂😍😍🎂🎂
Niharika_V,😍❤😍😠


In [0]:
from collections import Counter 

In [0]:
def emoji_count(str):
  return Counter(str)

emoji['emoji'] = emoji['emoji'].apply(emoji_count)

In [23]:
emoji

Unnamed: 0_level_0,emoji
user,Unnamed: 1_level_1
Pavan,{'🍾': 3}
BlueBrain 🧠 => 💻,"{'🙂': 2, '👍': 1, '🏻': 1}"
+91 97052 34094,{'🎂': 4}
Hemanth_VMEG,"{'🤣': 9, '😧': 2, '🤦': 1, '♂': 1}"
+91 73963 22930,"{'🥳': 5, '😍': 3, '🍻': 1, '🥰': 1, '😎': 2, '🎂': ..."
Bharath_VMEG,"{'😍': 2, '🖤': 2}"
+91 73307 33799,"{'🎊': 9, '🎉': 9, '🎂': 1, '🤣': 4, '😅': 1, '😂': ..."
+91 96420 00771,"{'🎂': 17, '🎉': 3}"
Ajith,"{'🎂': 3, '😍': 2}"
Niharika_V,"{'😍': 2, '❤': 1, '😠': 1}"
