In [1]:
# All imports are in this cell
import re
from datetime import datetime

In [2]:
# All constants are in this cell
# Path of the exported chat text file; passed to getSimplifiedChatData to build Data
fileName = 'MachauWingiesChatData.txt'

In [74]:
def getDateTimeNameMessage(line):
    '''Parse one line of the exported chat text.

    Input: line - a single line (string) read from the chat export
    Output: tuple in the order (date, time, name, message)
        * New message line: date as datetime type, time in h:mm am/pm
          format as string type, name of the sender and the message text
          (without the trailing newline)
        * Line with a date and time but no "name: message" part
          (e.g. someone left the group): (None, None, None, None)
        * Any other line: (None, None, None, line) - the whole line is
          returned as a continuation of the previous message
    Raises: ValueError if the matched date text is not a valid dd/mm/yyyy date
    '''

    # Raw strings keep "\s" as a regex escape instead of an invalid string escape
    date = re.search(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", line)
    time = re.search(r"[0-9]+:[0-9]{2}\s[ap]m", line)
    name = re.search(r"\s-\s(.*?):", line)
    # Non-greedy ".*?" stops at the first ": " after the name, so a message
    # that itself contains ": " is kept whole instead of being cut at its
    # last colon
    message = re.search(r"\s-\s.*?:\s(.+)", line)

    if date is None or time is None:
        # No timestamp: this line only continues the previous message
        return (None, None, None, line)

    if name is None or message is None:
        # Timestamp without "name: message", e.g. someone left the group
        return (None, None, None, None)

    return (
        datetime.strptime(date.group(0), '%d/%m/%Y'),
        time.group(0),
        name.group(1),
        message.group(1),
    )

In [75]:
def getSimplifiedChatData(filename):
    '''Read the chat export and return its messages as a list.

    Input: filename - path of the chat data text file (read as utf8)
    Output: list of all chat data with each value being a tuple in the order
        (date, time, name, message)

    Large messages which come in new lines in the chat data text file are
    merged into the message they continue. Lines for which
    getDateTimeNameMessage returns no message (e.g. someone left the group)
    are skipped, and so are continuation lines that appear before any message.
    '''

    chatDataList = []                 # to store and return the simplified data

    # "with" closes the file even if parsing raises part-way through
    with open(filename, 'r', encoding="utf8") as chatDataTxt:
        for eachLine in chatDataTxt:
            dateTimeNameMsgTuple = getDateTimeNameMessage(eachLine)
            date = dateTimeNameMsgTuple[0]
            message = dateTimeNameMsgTuple[-1]

            if date is not None:          # New data found
                chatDataList.append(dateTimeNameMsgTuple)
                continue

            if message is None:           # e.g. someone left a group. Skip this data
                continue

            if not chatDataList:          # nothing to attach this continuation to
                continue

            # message continues from previous data
            prevDate, prevTime, prevName, prevMessage = chatDataList[-1]
            # The first line of a message is stored without its newline, so
            # put the line break back; otherwise the last word of one line
            # and the first word of the next are glued together
            if not prevMessage.endswith("\n"):
                prevMessage = prevMessage + "\n"
            chatDataList[-1] = (prevDate, prevTime, prevName, prevMessage + message)

    return chatDataList

In [76]:
def getAllParticipantsName(filename, includeCompleteName = False):
    '''Return a list with the name of every group participant, without duplicates.

    Only participants who posted at least one message are listed, and the
    names are the ones stored in the shared chat data.

    Input: filename - path of the chat data text file
           includeCompleteName (optional) - if true the complete name is
           returned, else just the first name
    Output: list of names
    '''

    senders = (record[2] for record in getSimplifiedChatData(filename))

    if includeCompleteName:
        uniqueNames = set(senders)                       # complete names as stored
    else:
        uniqueNames = {fullName.split()[0] for fullName in senders}   # first word only

    return list(uniqueNames)

In [77]:
def GetBasicStats(chatDataList):
    ''' Input: list output from getSimplifiedChatData function
        Output: tuple of general stats (number of messages, chat duration in days,
                total number of characters in messages (including spaces),
                total number of words, total number of media content)

        Chat duration counts both the earliest and the latest day, so all
        messages on one day give a duration of 1.
        An empty list gives (0, 0, 0, 0, 0).
    '''

    if not chatDataList:
        # No messages: there is no first/last date to measure a duration from
        return (0, 0, 0, 0, 0)

    nMsg = len(chatDataList)

    # Earliest/latest rather than first/last, so the result does not depend
    # on the list being in chronological order
    dates = [item[0] for item in chatDataList]
    ChatDuration = (max(dates) - min(dates)).days + 1

    nCharacters = sum(len(item[3]) for item in chatDataList)  # including spaces
    nWords = sum(len(item[3].split()) for item in chatDataList)
    nMedia = sum(1 for item in chatDataList if item[3] == "<Media omitted>")

    return (nMsg, ChatDuration, nCharacters, nWords, nMedia)

In [78]:
def GetDetailedStats(chatDataList):
    '''
    Input: list output from getSimplifiedChatData function
    Output: tuple (AvgMsgPerDay, AvgCharPerMsg, AvgCharPerDay, LenLongestMsg, AvgWordsPerMsg, AvgWordsPerDay, AvgMediaPerDay)

    Every average is truncated to a whole number.
    An empty list gives a tuple of seven zeros.
    '''

    if not chatDataList:
        # Nothing to average; also avoids dividing by a zero message count
        return (0, 0, 0, 0, 0, 0, 0)

    (nMsg, ChatDuration, nCharacters, nWords, nMedia) = GetBasicStats(chatDataList)

    AvgMsgPerDay = nMsg // ChatDuration
    AvgCharPerMsg = nCharacters // nMsg
    AvgWordsPerMsg = nWords // nMsg
    AvgMediaPerDay = nMedia // ChatDuration

    # Per-day figures are taken from the raw totals. Multiplying the two
    # already-truncated averages (msgs/day * chars/msg) compounds the
    # rounding error and can be far below the real value.
    AvgCharPerDay = nCharacters // ChatDuration
    AvgWordsPerDay = nWords // ChatDuration

    LenLongestMsg = max(len(item[3]) for item in chatDataList)

    return (AvgMsgPerDay, AvgCharPerMsg, AvgCharPerDay, LenLongestMsg, AvgWordsPerMsg, AvgWordsPerDay, AvgMediaPerDay)

In [79]:
def GetIndividualDataDistribution(chatDataList):
    '''
    Group the chat data by the first name of the sender.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[First Name as key]: list of ChatData tuples for that Individual,
            in the order they appear in the input
    '''
    perMember = {}

    for record in chatDataList:
        senderFirstName = record[2].split()[0]    # first word of the saved name
        perMember.setdefault(senderFirstName, []).append(record)

    return perMember
    

In [80]:
def getIndividualStats(chatDataList):
    '''
    Compute the detailed stats separately for every member.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[First Name as key]: Detailed Stats (GetDetailedStats tuple) of chat for that Individual
    '''
    perMember = GetIndividualDataDistribution(chatDataList)

    return {
        firstName: GetDetailedStats(memberRecords)
        for firstName, memberRecords in perMember.items()
    }
    

In [81]:
def getDayWiseDataDistribution(chatDataList):
    '''
    Group the chat data by date.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[Date in datetime as key]: list of ChatData tuples for that Date,
            in the order they appear in the input
    '''
    perDay = {}

    for record in chatDataList:
        day = record[0]
        perDay.setdefault(day, []).append(record)

    return perDay
        
    
    

In [88]:
def getDayWiseStats(chatDataList):
    '''
    Compute the detailed stats separately for every date.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[Date in datetime type as key]: Detailed Stats (GetDetailedStats tuple) of chat for that Date
    '''
    perDay = getDayWiseDataDistribution(chatDataList)

    return {
        day: GetDetailedStats(dayRecords)
        for day, dayRecords in perDay.items()
    }

In [94]:
def getDayWisePersonWiseDistribution(chatDataList):
    '''
    Group the chat data first by date and then by the first name of the sender.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[Date in datetime type as key]: Dictionary[First Name as key]: ChatData for that Individual that day
    '''
    perDay = getDayWiseDataDistribution(chatDataList)

    return {
        day: GetIndividualDataDistribution(dayRecords)
        for day, dayRecords in perDay.items()
    }
    

In [95]:
def getDayWisePersonWiseStats(chatDataList):
    '''
    Compute the detailed stats of every member separately for every date.

    Input: list output from getSimplifiedChatData function
    Output: Dictionary[Date in datetime type as key]: Dictionary[First Name as key]: DetailedStats for that Individual that day
    '''
    perDay = getDayWiseDataDistribution(chatDataList)

    return {
        day: getIndividualStats(dayRecords)
        for day, dayRecords in perDay.items()
    }

In [96]:
# Parse the chat export once; Data is a list of (date, time, name, message) tuples
Data = getSimplifiedChatData(fileName)

In [51]:
# a: date -> list of chat data tuples posted on that date
a = getDayWiseDataDistribution(Data)

In [64]:
# b: date -> detailed stats tuple for that date
b = getDayWiseStats(Data)

In [97]:
# c: date -> first name -> detailed stats tuple for that person on that date
c = getDayWisePersonWiseStats(Data)

In [101]:
# Detailed stats for first name "Apoorv" on 3 April 2020; the key is the date at midnight
c[datetime(2020, 4, 3, 0, 0)]["Apoorv"]

(5, 16, 80, 25, 3, 15, 1)