In [1]:
# All imports are in this cell
import re
from datetime import datetime

In [2]:
# All constants are in this cell
# Name of the exported chat text file. None of the functions visible in this
# file read this constant directly; presumably it is passed as the `filename`
# argument by the caller -- TODO confirm.
fileName = 'MachauWingiesChatData.txt'

In [3]:
# Patterns are compiled once; raw strings keep "\s" from being interpreted
# as an (invalid) string escape sequence.
_DATE_PATTERN = re.compile(r"[0-9]{2}/[0-9]{2}/[0-9]{4}")
_TIME_PATTERN = re.compile(r"[0-9]+:[0-9]{2}\s[ap]m")
_NAME_PATTERN = re.compile(r"\s-\s(.*?):")
# Non-greedy ".*?" stops at the first ": " after the " - " separator, so a
# colon inside the message body stays part of the message.
_MESSAGE_PATTERN = re.compile(r"\s-\s.*?:\s(.+)")


def getDateTimeNameMessage(line):
    '''Split one line of the chat export into its components.

    Parameters:
        line: a single line of text from the chat export.

    Returns a tuple (date, time, name, message):
        * New message line: date as "dd/mm/yyyy", time as "h:mm am/pm",
          name as the text between " - " and the first ":", message as the
          text after the first ": " (without the trailing newline).
        * Line with a date and time but no "name: message" part (for
          example a "someone left" notice): (None, None, None, None).
        * Any other line (no date or no time): treated as a continuation of
          the previous message and returned as (None, None, None, line),
          with the line left untouched.
    '''
    date = _DATE_PATTERN.search(line)
    time = _TIME_PATTERN.search(line)

    # Without both a date and a time this cannot start a new entry, so it
    # is handed back verbatim as a continuation of the previous message.
    if date is None or time is None:
        return (None, None, None, line)

    name = _NAME_PATTERN.search(line)
    message = _MESSAGE_PATTERN.search(line)

    # Date and time are present but there is no "name: message" part:
    # this is a notification line, not a chat message.
    if name is None or message is None:
        return (None, None, None, None)

    return (date.group(0), time.group(0), name.group(1), message.group(1))

In [31]:
def getSimplifiedChatData(filename):
    '''Read a chat export and return its messages as a list of tuples.

    Parameters:
        filename: path of the chat export text file (read as UTF-8).

    Returns a list whose items are (date, time, name, message) tuples, one
    per chat message, in file order.

    Lines are classified with getDateTimeNameMessage:
        * a line that starts a new message is appended as a new entry;
        * a notification line (all fields None) is skipped;
        * any other line is a continuation of the previous message and is
          merged into it, separated by a newline so the original line
          break is preserved.
    '''
    chatDataList = []                 # to store and return the simplified data

    # The context manager guarantees the file is closed even on error.
    with open(filename, 'r', encoding="utf8") as chatDataTxt:
        for eachLine in chatDataTxt:
            date, time, name, message = getDateTimeNameMessage(eachLine)

            if date is not None:          # New data found
                chatDataList.append((date, time, name, message))

            elif message is None:         # Notification line. Skip this data
                continue

            elif chatDataList:            # message continues from previous data
                prevDate, prevTime, prevName, prevMessage = chatDataList[-1]
                # The stored message has no trailing newline while the
                # continuation line does; re-insert the break between them
                # instead of gluing the two lines together.
                mergedMessage = prevMessage + "\n" + message.rstrip("\n")
                chatDataList[-1] = (prevDate, prevTime, prevName, mergedMessage)

            # A continuation line that appears before any message has
            # nothing to attach to, so it is dropped rather than raising
            # IndexError on the empty list.

    return chatDataList

In [89]:
def GetBasicStats(chatDataList):
    '''Compute general statistics for a list of chat messages.

    Input: list of (date, time, name, message) tuples as returned by
           getSimplifiedChatData, with date formatted as "dd/mm/yyyy".
    Output: tuple (number of messages,
                   chat duration in days between the first and last entry,
                   total number of characters in messages, including spaces).
            An empty input yields (0, 0, 0).
    Raises: ValueError if the first or last date is not in dd/mm/yyyy format.
    '''
    if not chatDataList:
        return (0, 0, 0)

    nMsg = len(chatDataList)

    # Duration is measured between the first and last list entries, so the
    # list is expected to be in chronological order.
    chatStartDate = datetime.strptime(chatDataList[0][0], '%d/%m/%Y')
    chatEndDate = datetime.strptime(chatDataList[-1][0], '%d/%m/%Y')
    chatDuration = (chatEndDate - chatStartDate).days

    nCharacters = sum(len(item[3]) for item in chatDataList)  # including spaces

    return (nMsg, chatDuration, nCharacters)

In [None]:
def GetIndividualStats(chatDataList):
    '''Compute detailed statistics separately for every sender.

    Input: list of (date, time, name, message) tuples as returned by
           getSimplifiedChatData.
    Output: dictionary {member name: result of GetDetailedStats for the
            messages sent by that member}. Empty input gives an empty dict.
    '''
    # Group the messages by sender name (index 2 of each tuple), keeping
    # the original order of messages within each group.
    membersData = {}
    for item in chatDataList:
        membersData.setdefault(item[2], []).append(item)

    return {
        member: GetDetailedStats(memberMessages)
        for member, memberMessages in membersData.items()
    }
            
    
    

In [117]:
def GetDetailedStats(chatDataList):
    '''Compute per-day and per-message averages for a list of messages.

    Input: list of (date, time, name, message) tuples as returned by
           getSimplifiedChatData.
    Output: tuple (AvgMsgPerDay, AvgCharPerMsg, AvgCharPerDay, LenLongestMsg).
            An empty input yields all zeros.
    '''
    # Nothing to average; also avoids dividing by a zero message count.
    if not chatDataList:
        return (0.0, 0.0, 0.0, 0)

    (nMsg, chatDuration, nCharacters) = GetBasicStats(chatDataList)

    # When every message falls on the same date the duration is 0 days;
    # count that as a single day instead of dividing by zero.
    effectiveDays = max(chatDuration, 1)

    avgMsgPerDay = nMsg / effectiveDays
    avgCharPerMsg = nCharacters / nMsg
    avgCharPerDay = avgMsgPerDay * avgCharPerMsg

    lenLongestMsg = max((len(item[3]) for item in chatDataList), default=0)

    return (avgMsgPerDay, avgCharPerMsg, avgCharPerDay, lenLongestMsg)

In [29]:
def getAllParticipantsName(filename, includeCompleteName = False):
    '''Return the names of all participants who posted at least one message.

    Names are taken as they appear in the chat export, i.e. as saved by
    the person who exported the chat.

    Parameters:
        filename: path of the chat export text file.
        includeCompleteName: if True the complete name is returned,
            otherwise only the first whitespace-separated word of it.

    Returns a sorted list of unique names.
    '''
    allChatDataSimplified = getSimplifiedChatData(filename)  # Getting simplified data
    allChatParticipants = set()                              # set removes duplicate names

    for eachChatData in allChatDataSimplified:
        name = eachChatData[2]

        if not includeCompleteName:                          # only first name is required
            nameParts = name.split()
            # A blank name has no first word; keep it unchanged instead of
            # raising IndexError.
            if nameParts:
                name = nameParts[0]

        allChatParticipants.add(name)

    # Sorting gives a stable order; iteration order of a set is arbitrary.
    return sorted(allChatParticipants)