In [None]:
import glob
import sqlite3
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from textblob import TextBlob

# Load the exported message table and add a `Date` column by interpreting
# the `timestamp` column as milliseconds since the Unix epoch.
df = pd.read_csv('Data/msgstore.dbmessages.csv')
df['Date'] = pd.to_datetime(df['timestamp'],unit='ms') 

# Display the five chats (rows grouped by `key_remote_jid`) with the most messages:
pd.DataFrame(df.groupby('key_remote_jid').size().sort_values(ascending=False).head(5))

# Make a map of all sent coordinates

In [None]:
import gmplot

def drawMap(df, fileName=None):
    """Plot every shared location in ``df`` on a Google map saved as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with ``latitude`` and ``longitude`` columns. Rows whose
        coordinates are both 0, or that have a missing coordinate, are skipped.
    fileName : str, optional
        Path of the HTML file to write. When omitted the file is called
        ``map.html`` and is placed under the global ``dir_path`` if such a
        variable exists, otherwise in the current working directory.

    Returns
    -------
    None
    """
    # Keep rows that carry at least one non-zero coordinate and no gaps.
    locFrame = df[(df.latitude != 0) | (df.longitude != 0)]
    locFrame = locFrame.dropna(subset=['latitude', 'longitude'])

    if locFrame.empty:
        # Nothing to centre the map on; avoid indexing an empty list below.
        print('No location messages found - no map written.')
        return

    if fileName is None:
        # `dir_path` is not defined in the visible notebook; fall back to the
        # working directory instead of failing with a NameError.
        fileName = globals().get('dir_path', '') + 'map.html'

    latitude_list = locFrame.latitude.tolist()
    longitude_list = locFrame.longitude.tolist()

    # Centre the map on the first location, zoom level 5.
    gmap = gmplot.GoogleMapPlotter(latitude_list[0], longitude_list[0], 5)

    gmap.scatter(latitude_list, longitude_list, '#3B0B39', size=20000, marker=True)
    gmap.draw(fileName)
    
# Write the location map for the full message table.
drawMap(df)

# Plot with all messages

In [None]:
def allPlot(df, grouper):
    """Draw a bar chart of the number of messages per group.

    ``grouper`` is called with the ``DatetimeIndex`` built from ``df.Date``
    and must return the key (or list of keys) to group the rows by.
    """
    stamps = pd.DatetimeIndex(df.Date)
    counts = df.groupby(grouper(stamps)).size()

    # Figure width grows with the number of bars, clamped to [20, 200].
    width = min(max(20, counts.size / 2.0), 200)

    _, axes = plt.subplots()
    counts.plot(kind='bar', figsize=(width, 5), ax=axes)
    plt.xlabel('Date')
    plt.ylabel('Messages')
    
# Message totals per day of the week, then per (year, month).
allPlot(df, lambda times: times.dayofweek)
allPlot(df, lambda times: [times.year, times.month])

# Number of messages at different times

In [None]:
def seperated(df, grouper, start = 0):
    """Plot message counts per group and the time-of-day distribution per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with a ``Date`` datetime column. It is not modified.
    grouper : callable
        Receives the ``DatetimeIndex`` built from ``df.Date`` and returns the
        grouping key (e.g. ``times.dayofweek``).
    start : float, optional
        Hour at which the vertical axis of the violin plot begins. Times
        earlier than ``start`` are shown 24 hours later so that the axis
        covers ``[start, start + 24)``.
    """
    sns.set(rc={'figure.figsize':(11.7,8.27)})
    times = pd.DatetimeIndex(df.Date)
    keys = grouper(times)

    # Fractional hour of day, wrapped so that the day starts at `start`.
    # Kept in a local array: the caller's frame is left untouched.
    hour = np.asarray(times.hour + times.minute/60 + times.second/3600, dtype=float)
    hour = np.where(hour < start, hour + 24, hour)

    grouped = df.groupby(keys).size()
    width = min(max(20, grouped.size / 2.0), 200)
    grouped.plot(kind='bar', figsize=(width, 6))
    plt.show()

    fig, ax = plt.subplots(figsize=(width, 10))
    # Plot the wrapped hours (not the raw integer hour) so the data matches
    # the shifted axis limits set below.
    sns.violinplot(x=np.asarray(keys), y=hour, ax=ax)
    plt.ylim(0+start,24+start)
    plt.ylabel('hour')
    plt.show()
    
# Counts and time-of-day distributions split by day of week, month and year.
seperated(df, lambda times: times.dayofweek)
seperated(df, lambda times: times.month)
seperated(df, lambda times: times.year)

# Number of messages at different times by Chat

In [None]:
def seperatedByChat(df, minimum, grouper):
    """Stacked bar chart of message counts per group, one colour per chat.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with ``key_remote_jid`` and ``Date`` columns.
    minimum : int
        Only chats with at least this many counted messages are plotted.
    grouper : callable
        Receives the ``DatetimeIndex`` of one chat's ``Date`` column and
        returns the grouping key (or list of keys).

    Returns
    -------
    None
        A message is printed and nothing is drawn when no chat qualifies.
    """
    countsByChat = {}
    for name, frame in df.groupby('key_remote_jid'):
        times = pd.DatetimeIndex(frame.Date)
        groupedChat = frame.groupby(grouper(times)).size()
        if groupedChat.sum() >= minimum:
            countsByChat[name] = groupedChat

    if not countsByChat:
        print('No chat has at least {} messages - nothing to plot.'.format(minimum))
        return

    # One concat for all chats: the dict keys become the column names, groups
    # a chat has no messages in are filled with 0, rows are put in key order.
    chatFrame = pd.concat(countsByChat, axis=1).fillna(0).sort_index()
    chatFrame.plot(kind='bar', stacked = True, figsize=(min(max(20, chatFrame.shape[0] / 2.0), 200), 7))
    plt.show()
    
# Stacked per-chat counts for chats with at least 20000 messages,
# grouped by day of week, month, year and (year, month).
seperatedByChat(df, 20000, lambda times: times.dayofweek)
seperatedByChat(df, 20000, lambda times: times.month)
seperatedByChat(df, 20000, lambda times: times.year)
seperatedByChat(df, 20000, lambda times: [times.year, times.month])

# Every message as a point showing day and exact time

In [None]:
def dayTime(df, frac=1, startTime=None, minimum=30000):
    """Swarm plot with one point per message: calendar day vs. time of day.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with ``key_remote_jid`` and ``Date`` columns.
    frac : float, optional
        Fraction of the rows to sample before plotting.
    startTime : str or datetime-like, optional
        When given, only messages at or after this moment are plotted.
    minimum : int, optional
        Only chats with more than this many messages are included.

    Returns
    -------
    None
        A message is printed and nothing is drawn when no row qualifies.
    """
    chatSizes = df.groupby('key_remote_jid')['key_remote_jid'].transform('size')
    bigChats = df[chatSizes > minimum]
    if bigChats.empty:
        print('No chat has more than {} messages - nothing to plot.'.format(minimum))
        return

    reducedDf = bigChats.sample(frac=frac).drop_duplicates().sort_index().set_index('Date')
    times = pd.DatetimeIndex(reducedDf.index)
    reducedDf['date'] = times.date
    reducedDf['hour'] = times.hour + times.minute/60 + times.second/3600

    if startTime is not None:
        # Boolean filter instead of label slicing: slicing a DatetimeIndex
        # requires it to be sorted, which the row order does not guarantee.
        reducedDf = reducedDf[reducedDf.index >= pd.to_datetime(startTime)]

    if reducedDf.empty:
        print('No messages left after filtering - nothing to plot.')
        return

    # Very wide figure: one column of points per calendar day.
    rcParams['figure.figsize'] = 450,6

    sns.swarmplot(x='date', y='hour', data=reducedDf, hue='key_remote_jid')
    plt.ylim(0,24)
    plt.xticks(rotation='vertical')
    plt.tight_layout()
    plt.show()
    
# Plot every message (frac=1) of the largest chats, starting from '1.1.2019'.
dayTime(df, frac=1, startTime='1.1.2019')

# Line graph of messages

In [None]:
def lineGraph(df, minimum, grouper):
    """Line chart of message counts over time, one line per chat, no legend.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with ``key_remote_jid`` and ``Date`` columns.
    minimum : int
        Only chats with at least this many counted messages are plotted.
    grouper : callable
        Receives the ``DatetimeIndex`` of one chat's ``Date`` column and
        returns the grouping key (or list of keys).

    Returns
    -------
    None
        A message is printed and nothing is drawn when no chat qualifies.
    """
    countsByChat = {}
    for name, frame in df.groupby('key_remote_jid'):
        times = pd.DatetimeIndex(frame.Date)
        groupedChat = frame.groupby(grouper(times)).size()
        if groupedChat.sum() >= minimum:
            countsByChat[name] = groupedChat

    if not countsByChat:
        print('No chat has at least {} messages - nothing to plot.'.format(minimum))
        return

    # One concat for all chats: the dict keys become the column names, groups
    # a chat has no messages in are filled with 0, rows are put in key order
    # so the lines run chronologically.
    chatFrame = pd.concat(countsByChat, axis=1).fillna(0).sort_index()
    chatFrame.plot(kind='line', legend=False, figsize = (min(max(20, chatFrame.shape[0] / 10.0), 200), 5))

# Per-chat message counts for chats with at least 20000 messages,
# at monthly, daily and weekly resolution.
# NOTE(review): `DatetimeIndex.week` is deprecated/removed in newer pandas
# releases (replacement: `times.isocalendar().week`) - confirm against the
# pandas version this notebook runs on.
lineGraph(df, 20000, lambda times: [times.year, times.month])
lineGraph(df, 20000, lambda times: [times.year, times.month, times.day])
lineGraph(df, 20000, lambda times: [times.year, times.week])

# Finding a specific message

In [None]:
def dayTimeMessage(df, mask, frac=1, startTime=None):
    """Swarm plot (calendar day vs. time of day) of the messages selected by ``mask``.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with ``key_remote_jid`` and ``Date`` columns.
    mask : callable
        Receives the prepared frame (indexed by ``Date``, with added ``date``
        and ``hour`` columns) and returns a boolean selector for its rows.
    frac : float, optional
        Fraction of the rows to sample before plotting.
    startTime : str or datetime-like, optional
        When given, only messages at or after this moment are considered.

    Returns
    -------
    None
        A message is printed and nothing is drawn when no row is selected.
    """
    reducedDf = df.sample(frac=frac).drop_duplicates().sort_index().set_index('Date')

    times = pd.DatetimeIndex(reducedDf.index)
    reducedDf['date'] = times.date
    reducedDf['hour'] = times.hour + times.minute/60 + times.second/3600

    if startTime is not None:
        # Boolean filter instead of label slicing: slicing a DatetimeIndex
        # requires it to be sorted, which the row order does not guarantee.
        reducedDf = reducedDf[reducedDf.index >= pd.to_datetime(startTime)]

    reducedDf = reducedDf.loc[mask(reducedDf)]

    if reducedDf.empty:
        print('No message matches the mask - nothing to plot.')
        return

    rcParams['figure.figsize'] = 40,6

    sns.swarmplot(x='date', y='hour', data=reducedDf, hue='key_remote_jid')
    plt.ylim(0,24)
    plt.xticks(rotation='vertical')
    plt.tight_layout()
    plt.show()
    
# Row selector: True where the message text (`data` column, with missing
# values replaced by '') contains the phrase.
mask = lambda x: x.fillna('').data.str.contains('I love you')
dayTimeMessage(df, mask, frac=1)

In [None]:
def messageFrequency(df, mask, grouper, start = 0, startTime=None):
    """Plot how often the messages selected by ``mask`` occur per group.

    The first chart shows, for every group, the percentage of messages that
    match ``mask``. The second chart shows the time-of-day distribution of
    all considered messages per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with a ``Date`` datetime column. It is not modified.
    mask : callable
        Receives the working frame (``df`` plus ``date``, ``time`` and
        ``hour`` columns) and returns a boolean selector for its rows.
    grouper : callable
        Receives a ``DatetimeIndex`` and returns the grouping key.
    start : float, optional
        Hour at which the day is taken to begin; earlier times are shown
        24 hours later in the violin plot.
    startTime : str or datetime-like, optional
        When given, only messages at or after this moment are considered.
    """
    sns.set(rc={'figure.figsize':(11.7,8.27)})

    if startTime is not None:
        # Filter on the Date column: the frame is not indexed by date, so
        # label slicing would address the row index instead.
        df = df[df['Date'] >= pd.to_datetime(startTime)]

    # Computed after the filter so that `times` always lines up with `df`.
    times = pd.DatetimeIndex(df.Date)

    time = np.asarray(times.hour + times.minute/60 + times.second/3600, dtype=float)
    time = np.where(time < start, time + 24, time)

    # Work on a copy so the helper columns do not leak into the caller's frame.
    df = df.assign(date=times.date, time=time, hour=times.hour)

    target = df.loc[mask(df)]
    targetTimes = pd.DatetimeIndex(target.Date)

    total = df.groupby(grouper(times)).size()
    # Groups without a single match count as 0 % instead of disappearing.
    hits = target.groupby(grouper(targetTimes)).size().reindex(total.index, fill_value=0)
    grouped = hits / total * 100
    grouped.plot(kind='bar')
    plt.show()

    # NOTE(review): this plots all considered messages, as the original did,
    # not only the ones matching `mask` - confirm that this is intended.
    sns.violinplot(x=np.asarray(grouper(times)), y=df['time'].values)
    plt.show()
           
# Same phrase selector as in the previous cell, redefined so that this cell
# does not depend on it.
mask = lambda x: x.fillna('').data.str.contains('I love you')
            
# Share of matching messages per day of week, month, year and hour of day.
messageFrequency(df, mask, lambda times: times.dayofweek)
messageFrequency(df, mask, lambda times: times.month)
messageFrequency(df, mask, lambda times: times.year)
messageFrequency(df, mask, lambda times: times.hour)

In [None]:
def readTimeLine(df, min_wait='30 m', max_wait='25 h'):
    """Plot, over time, how long messages waited between the two receipt timestamps.

    ``receipt_server_timestamp`` and ``read_device_timestamp`` are both
    interpreted as milliseconds since the Unix epoch; the plotted value is
    their difference in hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table with the two timestamp columns named above.
    min_wait, max_wait : str or timedelta-like, optional
        Only waits strictly between these two durations are plotted.

    Returns
    -------
    None
        A message is printed and nothing is drawn when no row qualifies.
    """
    read = pd.DataFrame({
        'datesent': pd.to_datetime(df['receipt_server_timestamp'], unit='ms'),
        'dateread': pd.to_datetime(df['read_device_timestamp'], unit='ms'),
    })
    read['wait'] = read['dateread'] - read['datesent']
    read = read.dropna()

    # Timestamps in 1970 are treated as "not set" and discarded.
    earliest = pd.to_datetime('1970-12-31 23:59:59.999')
    keep = (
        (read['datesent'] > earliest)
        & (read['wait'] > pd.to_timedelta(min_wait))
        & (read['wait'] < pd.to_timedelta(max_wait))
    )
    read = read[keep]

    if read.empty:
        print('No read receipts in the requested range - nothing to plot.')
        return

    fig, ax = plt.subplots(figsize=(40, 1))
    ax.set_xlim(read['datesent'].min(), read['dateread'].max())

    # Markers only: the stem lines are disabled through linefmt='None'.
    ax.stem(read['datesent'], read['wait'] / np.timedelta64(1, 'h'), markerfmt='bx', linefmt='None')
    plt.show()
    
# Plot the wait times for the full message table.
readTimeLine(df)

In [None]:
# Load the exported call log.
callsRaw = pd.read_csv('Data/msgstore.dbcall_log.csv')

# Build a tidy frame with one row per call.
calls = pd.DataFrame()

# `timestamp` is read as epoch milliseconds, `duration` as seconds;
# `end` is the start time plus the duration.
calls['datesent'] = pd.to_datetime(callsRaw['timestamp'],unit='ms')
calls['duration'] = pd.to_timedelta(callsRaw['duration'], unit='s')
calls['end'] = calls['datesent'] + calls['duration']
calls['video'] = callsRaw['video_call']
calls['result'] = callsRaw['call_result']
calls['person'] = callsRaw.jid_row_id

# Drop calls with any missing field.
calls.dropna(inplace = True)

def callTimeLine(df):
    """Plot calls over time: marker height is the call duration in hours.

    Calls whose ``result`` equals 2 are drawn at height 0 and labelled
    'Rejected calls'; the remaining calls are split by the ``video`` flag
    into 'Videocalls' (1) and 'Audiocalls' (0).

    Parameters
    ----------
    df : pandas.DataFrame
        Call table with ``datesent``, ``duration``, ``end``, ``video`` and
        ``result`` columns.

    Returns
    -------
    None
        A message is printed and nothing is drawn for an empty table.
    """
    if df.empty:
        print('Call log is empty - nothing to plot.')
        return

    fig, ax = plt.subplots(figsize=(25, 3))

    # Ignore start times in 1970 when choosing the left edge of the axis.
    firstCall = df[df['datesent'] > pd.to_datetime('1970-12-31 23:59:59.999')]['datesent'].min()
    if not pd.isna(firstCall):
        ax.set_xlim(firstCall, df['end'].max())

    isRejected = df['result'] == 2
    # (label, rows, marker format, plot the real duration?)
    groups = [
        ('Audiocalls', df[(df['video'] == 0) & ~isRejected], 'bx', True),
        ('Videocalls', df[(df['video'] == 1) & ~isRejected], 'gx', True),
        ('Rejected calls', df[isRejected], 'rx', False),
    ]

    for label, rows, markerfmt, useDuration in groups:
        if rows.empty:
            continue
        if useDuration:
            heights = rows['duration'] / np.timedelta64(1, 'h')
        else:
            heights = np.zeros(len(rows.index))
        # Markers only: the stem lines are disabled through linefmt='None'.
        ax.stem(rows['datesent'], heights, markerfmt=markerfmt, linefmt='None', label=label)
    ax.legend()

    plt.show()
    
# Plot the timeline for the prepared call table.
callTimeLine(calls)

# Add Semantics to a DataFrame

In [None]:
def polarity(df):
    """Return a copy of ``df`` with TextBlob sentiment scores for every message.

    Parameters
    ----------
    df : pandas.DataFrame
        Message table whose ``data`` column holds the message text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with two added float columns: ``polarity`` and
        ``subjectivity``, taken from TextBlob's ``sentiment`` result for the
        message text. ``df`` itself is not modified.
    """
    def sentiment(text):
        # Missing messages have no text; score them like an empty string.
        if pd.isna(text):
            text = ''
        result = TextBlob(str(text)).sentiment
        # Keep the scores as floats: truncating them to int would turn
        # almost every value into 0.
        return float(result.polarity), float(result.subjectivity)

    semanticsFrame = df.copy()

    # One TextBlob evaluation per message yields both scores.
    scores = [sentiment(text) for text in semanticsFrame['data']]

    semanticsFrame['polarity'] = np.array([score[0] for score in scores], dtype=float)
    semanticsFrame['subjectivity'] = np.array([score[1] for score in scores], dtype=float)

    return semanticsFrame

# Copy of the message table with the sentiment columns added by polarity().
semanticsFrame = polarity(df)

# Plotting polarity against the number of messages

In [None]:
def test(aFrame, minimum):

    group = aFrame.groupby('key_remote_jid')
    Chats = []
    for name, frame in group:
        Chats.append([name, frame])

    chatFrame = pd.DataFrame()
    for Chat in Chats:

        groupedChat = Chat[1].groupby('key_remote_jid').size()
        groupedChat.rename(columns=['',Chat[0]])
        if(groupedChat.sum() >= 400):
            chatFrame = pd.concat([chatFrame, Chat[1]], axis = 0, sort=True)
            chatFrame.columns = [Chat[0] if x==0 else x for x in chatFrame.columns]
    chatFrame = chatFrame.fillna(0)
    newFrame = chatFrame.drop(chatFrame.columns.difference(['polarity', 'subjectivity', 'key_remote_jid', 'key_from_me']), 1, inplace=False).copy()
    newFrame['count'] = 1
    newFrame = newFrame.groupby(['key_remote_jid', 'key_from_me']).sum()
    newFrame['polarity'] = np.divide(newFrame['polarity'],newFrame['count'])
    newFrame['subjectivity'] = newFrame['subjectivity']/newFrame['count']
    
    sortedFrame = newFrame[newFrame['count'] > minimum].sort_values(by='polarity', ascending = False)
    sns.lmplot(x="polarity", y="count", data=sortedFrame);
    
# Plot only chat/direction pairs with more than 5000 messages.
test(semanticsFrame, 5000)

# Average polarity of sent and received messages over a time period

In [None]:
def semanticsGraph(s):
    """Plot monthly mean polarity of sent and received messages over message volume.

    Two lines show the mean ``polarity`` per (year, month) for rows with
    ``key_from_me == 1`` and ``key_from_me == 0``. Bars show the number of
    messages per month, rescaled so that the tallest bar reaches the highest
    point of the two lines.

    Parameters
    ----------
    s : pandas.DataFrame
        Message table with ``Date``, ``key_from_me`` and ``polarity`` columns.

    Returns
    -------
    None
        A message is printed and nothing is drawn for an empty table.
    """
    if s.empty:
        print('No messages - nothing to plot.')
        return

    times = pd.DatetimeIndex(s.Date)
    # Only the columns that are needed, so that the mean is never taken over
    # text or other non-numeric columns.
    monthly = pd.DataFrame({
        'year': times.year,
        'month': times.month,
        'from_me': s['key_from_me'].values,
        'polarity': s['polarity'].values,
    })
    period = ['year', 'month']

    counts = monthly.groupby(period).size()
    # Put both lines on the same list of months as the bars; months without
    # messages in one direction become gaps instead of shifting the line.
    sent = monthly[monthly['from_me'] == 1].groupby(period)['polarity'].mean().reindex(counts.index)
    received = monthly[monthly['from_me'] == 0].groupby(period)['polarity'].mean().reindex(counts.index)

    # Highest mean polarity of either direction (missing months are ignored).
    peak = pd.concat([sent, received]).max()

    positions = np.arange(len(counts))
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.plot(positions, sent.values, color='green', label='Polarity from me')
    ax.plot(positions, received.values, color='orange', label='Polarity to me')
    ax.bar(positions, (counts / counts.max() * peak).values, label='Messages')

    ax.set_xticks(positions)
    ax.set_xticklabels(['{}-{:02d}'.format(year, month) for year, month in counts.index], rotation='vertical')
    ax.legend()
    plt.show()
    
# Draw the polarity/volume chart for the sentiment-annotated message table.
semanticsGraph(semanticsFrame)