### New Email Analysis for my Gmail data collected on the 2nd of November, 2021.

In [None]:
# Import libraries
import mailbox

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

In [None]:
# Put mbox file into a variable
mboxfile = 'All mail Including Spam and Trash.mbox'
# create=False: a wrong or missing path must raise mailbox.NoSuchMailboxError.
# With the default (create=True) the mailbox module would silently create a
# new, empty mbox file and the analysis would run on zero messages.
mbox = mailbox.mbox(mboxfile, create=False)
mbox

In [None]:
type(mbox)

In [None]:
print(mbox)

In [None]:
# List of available keys in the mbox file
# (the header names of the first message, one per line)
first_message = mbox[0]
for header_name in first_message.keys():
    print(header_name)

#### Data Transformation

In [None]:
# Data Cleaning
import csv 

In [None]:
# Create a CSV file with only the required attributes:
# - newline='' is what the csv module expects of the file object, otherwise
#   extra blank rows appear on platforms that translate line endings.
# - encoding='utf-8' makes the output independent of the locale default, so
#   non-ASCII subjects/names do not raise UnicodeEncodeError and the file
#   matches what pd.read_csv assumes when it is loaded back.
with open('mailbox.csv', 'w', newline='', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['subject', 'from', 'date', 'to', 'label', 'thread'])
    for message in mbox:
        # Headers that are absent from a message come back as None and are
        # written as empty fields.
        writer.writerow([
            message['subject'], message['from'], message['date'],
            message['to'], message['X-Gmail-Labels'], message['X-GM-THRID']
        ])

In [None]:
# Load the csv file
dfs = pd.read_csv('mailbox.csv')

In [None]:
dfs.head()

In [None]:
# Summary info
dfs.info()

In [None]:
# Convert date feature to a datetime format
# Every value is parsed on its own: anything that cannot be parsed becomes NaT
# (errors='coerce') and every successful result is expressed in UTC.
dfs['date'] = dfs['date'].map(
    lambda raw_date: pd.to_datetime(raw_date, errors='coerce', utc=True)
)

In [None]:
dfs.info()

In [None]:
# Checking for missing values in the date column
dfs['date'].isnull().sum()

In [None]:
missingData = dfs[dfs['date'].isnull()]
missingData.head()

In [None]:
# Removing NaN
# .copy() gives dfs its own data: the cells below assign new columns to it,
# which on a filtered selection would trigger pandas' SettingWithCopyWarning.
dfs = dfs[dfs['date'].notna()].copy()

In [None]:
dfs['date'].isnull().sum()

In [None]:
# Saving to csv file
dfs.to_csv('gmail.csv')

In [None]:
dfs.info()

In [None]:
dfs.head(10)

#### Data refactoring

In [None]:
# import regular expression
import re 

In [None]:
# let's create a function that takes an entire string from any column and
# extracts an email address:
def extract_email_ID(string):
    """Return the first e-mail address found in a raw header value.

    The address is taken from the first ``<...>`` pair if there is one;
    otherwise the first whitespace-separated token containing ``@`` is used.

    Parameters
    ----------
    string : str or missing value
        Header text such as ``'Jane Doe <jane@example.com>'``. Non-string
        input (``None``, float ``NaN`` from an empty CSV field) is treated as
        "no address".

    Returns
    -------
    str or float
        The address, or ``np.nan`` when none can be found.
    """
    # Missing headers are not strings; re.findall would raise TypeError.
    if not isinstance(string, str):
        return np.nan
    bracketed = re.findall(r'<(.+?)>', string)
    if bracketed:
        return bracketed[0]
    for token in string.split():
        if '@' in token:
            return token
    return np.nan

In [None]:
# Reduce every 'from' header to the bare address; the function is passed
# directly, no wrapping lambda is needed.
dfs['from'] = dfs['from'].apply(extract_email_ID)

In [None]:
dfs.head()

In [None]:
myemail = '' #Your email here
# Compare addresses case-insensitively and without surrounding whitespace, so
# 'Me@Gmail.com' in a header still matches 'me@gmail.com' typed here.
own_address = myemail.strip().lower()
if not own_address:
    # With the placeholder left empty nothing can match; say so instead of
    # silently producing an empty "sent" set for the later plots.
    print("myemail is empty: every message will be labelled 'inbox'")
is_own_message = dfs['from'].astype(str).str.strip().str.lower() == own_address
dfs['label'] = np.where(is_own_message, 'sent', 'inbox')

In [None]:
dfs.head()

In [None]:
# Drop the to column
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer fails with KeyError.
dfs = dfs.drop(columns='to', errors='ignore')

In [None]:
dfs.head()

In [None]:
# Refactor the date column
import datetime
import pytz

In [None]:
# List of all time zones
#pytz.all_timezones

In [None]:
def refactor_timezone(x, tz='Africa/Lagos'):
    """Express a timezone-aware timestamp in another time zone.

    Parameters
    ----------
    x : object with an ``astimezone(tzinfo)`` method
        For example a timezone-aware ``pandas.Timestamp`` or
        ``datetime.datetime``.
    tz : str
        Name of the target zone as understood by ``pytz.timezone``.
        Defaults to ``'Africa/Lagos'``, the zone this notebook reports in.

    Returns
    -------
    Whatever ``x.astimezone`` returns: the same instant in the target zone.
    """
    return x.astimezone(pytz.timezone(tz))

In [None]:
# Move every timestamp into the reporting time zone.
dfs['date'] = dfs['date'].apply(refactor_timezone)

In [None]:
dfs.head()

In [None]:
# Convert the day of the week into names of the day of the week
# (datetime accessor instead of a per-row lambda; same 'Monday'...'Sunday' strings)
dfs['dayofweek'] = dfs['date'].dt.day_name()

In [None]:
dfs.head()

In [None]:
# Convert to dayofweek to category
# Use calendar order (Monday..Sunday) for the categories rather than the
# alphabetical order astype('category') produces, so value counts, group-bys
# and plots read like a week. Only days that occur in the data become
# categories, exactly as before.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
days_present = set(dfs['dayofweek'])
dfs['dayofweek'] = pd.Categorical(
    dfs['dayofweek'],
    categories=[day for day in weekday_order if day in days_present],
    ordered=True,
)

In [None]:
dfs.info()

In [None]:
# Refactor for time of day
# Fractional hour of the day: hours + minutes/60 + seconds/3600.
date_parts = dfs['date'].dt
dfs['timeofday'] = date_parts.hour + date_parts.minute/60 + date_parts.second/3600

In [None]:
# Refactor for hour
# The cast keeps the int64 column dtype regardless of what the accessor returns.
dfs['hour'] = dfs['date'].dt.hour.astype('int64')

In [None]:
# Refactor for year integer
# The cast keeps the int64 column dtype regardless of what the accessor returns.
dfs['year_int'] = dfs['date'].dt.year.astype('int64')

In [None]:
# Refactor for year fraction
# Calendar year plus the elapsed share of a 365.25-day year.
dfs['year'] = dfs['date'].dt.year + dfs['date'].dt.dayofyear/365.25

In [None]:
dfs.head()

In [None]:
dfs.to_csv('gmail_data.csv')

In [None]:
# Set date to index
# drop=False leaves the 'date' column in place; it is removed in the next cell.
dfs = dfs.set_index('date', drop=False)

In [None]:
# Delete the date column because it's no longer relevant
# (the same values now live in the index)
dfs = dfs.drop(columns='date')

In [None]:
dfs.head()

#### Data Analysis

This is the most important part of EDA: it is where we gain insights from the data that we have. Let's answer the following questions one by one:

1. How many emails did I send during a given timeframe? 

2. At what times of the day do I send and receive emails with Gmail?

3. What is the average number of emails per day?

4. What is the average number of emails per hour?

5. Whom do I communicate with most frequently?

6. What are the most active emailing days?

7. What am I mostly emailing about?

In [None]:
# 1.How many emails did I send during a given timeframe?
# First print the earliest and the latest timestamp covered by the data.
stamp_format = '%a, %d %b %Y %I:%M %p'
for boundary in (dfs.index.min(), dfs.index.max()):
    print(boundary.strftime(stamp_format))

In [None]:
# Count sent and received emails
print(dfs['label'].value_counts())

In [None]:
# 2.At what times of the day do I send and receive emails with Gmail?
# Split the data by label so sent and received emails can be plotted separately.
sent = dfs.loc[dfs['label'] == 'sent']
received = dfs.loc[dfs['label'] == 'inbox']

In [None]:
# import required libraries for graphing
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import FixedLocator
from scipy import ndimage
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpaches

In [None]:
# Create a function that takes a dataframe as an input and creates a plot
def plot_todo_vs_year(df, ax, color='C0', s=0.5, title=''):
    """Scatter the time of day of every email against its fractional year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the numeric columns ``year`` (x axis) and ``timeofday``
        (y axis, in hours).
    ax : matplotlib.axes.Axes
        Axes to draw on.
    color : str
        Marker colour.
    s : float
        Marker size.
    title : str
        Axes title.

    Returns
    -------
    matplotlib.axes.Axes
        The ``ax`` that was passed in.
    """
    df.plot.scatter('year', 'timeofday', s=s,
                    alpha=0.6, ax=ax, color=color)
    ax.set_ylim(0, 24)
    ax.yaxis.set_major_locator(MaxNLocator(8))
    # Freeze the tick positions before attaching text to them: labels given to
    # set_yticklabels are positional, so they would no longer describe the
    # right hours if the locator later picked different ticks.
    hour_ticks = ax.get_yticks()
    ax.set_yticks(hour_ticks)
    ax.set_ylim(0, 24)  # set_yticks may widen the view to fit the ticks
    # Tick value (hour, wrapped at 24) -> 12-hour clock label such as "03 PM".
    ax.set_yticklabels([datetime.datetime.strptime(
        str(int(np.mod(ts, 24))), '%H').strftime('%I %p') for ts in hour_ticks]);

    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title)
    ax.grid(ls=':',color='k')

    return ax

In [None]:
# Plot both received and sent emails
# One panel per mailbox direction, side by side.
fig,ax = plt.subplots(nrows=1,ncols=2,figsize=(15,4))
for panel, (frame, heading) in zip(ax, ((sent, 'Sent'), (received, 'Received'))):
    plot_todo_vs_year(frame, panel, title=heading);

In [None]:
# 3. What is the average number of emails per day?
# Create a function that plots a histogram of the average number of emails per day over the years
def plot_number_perday_per_year(df,ax,label=None,dt=0.3,**plot_kwargs):
    """Histogram of the average number of emails per day over time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a numeric ``year`` column (fractional years); missing
        values are ignored.
    ax : matplotlib.axes.Axes
        Axes to draw on (only ``hist`` and ``grid`` are called).
    label : str, optional
        Legend label passed to ``ax.hist``.
    dt : float
        Bin width in years. Every email is weighted by ``1 / (dt * 365.25)``,
        so a bar's height is emails per day within that bin.
    **plot_kwargs
        Forwarded to ``ax.hist``.

    Returns
    -------
    None. Nothing is drawn when ``df`` holds no usable ``year`` values.
    """
    year = df[df['year'].notna()]['year'].values
    # max()/min() of an empty array raise ValueError; there is nothing to plot.
    if year.size == 0:
        return
    span = year.max() - year.min()
    # A time span shorter than one bin width would give bins=0, which
    # ax.hist rejects; always use at least one bin.
    bins = max(1, int(span/dt))
    weights = np.full(year.shape, 1/(dt*365.25))
    ax.hist(year,bins=bins,weights=weights,label=label,**plot_kwargs);
    ax.grid(ls=':',color='k')

In [None]:
# Create a function that plots the average number of emails per hour of the day
def plot_number_perdhour_per_year(df, ax, label=None, dt=1, smooth=False,weight_fun=None, **plot_kwargs):
    """Plot how emails are distributed over the hours of the day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide numeric ``timeofday`` (hours) and ``year`` (fractional
        years) columns; missing values are ignored.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    label : str, optional
        Legend label for the histogram or line.
    dt : float
        Bin width in hours.
    smooth : bool
        If True, draw a Gaussian-filtered, cubic-interpolated line through the
        bin centres instead of histogram bars.
    weight_fun : callable, optional
        Called as ``weight_fun(df)``; must return one weight per plotted
        ``timeofday`` value. When omitted, every email is weighted by
        ``dt / (Ty * 365.25)`` where ``Ty`` is the span of ``year``.
    **plot_kwargs
        Forwarded to ``ax.hist`` (or ``ax.plot`` when ``smooth`` is True). An
        ``orientation`` entry also selects which axis shows the hours.

    Returns
    -------
    None. Nothing is drawn when ``df`` has no usable rows.
    """
    def hour_labels(ticks):
        # Tick value (hour, wrapped at 24) -> 12-hour clock label such as "03 PM".
        return [datetime.datetime.strptime(str(int(np.mod(ts, 24))), "%H").strftime("%I %p")
                for ts in ticks]

    tod = df[df['timeofday'].notna()]['timeofday'].values
    year = df[df['year'].notna()]['year'].values
    # max()/min() of an empty array raise ValueError; this happens e.g. for a
    # weekday on which no email was sent.
    if tod.size == 0 or year.size == 0:
        return
    Ty = year.max() - year.min()
    T = tod.max() - tod.min()
    # A span shorter than one bin width would give bins=0; use at least one.
    bins = max(1, int(T / dt))
    if weight_fun is None:
        weights = 1 / (np.ones_like(tod) * Ty * 365.25 / dt)
    else:
        weights = weight_fun(df)

    if smooth:
        hst, xedges = np.histogram(tod, bins=bins, weights=weights)
        # Bin centres: left edges shifted by half a bin width.
        x = xedges[:-1] + 0.5*(xedges[1] - xedges[0])
        hst = ndimage.gaussian_filter(hst, sigma=0.75)
        # Cubic interpolation needs at least four support points; with fewer
        # bins the filtered values are drawn as they are.
        if len(x) >= 4:
            f = interp1d(x, hst, kind='cubic')
            x = np.linspace(x.min(), x.max(), 10000)
            hst = f(x)
        ax.plot(x, hst, label=label, **plot_kwargs)
    else:
        ax.hist(tod, bins=bins, weights=weights, label=label, **plot_kwargs);

    # Axis formatting applies to both drawing modes. It used to sit inside the
    # else branch, so smoothed plots got neither the grid nor the hour axis.
    ax.grid(ls=':', color='k')
    orientation = plot_kwargs.get('orientation')
    if orientation is None or orientation == 'vertical':
        ax.set_xlim(0, 24)
        ax.xaxis.set_major_locator(MaxNLocator(8))
        ax.set_xticklabels(hour_labels(ax.get_xticks()));
    elif orientation == 'horizontal':
        ax.set_ylim(0, 24)
        ax.yaxis.set_major_locator(MaxNLocator(8))
        ax.set_yticklabels(hour_labels(ax.get_yticks()));

In [None]:
# Create a class that plots the time of the day versus year for all the emails within the given timeframe
class TriplePlot:
    """Three panels on the current figure, laid out on a 6x6 grid.

    ``ax1`` (bottom left) holds the time-of-day versus year scatter, ``ax2``
    (bottom right, sharing the y axis with ``ax1``) the per-hour histogram and
    ``ax3`` (top left) the per-day histogram.
    """

    def __init__(self):
        layout = gridspec.GridSpec(6, 6)
        self.ax1 = plt.subplot(layout[2:6, :4])
        self.ax2 = plt.subplot(layout[2:6, 4:6], sharey=self.ax1)
        self.ax3 = plt.subplot(layout[:2, :4])
        # Hide the tick labels on the inner edges of the layout.
        # NOTE(review): ax3 does not share its x axis with ax1 -- confirm the
        # two year axes line up as intended.
        plt.setp(self.ax2.get_yticklabels(), visible=False)
        plt.setp(self.ax3.get_xticklabels(), visible=False)

    def plot(self, df, color='darkblue', alpha=0.8, markersize=0.5,yr_bin=0.1, hr_bin=0.5):
        """Draw ``df`` into all three panels.

        ``color`` and ``alpha`` style the histograms, ``markersize`` the
        scatter points; ``yr_bin`` and ``hr_bin`` are the bin widths handed to
        the per-day and per-hour histogram helpers.
        """
        histogram_style = dict(color=color, alpha=alpha)
        plot_todo_vs_year(df, self.ax1, color=color, s=markersize)
        plot_number_perdhour_per_year(
            df, self.ax2, dt=hr_bin, orientation='horizontal', **histogram_style)
        self.ax2.set_xlabel('Average emails per hour')
        plot_number_perday_per_year(df, self.ax3, dt=yr_bin, **histogram_style)
        self.ax3.set_ylabel('Average emails per day')

In [None]:
# Instantiate the class to plot the graph
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
plt.figure(figsize=(12,12));
tpl = TriplePlot()
# One colour per direction; the legend is built from matching patches because
# the panels themselves are drawn without labels.
legend_handles = []
for frame, colour, caption in ((received, 'C0', 'Incoming'), (sent, 'C1', 'Outgoing')):
    tpl.plot(frame, color=colour, alpha=0.5)
    legend_handles.append(mpatches.Patch(color=colour, label=caption, alpha=0.5))
plt.legend(handles=legend_handles, bbox_to_anchor=[1.45, 0.7],fontsize=14, shadow=True);

In [None]:
# 4. How many emails fall on each day of the week?
# Counts all rows of dfs (sent and received together) per 'dayofweek' value and
# draws them as a bar chart; sort=False means the bars are not ordered by count.
counts = dfs.dayofweek.value_counts(sort=False)
counts.plot(kind='bar')

In [None]:
# 5.The most active days for receiving and sending emails separately
# observed=False is stated explicitly: grouping by a categorical column without
# it relies on a default that pandas has deprecated, and it keeps weekdays with
# no emails in the result (as zero) so both series cover the same days.
sdw = sent.groupby('dayofweek', observed=False).size()/len(sent)
rdw = received.groupby('dayofweek', observed=False).size()/len(received)

# Share of each direction's emails that falls on each weekday, side by side.
df_tmp = pd.DataFrame(data={'Outgoing Email':sdw, 'Incoming Email':rdw})
df_tmp.plot(kind='bar',rot=45,figsize=(8,5),alpha=0.5)
plt.xlabel('');
plt.ylabel('Fraction of weekly emails');
plt.grid(ls=':',color='k',alpha=0.5)

In [None]:
# 6. The most active time of day for email communication
import scipy.ndimage
from scipy.interpolate import interp1d

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
plt.figure(figsize=(8, 5))
ax = plt.subplot(111)
for ct, dow in enumerate(dfs.dayofweek.cat.categories):
    # Received mail on this weekday: solid line. Each email is weighted by
    # 1/len(received), so the curves are fractions of all received mail.
    df_r = received[received['dayofweek'] == dow]
    if not df_r.empty:  # a weekday without mail has nothing to plot
        weights_r = np.ones(len(df_r))/len(received)
        plot_number_perdhour_per_year(
            df_r, ax, dt=1, smooth=True, color=f'C{ct}', alpha=0.8, lw=3, label=dow,
            weight_fun=lambda _df, w=weights_r: w)

    # Sent mail on this weekday: dashed line in the same colour.
    df_s = sent[sent['dayofweek'] == dow]
    if not df_s.empty:
        weights_s = np.ones(len(df_s)) / len(sent)
        plot_number_perdhour_per_year(
            df_s, ax, dt=1, smooth=True, color=f'C{ct}', alpha=0.8, lw=2, label=dow, ls='--',
            weight_fun=lambda _df, w=weights_s: w)

# Axis, legend and grid setup is done once, after all curves are drawn; it
# used to be repeated inside the loop for every weekday.
ax.set_ylabel('Fraction of weekly emails per hour')
ax.xaxis.set_major_locator(MaxNLocator(10))
ax.set_xticklabels([datetime.datetime.strptime(
    str(int(np.mod(ts, 24))), "%H").strftime("%I %p") for ts in ax.get_xticks()])
plt.legend(loc='upper left')
plt.grid(ls=':',color='k',alpha=0.5)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the import in the next cell finds the package.
%pip install wordcloud

In [None]:
# 7. What am I mostly emailing about?
from wordcloud import WordCloud

In [None]:
# Word cloud of the subjects of the emails I sent
text = ' '.join(map(str, sent['subject'].values))
stopwords = ['Re', 'Fwd', '3A_']
wrd = WordCloud(width=700, height=480, margin=0, collocations=False)
wrd.stopwords.update(stopwords)
# Build the cloud once, after every stopword is registered; it used to be
# regenerated inside the loop, once per stopword.
wordcloud = wrd.generate(text)

plt.figure(figsize=(25,15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)

In [None]:
# NOTE(review): df_no_arxiv is computed but never used below -- the text is
# still built from `sent`, so this cell draws the same cloud as the previous
# one. Confirm which data set was intended.
df_no_arxiv = dfs[dfs['from'] != 'no-reply@arXiv.org']
text = ' '.join(map(str, sent['subject'].values))
stopwords = ['Re', 'Fwd', '3A_']
wrd = WordCloud(width=700, height=480, margin=0, collocations=False)
wrd.stopwords.update(stopwords)
# Build the cloud once, after every stopword is registered; it used to be
# regenerated inside the loop, once per stopword.
wordcloud = wrd.generate(text)

plt.figure(figsize=(25,15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)

In [None]:
# What I am mostly receiving emails about
text = ' '.join(map(str, received['subject'].values))
stopwords = ['Re', 'Fwd', '3A_']
wrd = WordCloud(width=700, height=480, margin=0, collocations=False)
wrd.stopwords.update(stopwords)
# Build the cloud once, after every stopword is registered; it used to be
# regenerated inside the loop, once per stopword.
wordcloud = wrd.generate(text)

plt.figure(figsize=(25,15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)