### New Email Analysis of my Gmail data, collected on the 16th of March, 2024.

In [None]:
# Import libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import mailbox

In [None]:
# Open the exported Gmail archive as an mbox mailbox.
mboxfile = 'All mail Including Spam and Trash.mbox'
# create=False: by default mailbox.mbox() silently creates an empty mailbox
# when the path does not exist, which hides a wrong path until the first
# message access fails. With create=False a missing file raises
# mailbox.NoSuchMailboxError immediately.
mbox = mailbox.mbox(mboxfile, create=False)
mbox

In [None]:
# Show the class of the opened mailbox object.
type(mbox)

In [None]:
# Print the mailbox object's string representation.
print(mbox)

In [None]:
# Print every header name carried by the first message in the mailbox.
first_message = mbox[0]
for header_name in first_message.keys():
    print(header_name)

#### Data Transformation

In [None]:
# Data Cleaning
import csv 

In [None]:
# Export only the headers needed for the analysis to 'mailbox.csv'.
# newline='' is required by the csv module for files handed to csv.writer;
# without it, platforms that translate line endings (Windows) get an empty
# row after every record. encoding='utf-8' makes the file independent of the
# locale's default encoding and matches what pandas.read_csv expects.
with open('mailbox.csv', 'w', newline='', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['subject', 'from', 'date', 'to', 'label', 'thread'])
    for message in mbox:
        # A header that is absent from a message is returned as None, which
        # csv.writer emits as an empty field.
        writer.writerow([
            message['subject'], message['from'], message['date'],
            message['to'], message['X-Gmail-Labels'], message['X-GM-THRID']
        ])

In [None]:
# Load 'mailbox.csv' (written by the export step above) into a DataFrame.
dfs = pd.read_csv('mailbox.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
dfs.head()

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory use.
dfs.info()

In [None]:
# Convert the raw 'date' strings to timezone-aware datetimes normalised to
# UTC (utc=True). Each value is parsed on its own through apply;
# errors='coerce' turns a value that cannot be parsed into NaT instead of
# raising.
# NOTE(review): presumably parsed element-wise because Date headers come in
# several formats -- confirm before replacing this with one vectorized call.
dfs['date'] = dfs['date'].apply(lambda x: pd.to_datetime(x,
errors='coerce', utc=True))

In [None]:
# Re-check the summary to see the dtype of 'date' after the conversion.
dfs.info()

In [None]:
# Count the rows whose 'date' is missing (could not be parsed).
dfs['date'].isna().sum()

In [None]:
# Collect the rows whose date is missing and preview the first few.
missingData = dfs.loc[dfs['date'].isna()]
missingData.head()

In [None]:
# Keep only the rows whose date parsed successfully.
# .copy() makes the result an independent frame: the columns assigned to it
# in later cells would otherwise be written into a filtered selection of the
# original frame and raise pandas' SettingWithCopyWarning.
dfs = dfs[dfs['date'].notna()].copy()

In [None]:
# Confirm that no missing dates remain after the filtering step.
dfs['date'].isna().sum()

In [None]:
# Save the date-filtered frame to 'gmail.csv'.
# NOTE: to_csv writes the row index as the first, unnamed column by default.
dfs.to_csv('gmail.csv')

In [None]:
# Summary of the frame after rows with missing dates were removed.
dfs.info()

In [None]:
# Preview the first ten rows.
dfs.head(10)

#### Data refactoring

In [None]:
# import regular expression
import re 

In [None]:
# Helper that pulls a bare email address out of a raw header value such as
# 'Jane Doe <jane@example.com>'.
def extract_email_ID(string):
    """Return the first email address found in *string*.

    The address is taken from the first ``<...>`` pair when one is present;
    otherwise the first whitespace-separated token containing ``'@'`` is
    used.

    Args:
        string: Raw header text. Non-string values (for example the float
            NaN that pandas produces for an empty CSV field) are accepted
            and treated as "no address".

    Returns:
        The extracted address as ``str``, or ``np.nan`` when nothing that
        looks like an address is found.
    """
    # re.findall raises TypeError on non-string input, so bail out early
    # rather than letting one missing header abort a whole column apply.
    if not isinstance(string, str):
        return np.nan

    bracketed = re.findall(r'<(.+?)>', string)
    if bracketed:
        return bracketed[0]

    # No angle brackets: fall back to the first token that contains '@'.
    for token in string.split():
        if '@' in token:
            return token
    return np.nan

In [None]:
# Replace each raw 'from' value with the address extracted from it.
dfs['from'] = dfs['from'].apply(extract_email_ID)

In [None]:
# Preview the rows after the 'from' column was reduced to addresses.
dfs.head()

In [None]:
# Address used to tell sent mail from received mail. While it is left as an
# empty string, rows are labelled 'inbox' unless the sender value is itself
# an empty string.
myemail = ''
# Overwrite the 'label' column: 'sent' when the sender equals myemail,
# 'inbox' otherwise.
dfs['label'] = dfs['from'].apply(
    lambda sender: 'inbox' if sender != myemail else 'sent'
)

In [None]:
# Preview the rows after 'label' was rewritten to 'sent' / 'inbox'.
dfs.head()

In [None]:
# Remove the 'to' column from the frame in place.
dfs.drop('to', axis=1, inplace=True)

In [None]:
# Preview the rows after the 'to' column was dropped.
dfs.head()

In [None]:
# Refactor the date column
import datetime
import pytz

In [None]:
# List of all time zones
#pytz.all_timezones

In [None]:
def refactor_timezone(x, tz_name='Africa/Lagos'):
    """Convert a timestamp to the given time zone.

    Args:
        x: An object exposing ``astimezone()``; the notebook passes the
            UTC timestamps stored in the 'date' column.
        tz_name: Name of the target zone as understood by
            ``pytz.timezone``. Defaults to ``'Africa/Lagos'``, the zone
            that was previously hard-coded.

    Returns:
        The result of ``x.astimezone(...)`` for the target zone.

    Raises:
        pytz.UnknownTimeZoneError: If *tz_name* is not a known zone name.
    """
    target_tz = pytz.timezone(tz_name)
    return x.astimezone(target_tz)

In [None]:
# Convert every timestamp in 'date' with refactor_timezone.
dfs['date'] = dfs['date'].apply(refactor_timezone)

In [None]:
# Preview the rows after the time-zone conversion of 'date'.
dfs.head()

In [None]:
# Add a 'dayofweek' column holding the weekday name of each timestamp.
dfs['dayofweek'] = dfs['date'].apply(lambda stamp: stamp.day_name())

In [None]:
# Preview the rows with the new 'dayofweek' column.
dfs.head()

In [None]:
# Store the weekday names as a pandas categorical column.
dfs['dayofweek'] = dfs['dayofweek'].astype('category')

In [None]:
# Check the summary to see 'dayofweek' reported with the category dtype.
dfs.info()

In [None]:
# Time of day as fractional hours: hour + minutes/60 + seconds/3600.
dfs['timeofday'] = dfs['date'].apply(
    lambda stamp: stamp.hour + stamp.minute / 60 + stamp.second / 3600
)

In [None]:
# Hour component of each timestamp.
dfs['hour'] = dfs['date'].apply(lambda stamp: stamp.hour)

In [None]:
# Calendar year of each timestamp, as a whole number.
dfs['year_int'] = dfs['date'].apply(lambda stamp: stamp.year)

In [None]:
# Fractional year: the calendar year plus the share of that year that has
# elapsed at the start of the timestamp's day.
# The divisor is the year's real length (365, or 366 in a leap year; the
# boolean is_leap_year adds 1), so the fraction stays in [0, 1). Dividing
# dayofyear by a fixed 365.25 pushed 31 December of a leap year past the
# start of the following year.
dfs['year'] = dfs['date'].apply(
    lambda x: x.year + (x.dayofyear - 1) / (365 + x.is_leap_year)
)

In [None]:
# Preview the rows with the derived time columns added.
dfs.head()

In [None]:
# Save the frame, including the derived columns, to 'gmail_data.csv'.
# NOTE: to_csv writes the row index as the first, unnamed column by default.
dfs.to_csv('gmail_data.csv')

In [None]:
# Use the 'date' values as the row index of the frame.
dfs.index = dfs.date

In [None]:
# The dates are now carried by the index (assigned in the preceding cell),
# so remove the duplicate 'date' column in place.
dfs.drop(columns='date', inplace=True)

In [None]:
# Preview the date-indexed frame.
dfs.head()