# Creating DataFrame
<br> 1. Loading RAW data
<br> 2. Converting it to DataFrame and creating relevant columns from the content
<br> 3. Deleting rows with empty values
<br> 4. Saving the DataFrame to a file so it can be reopened later without repeating the conversion


In [None]:
import email
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output

In [None]:
# Turn off pandas' chained-assignment check for the rest of the notebook
pd.set_option('mode.chained_assignment', None)

# Load the raw data; later cells read its 'message' and 'file' columns
emails_df = pd.read_csv('./emails.csv')


In [None]:
# Preview the first five rows of the raw data
emails_df.head(n=5)

In [None]:
def get_text_from_email(msg: email.message.Message) -> str:
    """Collect the plain-text body of a parsed email.

    Every part of the message is visited and the payloads of the parts whose
    content type is ``text/plain`` are concatenated in visiting order.

    :param msg: the parsed email object
    :return: the joined ``text/plain`` payloads ('' when there are none)
    """
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

def split_email_addresses(line: str) -> "frozenset[str] | None":
    """Split a comma-separated address header into individual addresses.

    Each address is stripped of surrounding whitespace. Empty tokens, as
    produced by a trailing or doubled comma, are discarded instead of being
    stored as ``''``.

    :param line: the raw header value, e.g. ``'a@x.com, b@y.com'``; may be
        ``None`` or empty when the header is missing
    :return: a frozenset of the addresses, or ``None`` when ``line`` is
        empty/``None`` or contains no address at all
    """
    if not line:
        return None
    stripped = (token.strip() for token in line.split(','))
    addrs = frozenset(addr for addr in stripped if addr)
    # Report "no address found" the same way as a missing header, so callers
    # only ever have to deal with one kind of empty result.
    return addrs or None

In [None]:

# Parse every raw message string into an email object
messages = [email.message_from_string(raw) for raw in emails_df['message']]
emails_df.drop(columns='message', inplace=True)

# Create one column per header found in the first message
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [parsed[key] for parsed in messages]

# Extract the plain-text body of every message
emails_df['content'] = [get_text_from_email(parsed) for parsed in messages]

# Turn the comma-separated sender/recipient headers into sets of addresses
emails_df['To'] = emails_df['To'].map(split_email_addresses)
emails_df['From'] = emails_df['From'].map(split_email_addresses)

# The first path component of 'file' is used as 'user'
emails_df['user'] = emails_df['file'].map(lambda path: path.partition('/')[0])

# The parsed objects are no longer needed
del messages

In [None]:
# Number of missing values in each column
emails_df.isna().sum()

In [None]:
# Show the rows whose 'To' value is missing
emails_df[emails_df['To'].isna()]

In [None]:
# Keep only the rows in which every column has a value
emails_df = emails_df.dropna(axis=0, how='any')

In [None]:
# Re-count the missing values per column after the rows with nulls were dropped
emails_df.isna().sum()


In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns)
emails_df.shape

In [None]:
# save dataframe to file
import os

# to_csv does not create missing directories, so make sure ./csv exists first
os.makedirs('./csv', exist_ok=True)
emails_df.to_csv('./csv/emails_df.csv', index=False)

In [None]:
# open dataframe from file
# NOTE(review): 'From' and 'To' held frozensets when the file was written; after
# a CSV round trip they presumably come back as plain strings — confirm before
# relying on them as sets.
emails_df_2 = pd.read_csv('./csv/emails_df.csv')

In [None]:
# Preview the first five rows of the reloaded data
emails_df_2.head(n=5)