In [1]:
from datetime import date
from dateutil.parser import parse
import email
import matplotlib.image as mpimg
import networkx as nx
import numpy as np
import os
import re
import sys
import os

In [2]:
import mailbox

# Constants to change
# Name of the project folder, looked up next to the current working directory.
PROJECT_PATH = 'enron'

# "<parent of cwd>/<PROJECT_PATH>/" -- every path below keeps a trailing separator.
# NOTE(review): create_sample() builds its paths from os.getcwd() directly, so the
# two only agree when the notebook runs from inside a folder named PROJECT_PATH --
# confirm.
DIRNAME = os.path.dirname(os.getcwd())+os.sep+PROJECT_PATH+os.sep
# Single mbox file written by maildir_to_mbox().
MBOX_PATH = DIRNAME+'enron.mbox'
# Root of the maildir tree walked by maildir_to_mbox().
DATA = DIRNAME+'maildir'+os.sep
# Folder of sample number 1, walked by sample_to_mbox().
SAMPLE = DIRNAME+'sample'+os.sep+'1'+os.sep
# mailbox.mbox object bound to MBOX_PATH; iterated by mbox_to_df().
MBOX = mailbox.mbox(MBOX_PATH)

In [3]:
def maildir_to_mbox():
    """Concatenate every message stored in an ``inbox`` folder into one mbox file.

    Walks ``DATA`` recursively and keeps only the directories whose last path
    component is ``inbox`` (case-insensitive). Each file found there is parsed
    with ``email.message_from_string`` and appended to ``MBOX_PATH`` with a
    Unix ``From `` line, followed by a blank separator. An existing file at
    ``MBOX_PATH`` is overwritten.

    Returns:
        The file object that was used for writing. It is already closed when
        the function returns; it is returned only to keep the original
        interface.
    """
    # Explicit encoding + errors='replace': a single stray byte in one message
    # must not abort the whole conversion, and the result must not depend on
    # the machine's locale.
    with open(MBOX_PATH, 'w', encoding='utf-8', errors='replace') as mbox:
        for (root, dirs, file_names) in os.walk(DATA):
            if root.split(os.sep)[-1].lower() != 'inbox':
                continue

            # Process each message in 'inbox'
            for file_name in file_names:
                file_path = os.path.join(root, file_name)
                # Close every message file right away: the tree holds far more
                # files than the OS allows to stay open at once.
                with open(file_path, encoding='utf-8', errors='replace') as message_file:
                    msg = email.message_from_string(message_file.read())
                mbox.write(msg.as_string(unixfrom=True) + "\n\n")

    return mbox

In [None]:
# Build the mbox file from every 'inbox' folder found under DATA.
maildir_to_mbox()

In [None]:
"""Sample creation."""
import os
import random
import shutil
import numpy


def create_sample(sample_size, number_of_sample):
    """Create random samples of the emails stored under ``./maildir``.

    The emails are numbered from 1 in ``os.walk`` order. For each sample,
    ``sample_size`` distinct numbers are drawn and the matching files are
    copied to ``./sample/<sample number>/``, each one named after its number.
    Any existing ``./sample`` folder is removed first.

    Args:
        sample_size (int): number of emails per sample. Capped at the number
            of emails available, since the draw is without replacement.
        number_of_sample (int): number of samples to create (folders are
            numbered from 1).
    """
    print(f"Creation of {number_of_sample} samples each contening {sample_size} emails.")

    # Definition of the different directories.
    dirname = os.getcwd()
    mailDir = os.path.join(dirname, 'maildir')
    sample_root = os.path.join(dirname, 'sample')

    # Counting emails.
    mails_count = sum(len(files) for _, _, files in os.walk(mailDir))
    print(f"The maildir folder contains {mails_count} emails.")

    # Cleaning the target folder. It does not exist on a first run, and
    # rmtree raises on a missing path.
    if os.path.isdir(sample_root):
        shutil.rmtree(sample_root)

    # Distinct emails cannot outnumber the emails available.
    draw_size = max(0, min(sample_size, mails_count))

    for sample in range(1, number_of_sample+1):
        print(f"Sample number {sample}.")
        sampleDir = os.path.join(sample_root, str(sample))
        os.makedirs(sampleDir, exist_ok=True)

        # Draw random emails WITHOUT replacement so that exactly draw_size
        # files are copied; a set makes the per-file membership test O(1).
        selected_ids = set(
            (numpy.random.permutation(mails_count)[:draw_size] + 1).tolist())

        id_mail = 1
        for repertory, _sub_repertory, files in os.walk(mailDir):
            for f in files:
                if id_mail in selected_ids:
                    # Copy straight to the final name: copying under the
                    # original name first could overwrite an earlier copy.
                    shutil.copy(os.path.join(repertory, f),
                                os.path.join(sampleDir, str(id_mail)))
                id_mail += 1
        print(f"Creation of the sample {sample} successfully completed.")
        print(
            f"{draw_size} random mails have been copied to target repertory."
            )

    print(
        f"OK - {number_of_sample} samples of {draw_size} emails created in "
        "target repertory.")


# Create one sample of 10000 emails.
create_sample(10000, 1)

In [None]:
def sample_to_mbox():
    """Concatenate every email found under ``SAMPLE`` into one mbox file.

    Walks ``SAMPLE`` recursively, parses each file with
    ``email.message_from_string`` and appends it to ``MBOX_PATH`` with a Unix
    ``From `` line, followed by a blank separator. An existing file at
    ``MBOX_PATH`` is overwritten.

    Returns:
        The file object that was used for writing. It is already closed when
        the function returns; it is returned only to keep the original
        interface.
    """
    # open() needs the path: MBOX is a mailbox.mbox object, not a file name.
    # Explicit encoding + errors='replace' so one stray byte cannot abort the
    # conversion and the result does not depend on the machine's locale.
    with open(MBOX_PATH, 'w', encoding='utf-8', errors='replace') as mbox:
        for (root, dirs, file_names) in os.walk(SAMPLE):
            # Process each message
            for file_name in file_names:
                file_path = os.path.join(root, file_name)
                # Close every message file as soon as it has been read.
                with open(file_path, encoding='utf-8', errors='replace') as message_file:
                    msg = email.message_from_string(message_file.read())
                mbox.write(msg.as_string(unixfrom=True) + "\n\n")

    return mbox

In [None]:
# Convert the emails stored under SAMPLE to mbox format.
sample_to_mbox()

In [None]:
def data_cleaning(df):
    """Clean the emails dataframe.

    Keeps the seven columns used by the analysis, keeps only the first email
    for each (Date, From, To) triple, and drops the emails without a date.
    The input dataframe is left untouched.

    Args:
        df (pandas.DataFrame): one row per email; must contain the columns
            'Date', 'From', 'To', 'Cc', 'Bcc', 'Subject' and 'Body'.

    Returns:
        pandas.DataFrame: the cleaned dataframe, original index labels kept.

    Raises:
        KeyError: if one of the expected columns is missing.

    """
    cols_to_keep = ['Date', 'From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']

    # A separate full-row de-duplication is unnecessary: rows that are equal on
    # every column are also equal on (Date, From, To), and both passes keep
    # the first occurrence.
    # Chaining instead of inplace=True avoids mutating a frame derived from
    # the caller's data.
    return (
        df[cols_to_keep]
        .drop_duplicates(subset=['Date', 'From', 'To'])
        .dropna(subset=['Date'])
    )

In [None]:
import pandas as pd

def _message_body(msg):
    """Return the text payload of ``msg`` flattened onto a single line."""
    if msg.is_multipart():
        # get_payload() returns a list of sub-messages for multipart mail;
        # collect the payload of every leaf part instead.
        payloads = [part.get_payload() for part in msg.walk()
                    if not part.is_multipart()]
    else:
        payloads = [msg.get_payload()]
    text = ' '.join(payload for payload in payloads if isinstance(payload, str))
    return text.replace('\n', ' ').replace('\t', ' ').strip()


def mbox_to_df():
    """Convert the mailbox to a dataframe.

    Every message of ``MBOX`` becomes one row: one column per header plus a
    'Body' column holding the payload on a single line. The frame is cleaned
    with ``data_cleaning``, 'Date' is parsed to UTC timestamps, and the result
    is written to 'emails.csv' ('|' separated, index included).

    Returns:
        pandas.DataFrame: Enron corpus dataframe

    """
    mbox_dict = {}

    for i, msg in enumerate(MBOX):
        record = {header: msg[header] for header in msg.keys()}
        record['Body'] = _message_body(msg)
        mbox_dict[i] = record

    df = pd.DataFrame.from_dict(mbox_dict, orient='index')
    df = data_cleaning(df)
    # utc=True: the messages carry different UTC offsets, and only a single
    # time zone gives a proper datetime column (same convention as the cells
    # that reload the CSV).
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    df.to_csv('emails.csv', sep='|')
    # df.to_csv('sample.csv', sep='|')

    return df

In [None]:
# Parse the mailbox into a cleaned dataframe; mbox_to_df() also writes it to 'emails.csv'.
df = mbox_to_df()

In [None]:
import pandas as pd

# Complete dataframe
# 'emails.csv' was written by mbox_to_df() with its index as first column;
# index_col=0 reads it back as the index instead of an extra 'Unnamed: 0'
# column, so the columns are the same as the ones mbox_to_df() returns.
df = pd.read_csv('emails.csv', sep='|', index_col=0)

In [6]:
import pandas as pd

# Sampled dataframe
# The first column of 'sample.csv' is the saved index (wordFromCSV reads the
# seven email fields from columns 1 to 7); index_col=0 keeps it out of the
# data columns.
df = pd.read_csv('sample.csv', sep='|', index_col=0)

In [None]:
# Parse the dates as UTC timestamps, then derive the calendar features.
utc_dates = pd.to_datetime(df['Date'], utc=True)
df['Date'] = utc_dates
df['Month'] = utc_dates.dt.month
df['Year'] = utc_dates.dt.year
# 'Day' is the day of the week (Monday=0 ... Sunday=6), not the day of the month.
df['Day'] = utc_dates.dt.dayofweek

In [None]:
import matplotlib.pyplot as plt

# Restrict the plot to 1997-2004: the yearly volume of emails really ramps up
# in 2000 and 2001.
indices = df['Year'].between(1997, 2004)
yearly_counts = df.loc[indices].groupby('Year')['Body'].count()
fig, ax = plt.subplots(figsize=(10, 6))
figure1 = yearly_counts.plot(ax=ax, title='Network messages traffic between 1997 and 2004')

In [None]:
# Zoom on 2000-2001, month by month: the volume of emails really ramps up in
# October 2001 (date of the bankruptcy).
df['month_year'] = pd.to_datetime(df['Date'], utc=True).dt.to_period('M')
indices = df['Year'].between(2000, 2001)
monthly_counts = df.loc[indices].groupby('month_year')['Body'].count()
fig, ax = plt.subplots(figsize=(10, 6))
figure2 = monthly_counts.plot(kind='bar', ax=ax, title='Network messages traffic between 2000 and 2001')

In [None]:
# Keep only the emails sent in 2000 or 2001 for the rest of the analysis.
df = df.loc[df['Year'].between(2000, 2001)]

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Create the graph: one node per address, one undirected edge per
# (sender, recipient) pair, weighted by the number of emails exchanged.
G_symmetric = nx.Graph()

# Columns are read by name: their position depends on how df was loaded
# (a CSV reloaded with its index column shifts every position by one).
for sender, recipients in zip(df['From'], df['To']):
    # Emails without sender or recipients hold NaN (a float) here.
    if not isinstance(sender, str) or not isinstance(recipients, str):
        continue
    sender = sender.strip()

    for adress in recipients.split(sep=","):
        # Recipient lists carry whitespace around the commas; without strip()
        # the same address would end up as several distinct nodes.
        adress = adress.strip()
        if not adress:
            continue
        # If the edge already exists, add 1 to its weight.
        if G_symmetric.has_edge(sender, adress):
            G_symmetric[sender][adress]['weight'] += 1
        else:
            G_symmetric.add_edge(sender, adress, weight=1)

# Fixed seed so that the layout is the same on every run.
pos = nx.spring_layout(G_symmetric, seed=42)
betCent = nx.betweenness_centrality(G_symmetric, normalized=True, endpoints=True)
# Colour encodes the degree of a node, size its betweenness centrality.
node_color = [20000.0 * G_symmetric.degree(v) for v in G_symmetric]
node_size =  [v * 10000 for v in betCent.values()]
plt.figure(figsize=(50, 50))
nx.draw_networkx(G_symmetric, pos=pos, with_labels=True,
                 node_color=node_color,
                 node_size=node_size )

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.cluster import KMeans

# English stop words plus three corpus-specific tokens to ignore.
stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com'])
# stop_words is given as a list: recent scikit-learn releases validate the
# parameter and reject a frozenset.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords))

# Keep only body message
cols_to_keep = ['Body']
df = df[cols_to_keep]

# astype(str) guarantees that the vectorizer only receives strings, even for
# missing bodies.
X = vectorizer.fit_transform(df['Body'].values.astype(str))
# get_feature_names() no longer exists in recent scikit-learn releases;
# fall back to it only on old ones.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# random_state makes the clustering reproducible from one run to the next
# (n_init=1 means a single random initialisation).
model = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
               random_state=42)
model.fit(X)

# For each cluster, term indices sorted by decreasing weight in the centroid.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

In [None]:
# Print, for each of the 3 clusters, its 100 highest-weighted terms.
for cluster_id in range(3):
    print('Cluster ', cluster_id)
    top_terms = (terms[term_index] for term_index in order_centroids[cluster_id, :100])
    for term in top_terms:
        print(term)

    print()

In [None]:
# Assign a new email to one of the clusters of the fitted model.
email_to_test = 'Enter email here'
X3 = vectorizer.transform([email_to_test])
predicted = model.predict(X3)

# Output: an empty line, the heading, then the predicted cluster.
print()
print('Prediction')
print(predicted)

In [None]:
# For all emails
X2 = vectorizer.fit_transform(df['Body'].values.astype(str))

# Number of clusters. The model and the loop below always used 3; true_k was
# set to 2 but never read, so it now holds the value really in use.
true_k = 3
# random_state makes the clustering reproducible from one run to the next.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
               random_state=42)
model.fit(X2)

# For each cluster, term indices sorted by decreasing weight in the centroid.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() no longer exists in recent scikit-learn releases;
# fall back to it only on old ones.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i in range(true_k):
    print('Cluster ', i)
    for ind in order_centroids[i, :100]:
        print(terms[ind])

    print()

In [None]:
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
import nltk
nltk.download('stopwords')
import csv

# Email fields can be far longer than the default csv field limit.
csv.field_size_limit(100000000)

# NOTE(review): column 6 is the text analysed here, while wordFromCSV() reads
# the Body of the same file from column 7 and the Subject from column 6 --
# confirm which field is intended.
TEXT_COLUMN = 6

with open('sample.csv', 'r', newline='') as csvfile :
    mail = csv.reader(csvfile, delimiter='|')
    # The first row holds the column names, not an email.
    next(mail, None)
    # Join with a space: gluing the texts together would merge the last word
    # of one message with the first word of the next one.
    tokens = ' '.join(
        message[TEXT_COLUMN] for message in mail if len(message) > TEXT_COLUMN)

# Words of at least three letters, lower-cased.
tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]{3,}")
tokenized_messages = [t.lower() for t in tokenizer.tokenize(tokens)]
stopwords = nltk.corpus.stopwords.words('english')
filtered_tokenized_messages = [word for word in tokenized_messages if word not in stopwords]
without_words=['hour', 'codesite', 'hourahead', 'date', 'request', 'access', 'start']
# Filter again from the full token list with a single set lookup per word;
# the result is the same as removing the two word lists one after the other.
excluded_words = set(stopwords).union(without_words)
filtered_tokenized_messages = [word for word in tokenized_messages if word not in excluded_words]
fdist = FreqDist(filtered_tokenized_messages)
common = fdist.most_common(100)
print(fdist)
print(common)

trigram_collocation = TrigramCollocationFinder.from_words(filtered_tokenized_messages)
# This is not the last expression of the cell, so the result has to be printed
# to be visible.
best_trigrams = trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 100)
print(best_trigrams)


import pandas_profiling
import pandas as pd


def wordFromCSV(src, word):
    """Create a dataframe of the mails containing the target word.

    The file is read as '|'-separated values in which column 0 is a row label
    and columns 1 to 7 are Date, From, To, Cc, Bcc, Subject and Body. A mail is
    kept when its body contains ``word``, ignoring case. The header row and
    the rows with fewer than eight fields are skipped.

    Args:
        src (str): path of the CSV file of mails to explore.
        word (str): target word to find.

    Returns:
        pandas.DataFrame: one row per matching mail with the columns
        Date, From, To, Cc, Bcc, Subject, Body; empty (but with these
        columns) when no mail matches.
    """
    columns = ['Date', 'From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']
    csv.field_size_limit(100000000)
    target = word.lower()
    data = []

    with open(src, 'r', newline='') as csvfile:
        for message in csv.reader(csvfile, delimiter='|'):
            # Blank or truncated lines cannot hold a body.
            if len(message) < 8:
                continue
            fields = message[1:8]
            # The header row is not a mail, even if a column name matches.
            if fields == columns:
                continue
            if target in fields[-1].lower():
                data.append(fields)

    # Built once, after the loop: also defined when nothing matched.
    return pd.DataFrame(data, columns=columns)


# Keep only the mails of 'sample.csv' whose body contains "work".
df = wordFromCSV('sample.csv', 'work')
# Report the shape of the result (message in French: "The file has N rows and M columns").
print("Le fichier a " + str(df.shape[0]) + " lignes et " + str(df.shape[1]) + " colonnes")

# Last expression of the cell: build the pandas-profiling report of these mails.
pandas_profiling.ProfileReport(df)