In [2]:
import pandas as pd
from email.parser import Parser
from pathlib import Path

In [3]:
import email

In [4]:
def load_emails(root='data', pattern='*/all_documents/*'):
    """Parse the email files under ``root`` into per-column lists.

    Parameters
    ----------
    root : str or Path
        Directory that is searched recursively.
    pattern : str
        Glob pattern (relative to ``root``) selecting the email files.

    Returns
    -------
    dict[str, list]
        Keys ``id_mail, date, from, to, subject, body, x_origin`` (in that
        order), each holding one entry per parsed file. A header that is
        absent from a message is stored as ``None``.

    Notes
    -----
    ``body`` holds ``get_payload()`` as returned by the parser.
    NOTE(review): for multipart messages that is a list of sub-messages
    rather than a string -- confirm the corpus is plain-text only.
    """
    records = {'id_mail': [],
               'date': [],
               'from': [],
               'to': [],
               'subject': [],
               'body': [],
               'x_origin': [],
               }
    # Column name -> email header it is read from.
    header_for = {'id_mail': 'Message-ID',
                  'date': 'Date',
                  'from': 'From',
                  'to': 'To',
                  'subject': 'Subject',
                  'x_origin': 'X-Origin',
                  }
    for path in Path(root).rglob(pattern):
        # The glob also yields sub-directories; open() on one raises an
        # OSError and would abort the whole load, so skip non-files.
        if not path.is_file():
            continue
        with open(path, mode='r', encoding="utf-8") as file:
            raw_email = Parser().parse(file)
        for column, header in header_for.items():
            records[column].append(raw_email.get(header))
        records['body'].append(raw_email.get_payload())
    return records


data = load_emails()

In [5]:
# One row per parsed email file; columns follow the key order of `data`.
df = pd.DataFrame(data)

In [6]:
# Preview the first rows to sanity-check the extracted headers and bodies.
df.head()

Unnamed: 0,id_mail,date,from,to,subject,body,x_origin
0,<28040030.1075840228655.JavaMail.evans@thyme>,"Fri, 8 Dec 2000 07:49:00 -0800 (PST)",ralph.blakemore@enron.com,"kenneth.lay@enron.com, jeff.skilling@enron.com",2000 Chairman's Award,"Gentlemen,\n\nThank you for the letter memoria...",LAY-K
1,<30356390.1075840228680.JavaMail.evans@thyme>,"Mon, 11 Dec 2000 01:52:00 -0800 (PST)",barbara.paige@enron.com,"samantha.bryce@enron.com, adriana.cortes@enron...",Field Study Program,A description of the proposed Field Study Prog...,LAY-K
2,<4711296.1075840228704.JavaMail.evans@thyme>,"Mon, 11 Dec 2000 09:02:00 -0800 (PST)",william.ramsay@iea.org,kenneth.lay@enron.com,RE: IEA Ministerial/OECD Ministerial,Rosalee: Thank you for this good news. I wil...,LAY-K
3,<27851547.1075840228727.JavaMail.evans@thyme>,"Mon, 11 Dec 2000 02:49:00 -0800 (PST)",rosalee.fleming@enron.com,shea_dugger@i2.com,Re: Final Eagle BOD Presentation,Here it is!! Have a great day!! Stay warm.\n...,LAY-K
4,<16638165.1075840228750.JavaMail.evans@thyme>,"Mon, 11 Dec 2000 04:01:00 -0800 (PST)",nancy@newcapitolsolutions.com,kenneth.lay@enron.com,Richard's Resume,"Dear Ken,\n\nMy son, Richard, recently sold hi...",LAY-K


In [7]:
# (rows, columns): number of parsed emails x number of extracted fields.
df.shape

(1961, 7)

In [8]:
# Snapshot of the raw parse; it is written before the `date` column is
# reformatted further down, so the CSV keeps the original header strings.
# NOTE(review): the row index is written too and shows up as an unnamed first
# column when the file is read back -- pass index=False if that is unwanted.
df.to_csv('data/v1.csv')

In [9]:
# Every column is still generic `object` dtype here; `date` is unparsed text.
df.dtypes

id_mail     object
date        object
from        object
to          object
subject     object
body        object
x_origin    object
dtype: object

In [10]:
# Body of the first email, printed so line breaks render as in the message
print(df.loc[0, 'body'])

Gentlemen,

Thank you for the letter memorializing my nomination for the 2000 Chairman's 
Award.  It is a privilege to be a part of the Enron organization.  I have 
received many promotions, cash bonuses and commendations during my career but 
I consider the recognition as a nominee for the 2000 Chairman's Award to be 
the most significant acknowledgment of my performance ever received.  My 
compliments to you for making Enron one of the most successful and excellent 
companies on the globe.  I only wish that I could contribute more to Enron's 
growth and continued success.

Sincerely,

RW Blakemore
Enron Wind Energy Systems Corp.
Center For Advanced Technology 


In [11]:
# Raw Date header of the first email
print(df.loc[0, 'date'])

Fri, 8 Dec 2000 07:49:00 -0800 (PST)


In [12]:
# Employee names (X-Origin header) for the first ten emails
df['x_origin'].head(10)

0    LAY-K
1    LAY-K
2    LAY-K
3    LAY-K
4    LAY-K
5    LAY-K
6    LAY-K
7    LAY-K
8    LAY-K
9    LAY-K
Name: x_origin, dtype: object

In [13]:
# Two most frequent X-Origin values and how many emails each one has
top = (
    df['x_origin']
    .value_counts()
    .head(2)
    .rename_axis('x_origin')
    .reset_index(name='Counts')
)
top

Unnamed: 0,x_origin,Counts
0,LAY-K,1127
1,SKILLING-J,834


### Cleaning dataset

In [14]:
# Cleaning the date column
import datetime
from dateutil import parser

# Sample: parse one header-style date string and print it as
# 'DD-MM-YYYY HH:MM:SS'. The format has no %z, so the UTC offset is not shown.
x = parser.parse("Fri, 4 May 2001 13:51:00 -0700 (PDT)")
print(x.strftime("%d-%m-%Y %H:%M:%S"))

04-05-2001 13:51:00


##### - Date format transformation

In [15]:
def change_type(dates):
    """Reformat date strings as ``'DD-MM-YYYY HH:MM:SS'`` text.

    Parameters
    ----------
    dates : iterable of str
        Date strings, e.g. raw email ``Date`` headers.

    Returns
    -------
    list of str
        One reformatted string per input, in the same order. The output
        format carries no UTC offset, so the wall-clock time of each input is
        kept and its timezone information is dropped.

    Notes
    -----
    A string that is already in the output format is passed through
    unchanged. Otherwise, feeding this function its own output (for example
    by re-running the cell that overwrites ``df['date']``) would hand
    day-first text to ``dateutil``, which reads ambiguous dates month-first
    by default and would silently swap day and month.
    """
    out_format = "%d-%m-%Y %H:%M:%S"
    column = []

    for date in dates:
        try:
            parsed = datetime.datetime.strptime(date, out_format)
        except (TypeError, ValueError):
            # Not (yet) in the output format: fall back to the general parser.
            parsed = None
        if parsed is None:
            parsed = parser.parse(date)
        column.append(parsed.strftime(out_format))
    return column

# Overwrites the raw header strings in place with 'DD-MM-YYYY HH:MM:SS' text;
# the original values (including their UTC offsets) are no longer available
# from `df` after this line.
df['date'] = change_type(df['date'])
df.head(2)

Unnamed: 0,id_mail,date,from,to,subject,body,x_origin
0,<28040030.1075840228655.JavaMail.evans@thyme>,08-12-2000 07:49:00,ralph.blakemore@enron.com,"kenneth.lay@enron.com, jeff.skilling@enron.com",2000 Chairman's Award,"Gentlemen,\n\nThank you for the letter memoria...",LAY-K
1,<30356390.1075840228680.JavaMail.evans@thyme>,11-12-2000 01:52:00,barbara.paige@enron.com,"samantha.bryce@enron.com, adriana.cortes@enron...",Field Study Program,A description of the proposed Field Study Prog...,LAY-K


##### - Missing values

In [16]:
# Number of missing values per column.
df.isnull().sum()


id_mail      0
date         0
from         0
to          81
subject      0
body         0
x_origin     0
dtype: int64

In [1]:
# # Most common words in the email
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

# words = df['body']

# stop_words = set(stopwords.words('english'))
# word_tokens = word_tokenize(words)
# useful_words = [w for w in word_tokens if w not in stop_words]

# frequency = nltk.FreqDist(useful_words)

# # print(frequency.most_common(100))

In [17]:
# Stop-word removal sample; reference:
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Print NLTK's English stop-word list to see which tokens would be filtered.
print(stopwords.words('english'))
 
# example_sent = """This is a sample sentence,
#                   showing off the stop words filtration."""
 
# stop_words = set(stopwords.words('english'))
 
# word_tokens = word_tokenize(example_sent)
 
# filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
 
# filtered_sentence = []
 
# for w in word_tokens:
#     if w not in stop_words:
#         filtered_sentence.append(w)
 
# print(word_tokens)
# print(filtered_sentence)



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
def clean(text):
    """Normalise raw email text into a space-separated string of lemmas.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop NLTK's English stop words plus a few email-specific tokens.
      4. Lemmatise each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text, e.g. an email body.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if none survive).

    Notes
    -----
    NOTE(review): relies on ``re``, ``stopwords`` and ``WordNetLemmatizer``
    being imported elsewhere in the notebook -- ``re`` and
    ``WordNetLemmatizer`` imports are not visible in this section.
    """
    stop = set(stopwords.words('english'))
    stop.update(("to", "cc", "subject", "http", "from", "sent"))
    lemma = WordNetLemmatizer()

    # After this substitution only ASCII letters and spaces remain, so no
    # separate punctuation or digit filtering is needed afterwards.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = [word for word in letters_only.lower().split() if word not in stop]

    return " ".join(lemma.lemmatize(word) for word in tokens)

