## Creation of a corpus based on the body of our CSV.

In [2]:
#import libraries 
import numpy as np
import pandas as pd
from pathlib import Path


In [4]:
df = pd.read_csv(r"data\v2.csv")

In [5]:
df.shape

(128103, 8)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128103 entries, 0 to 128102
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  128103 non-null  int64 
 1   id_mail     128103 non-null  object
 2   date        128103 non-null  object
 3   from        128103 non-null  object
 4   to          125357 non-null  object
 5   subject     123580 non-null  object
 6   body        128103 non-null  object
 7   x_origin    128103 non-null  object
dtypes: int64(1), object(7)
memory usage: 7.8+ MB


In [10]:
df["body"].head(10)

0    ---------------------- Forwarded by Phillip K ...
1    ---------------------- Forwarded by Phillip K ...
2    Dave, \n\n Here are the names of the west desk...
3             Paula,\n\n 35 million is fine\n\nPhillip
4    ---------------------- Forwarded by Phillip K ...
5    ---------------------- Forwarded by Phillip K ...
6    ---------------------- Forwarded by Phillip K ...
7    Brenda,\n\nPlease use the second check as the ...
8    I think Fletch has a good CPA.  I am still doi...
9    Brenda,\n\n Please use the second check as my ...
Name: body, dtype: object

In [42]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alebe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alebe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alebe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [43]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer
import re

### 1. First approach using regex and NLTK.

In [31]:
def clean(text):
    stop = set(stopwords.words('english'))
    stop.update(("to","cc","subject","http","from","sent"))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    # porter= PorterStemmer()
    
    text=text.rstrip()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    # stem = " ".join(porter.stem(token) for token in normalized.split())
    
    return normalized

In [26]:
body_cleaned = []
for body in df["body"]:
    body_cleaned.append(clean(body).split())


In [28]:
len(body_cleaned)

128103

In [32]:
#try with tokenization
body_token = []
for body in df["body"]:
    body_token.append(clean(body).split())
body_token[0]

In [118]:
#remove small words (len lower than 3)
for list in body_cleaned:
    for word in list:
        if len(word) <= 3:
            list.remove(word)


In [119]:
body_cleaned[100]

['keep',
 'getting',
 'security',
 'request',
 'cannot',
 'approve',
 'please',
 'take',
 'care',
 'phillip',
 'forwarded',
 'phillip',
 'allen',
 'arsystem',
 'enron',
 'phillip',
 'allen',
 'enron',
 'request',
 'submitted',
 'access',
 'request',
 'frank',
 'ermis',
 'enron',
 'please',
 'review',
 'upon',
 'request',
 'received',
 'email',
 'requester',
 'specified',
 'manager',
 'please',
 'click',
 'itcapps',
 'corp',
 'enron',
 'srrs',
 'approve',
 'detail',
 'email',
 'phillip',
 'allen',
 'enron',
 'approve',
 'request',
 'request',
 'create',
 'date',
 'requested',
 'frank',
 'ermis',
 'enron',
 'resource',
 'name',
 'market',
 'data',
 'telerate',
 'basic',
 'energy',
 'resource',
 'type',
 'application']

In [120]:
len(body_cleaned[0])

462

In [121]:
len(body_cleaned)

128103

In [122]:
df['body_clean'] = body_cleaned

In [125]:
df['body_clean'].head(10)

0    [forwarded, phillip, allen, richard, burchfiel...
1    [forwarded, phillip, allen, richard, burchfiel...
2    [dave, name, west, desk, member, category, ori...
3                      [paula, million, fine, phillip]
4    [forwarded, phillip, allen, enron, north, amer...
5    [forwarded, phillip, allen, george, richards, ...
6    [forwarded, phillip, allen, nancy, hall, enron...
7    [brenda, please, second, check, october, payme...
8                         [think, fletch, good, still]
9    [brenda, please, second, check, october, payme...
Name: body_clean, dtype: object

In [126]:
#calculate the frequency for each email
from nltk import FreqDist
frequency = nltk.FreqDist(body_cleaned[0])
print(frequency.most_common(15))

[('position', 33), ('deal', 11), ('project', 9), ('manager', 8), ('ability', 8), ('consolidated', 7), ('issue', 7), ('physical', 7), ('need', 7), ('requirement', 6), ('meeting', 6), ('trader', 6), ('option', 6), ('list', 5), ('screen', 5)]


In [127]:
most_common = []
for item in body_cleaned:
    frequency = nltk.FreqDist(item)
    most_common.append(frequency.most_common(15))

[[('position', 33),
  ('deal', 11),
  ('project', 9),
  ('manager', 8),
  ('ability', 8),
  ('consolidated', 7),
  ('issue', 7),
  ('physical', 7),
  ('need', 7),
  ('requirement', 6),
  ('meeting', 6),
  ('trader', 6),
  ('option', 6),
  ('list', 5),
  ('screen', 5)],
 [('position', 33),
  ('deal', 11),
  ('project', 9),
  ('manager', 8),
  ('ability', 8),
  ('consolidated', 7),
  ('issue', 7),
  ('physical', 7),
  ('need', 7),
  ('requirement', 6),
  ('meeting', 6),
  ('trader', 6),
  ('option', 6),
  ('list', 5),
  ('screen', 5)],
 [('dave', 1),
  ('name', 1),
  ('west', 1),
  ('desk', 1),
  ('member', 1),
  ('category', 1),
  ('origination', 1),
  ('side', 1),
  ('sparse', 1),
  ('phillip', 1)],
 [('paula', 1), ('million', 1), ('fine', 1), ('phillip', 1)],
 [('enron', 9),
  ('ect', 5),
  ('corp', 3),
  ('meeting', 3),
  ('phillip', 2),
  ('allen', 2),
  ('kimberly', 2),
  ('reporting', 2),
  ('resource', 2),
  ('please', 2),
  ('forwarded', 1),
  ('north', 1),
  ('america', 1),
  (

In [129]:
df['most_common_15'] = most_common

In [131]:
df.to_csv('data/v3.csv')

### 2. Second approach using word_tokenize

In [107]:
##another method-test (not used for now)
# nltk.download('punkt')
# stop_words = set(stopwords.words('english'))
# token_words = word_tokenize(df['body'][0])
# filtered_words = [w for w in token_words if not w.lower() in stop_words]
# filtered_words = [w.lower() for w in filtered_words]
# filtered_words = [w for w in filtered_words if len(w) > 3]
# filtered_words = [w for w in filtered_words if not w.isdigit()]
# filtered_words = " ".join([w for w in filtered_words if w not in set(string.punctuation)])
# filtered_words = filtered_words.split()
# for word in filtered_words:
#     if word.isdigit():
#         filtered_words.remove(word)
#     if not word.isascii():
#         filtered_words.remove(word)
# filtered_words = [re.sub(r'[^a-zA-Z]', ' ',word) for word in filtered_words]
# filtered_words = [w for w in filtered_words if w not in set(string.punctuation)]
# filtered_words.split(" ")

['forwarded',
 'phillip',
 'allen hou ect',
 '          ',
 '     ',
 'richard',
 'burchfield',
 '          ',
 '     ',
 'phillip',
 'allen hou ect',
 'beth',
 'perlman hou ect',
 'subject',
 'consolidated',
 'positions',
 'issues',
 'list',
 'phillip',
 'issues',
 'list',
 'forward',
 'documenting',
 'requirements',
 'consolidated',
 'physical financial',
 'positions',
 'transport',
 'trade',
 'capture',
 'need',
 'focus',
 'first',
 'bullet',
 'allan',
 'list',
 'need',
 'single',
 'requirements',
 'although',
 'meeting',
 'keith',
 'wednesday',
 'informative',
 'solution',
 'creating',
 'infinitely',
 'dynamic',
 'consolidated',
 'position',
 'screen',
 'extremely',
 'difficult',
 'time',
 'consuming',
 'throughout',
 'meeting',
 'wednesday',
 'keith',
 'alluded',
 'inability',
 'consensus',
 'amongst',
 'traders',
 'presentation',
 'consolidated',
 'position',
 'solution',
 'make',
 'trader',
 'arrange',
 'position',
 'screen',
 'liking',
 'much',
 'like',
 'excel',
 'needs',
 'ha