# Install Dependencies

In [3]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd

In [5]:
import numpy as np

# Read CSV

In [6]:
df = pd.read_csv('~/Desktop/project/emails.csv')

In [7]:
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


# Clean Data

In [8]:
df['message'].iloc[1]

"Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>\nDate: Fri, 4 May 2001 13:51:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: john.lavorato@enron.com\nSubject: Re:\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nTraveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what 

In [9]:
def split_message(message):
    parts = message.split("\n\n", 1)
    if len(parts) == 2:
        return parts
    else:
        return [parts[0], '']

x = pd.DataFrame()
x[['headers', 'body']] = df['message'].apply(split_message).apply(pd.Series)

In [10]:
x

Unnamed: 0,headers,body
0,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n
1,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.
...,...,...
517396,Message-ID: <26807948.1075842029936.JavaMail.e...,This is a trade with OIL-SPEC-HEDGE-NG (John L...
517397,Message-ID: <25835861.1075842029959.JavaMail.e...,Some of my position is with the Alberta Term b...
517398,Message-ID: <28979867.1075842029988.JavaMail.e...,2\n\n -----Original Message-----\nFrom: \tDouc...
517399,Message-ID: <22052556.1075842030013.JavaMail.e...,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...


In [11]:
header_lines = x['headers'].str.split("\n")

In [12]:
header_lines

0         [Message-ID: <18782981.1075855378110.JavaMail....
1         [Message-ID: <15464986.1075855378456.JavaMail....
2         [Message-ID: <24216240.1075855687451.JavaMail....
3         [Message-ID: <13505866.1075863688222.JavaMail....
4         [Message-ID: <30922949.1075863688243.JavaMail....
                                ...                        
517396    [Message-ID: <26807948.1075842029936.JavaMail....
517397    [Message-ID: <25835861.1075842029959.JavaMail....
517398    [Message-ID: <28979867.1075842029988.JavaMail....
517399    [Message-ID: <22052556.1075842030013.JavaMail....
517400    [Message-ID: <28618979.1075842030037.JavaMail....
Name: headers, Length: 517401, dtype: object

In [13]:
header_lines[0]

['Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>',
 'Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)',
 'From: phillip.allen@enron.com',
 'To: tim.belden@enron.com',
 'Subject: ',
 'Mime-Version: 1.0',
 'Content-Type: text/plain; charset=us-ascii',
 'Content-Transfer-Encoding: 7bit',
 'X-From: Phillip K Allen',
 'X-To: Tim Belden <Tim Belden/Enron@EnronXGate>',
 'X-cc: ',
 'X-bcc: ',
 "X-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail",
 'X-Origin: Allen-P',
 'X-FileName: pallen (Non-Privileged).pst']

In [14]:
def extract_headers(message):
    desired_keys = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version', 'Content-Type', 
                    'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 
                    'X-Origin', 'X-FileName']
    
    header_dict = {}
    for line in message:
        if ": " in line:
            key, value = line.split(": ", 1)
            if key in desired_keys:
                header_dict[key] = value

    for key in desired_keys:
        if key == "Subject" and len(value) < 2:
            header_dict[key] = None
        if key not in header_dict:
            header_dict[key] = None
            
    return pd.Series(header_dict)

headers_df = header_lines.apply(extract_headers)

In [15]:
headers_df.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf


In [16]:
headers_df.columns

Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version',
       'Content-Type', 'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc',
       'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName'],
      dtype='object')

In [17]:
headers_needed = ['Message-ID', 'Date', 'From', 'To', 'Subject']
emailMetaData = headers_df[headers_needed].copy()

In [18]:
emailMetaData

Unnamed: 0,Message-ID,Date,From,To,Subject
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello
...,...,...,...,...,...
517396,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato
517397,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges
517398,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL
517399,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate


In [19]:
x['body']

0                                 Here is our forecast\n\n 
1         Traveling to have a business meeting takes the...
2                            test successful.  way to go!!!
3         Randy,\n\n Can you send me a schedule of the s...
4                       Let's shoot for Tuesday at 11:45.  
                                ...                        
517396    This is a trade with OIL-SPEC-HEDGE-NG (John L...
517397    Some of my position is with the Alberta Term b...
517398    2\n\n -----Original Message-----\nFrom: \tDouc...
517399    Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...
517400    i think the YMCA has a class that is for peopl...
Name: body, Length: 517401, dtype: object

In [20]:
x['body'].loc[517398] # noisy data example

"2\n\n -----Original Message-----\nFrom: \tDoucet, Dawn  \nSent:\tWednesday, November 28, 2001 8:17 AM\nTo:\tZufferli, John\nSubject:\tCONFIDENTIAL\n\nMorning John,\nI'm still working on the mini-PRC for Lavo.  Sean Lalani has not yet been ranked and rumour has it that he reports to you now.  Can you confirm and send me a number.  Thanks!"

In [21]:
type(x['body'])

pandas.core.series.Series

In [22]:
import re

def clean_text(text):
    text = re.sub(r'(From:.*?\n)|(Sent:.*?\n)|(To:.*?\n)|(Subject:.*?\n)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text.strip()

cleanedBody = x['body'].apply(clean_text)

In [23]:
cleanedBody

0                                      here is our forecast
1         traveling to have a business meeting takes the...
2                                 test successful way to go
3         randy can you send me a schedule of the salary...
4                            lets shoot for tuesday at 1145
                                ...                        
517396    this is a trade with oilspechedgeng john lavor...
517397    some of my position is with the alberta term b...
517398    2 original message morning john im still worki...
517399    analyst rank stephane brodeur 1 chad clark 1 i...
517400    i think the ymca has a class that is for peopl...
Name: body, Length: 517401, dtype: object

In [24]:
emailMetaData['Body'] = cleanedBody

In [25]:
emailMetaData

Unnamed: 0,Message-ID,Date,From,To,Subject,Body
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,here is our forecast
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,traveling to have a business meeting takes the...
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,test successful way to go
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,randy can you send me a schedule of the salary...
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,lets shoot for tuesday at 1145
...,...,...,...,...,...,...
517396,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,this is a trade with oilspechedgeng john lavor...
517397,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,some of my position is with the alberta term b...
517398,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,2 original message morning john im still worki...
517399,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,analyst rank stephane brodeur 1 chad clark 1 i...


### This Dataset is Ready to be used. # SUEEE!!!!
# Preprocessing

In [26]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/vinaysinghal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vinaysinghal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/vinaysinghal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
def tokenizeAndRemoveStopWords(text):
    tokenizer = nltk.tokenize.word_tokenize
    stopWords = set(stopwords.words('english'))
    tokens = tokenizer(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token not in stopWords]
    return tokens

def processTextBatch(texts):
    if not len(texts):
        return
    textLower = texts.apply(lambda x: x.lower() if len(x))
    textTokens = textLower.apply(tokenizeAndRemoveStopWords)
    lemmatizer = WordNetLemmatizer()
    textLemmetized = textTokens.apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
    return textLemmetized


In [28]:
# Optimized lemmetization for less kernel burnout
# Body
batchSize = 560
batchNum = len(emailMetaData) // batchSize + 1

processed_tokens_body = []
for i in range(batchNum):
    start_idx = i * batchSize
    end_idx = min((i + 1) * batchSize, len(emailMetaData))
    batch_texts = emailMetaData['Body'].iloc[start_idx:end_idx]
    batch_tokens = processTextBatch(batch_texts)
    processed_tokens_body.extend(batch_tokens)

In [29]:
# Optimized lemmetization for less kernel burnout
# Subject
batchSize = 2000
batchNum = len(emailMetaData) // batchSize + 1

processed_tokens_subject = []
for i in range(batchNum):
    start_idx = i * batchSize
    end_idx = min((i + 1) * batchSize, len(emailMetaData))
    batch_texts = emailMetaData['Subject'].iloc[start_idx:end_idx]
    batch_tokens = processTextBatch(batch_texts)
    processed_tokens_subject.extend(batch_tokens)

AttributeError: 'NoneType' object has no attribute 'lower'

In [None]:
emailMetaData.columns

Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'Body'], dtype='object')

In [None]:
flat_tokens_body = [token for sublist in processed_tokens_body for token in sublist]

In [None]:
flat_tokens_subject = [token for sublist in processed_tokens_subject for token in sublist]

In [None]:
processed_tokens_body[:10]

[['forecast'],
 ['traveling',
  'business',
  'meeting',
  'take',
  'fun',
  'trip',
  'especially',
  'prepare',
  'presentation',
  'would',
  'suggest',
  'holding',
  'business',
  'plan',
  'meeting',
  'take',
  'trip',
  'without',
  'formal',
  'business',
  'meeting',
  'would',
  'even',
  'try',
  'get',
  'honest',
  'opinion',
  'whether',
  'trip',
  'even',
  'desired',
  'necessary',
  'far',
  'business',
  'meeting',
  'think',
  'would',
  'productive',
  'try',
  'stimulate',
  'discussion',
  'across',
  'different',
  'group',
  'working',
  'often',
  'presenter',
  'speaks',
  'others',
  'quiet',
  'waiting',
  'turn',
  'meeting',
  'might',
  'better',
  'held',
  'round',
  'table',
  'discussion',
  'format',
  'suggestion',
  'go',
  'austin',
  'play',
  'golf',
  'rent',
  'ski',
  'boat',
  'jet',
  'ski',
  'flying',
  'somewhere',
  'take',
  'much',
  'time'],
 ['test', 'successful', 'way', 'go'],
 ['randy',
  'send',
  'schedule',
  'salary',
  'le

In [None]:
flat_tokens_body[:10]

['forecast',
 'traveling',
 'business',
 'meeting',
 'take',
 'fun',
 'trip',
 'especially',
 'prepare',
 'presentation',
 'would',
 'suggest',
 'holding',
 'business',
 'plan',
 'meeting',
 'take',
 'trip',
 'without',
 'formal',
 'business',
 'meeting',
 'would',
 'even',
 'try',
 'get',
 'honest',
 'opinion',
 'whether',
 'trip',
 'even',
 'desired',
 'necessary',
 'far',
 'business',
 'meeting',
 'think',
 'would',
 'productive',
 'try',
 'stimulate',
 'discussion',
 'across',
 'different',
 'group',
 'working',
 'often',
 'presenter',
 'speaks',
 'others',
 'quiet',
 'waiting',
 'turn',
 'meeting',
 'might',
 'better',
 'held',
 'round',
 'table',
 'discussion',
 'format',
 'suggestion',
 'go',
 'austin',
 'play',
 'golf',
 'rent',
 'ski',
 'boat',
 'jet',
 'ski',
 'flying',
 'somewhere',
 'take',
 'much',
 'time',
 'test',
 'successful',
 'way',
 'go',
 'randy',
 'send',
 'schedule',
 'salary',
 'level',
 'everyone',
 'scheduling',
 'group',
 'plus',
 'thought',
 'change',
 'need

## Word Vectorization

In [None]:
emailText = [''.join(tokens) for tokens in flat_tokens_body]

In [None]:
emailSubject = [[''.join(tokens) for tokens in flat_tokens_subject]]

In [None]:
emailText[:10]

['forecast',
 'traveling',
 'business',
 'meeting',
 'take',
 'fun',
 'trip',
 'especially',
 'prepare',
 'presentation']

In [None]:
emailSubject[:10]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# vectorization (max_df is remove words occuring more than 80% of times to increase efficiency of vectors exlcuding words coming too frequcntly)
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', max_df=0.8)

In [None]:
# Fit: Learns the vocabulary of the corpus (all unique words or terms) and computes the inverse document frequency (IDF) for each term.
# Transform: Converts the input text data into TF-IDF vectors based on the learned vocabulary and IDF weights.
tfidf_sparse_body = tfidf_vectorizer.fit_transform(emailText)
tfidf_sparse_subject = tfidf_vectorizer.fit_transform(emailSubject)

In [None]:
tfidf_sparse_body[:10]

<10x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [None]:
# sparse array is array we created in one-hot encoding having many zeroes, optimizes spaces having zeroes (takes less space, computation)
# dense array is normal array, easier to work with but stores every value distinctly.
# tfidf_dense = tfidf_sparse.toarray()

In [None]:
emailMetaData.shape

TypeError: 'tuple' object is not callable

# Train Only: Create a small dataset of emailMetaData dataframe to provide to GPT-4o to get labelled data. This is needed for semi-supervised approach and use Label-Spreading to label the whole dataset accordingly. WHY? For better results. 


dfToLabel = emailMetaData.iloc[0:2000]