In [1]:
# Importing these packages
import pandas as pd
import random, re, email
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD

In [2]:
# Reading the data using Pandas: loads ./emails.csv (relative to the working directory)
# into a DataFrame. The rendered preview below shows two columns, `file` and `message`.
enron_data = pd.read_csv('./emails.csv')
# Bare last expression so the notebook renders a (truncated) preview of the frame
enron_data

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...


In [3]:
# Filtering only the rows whose `file` path contains 'sent', i.e. _sent_mail, sent_mail, sent, etc.
# The boolean Series returned by .str.contains is used directly as the row mask.
enron_sent = enron_data[enron_data["file"].str.contains('sent')]
print("Shape of enron_sent:",enron_sent.shape,"\n")

# Using the regular expression (re) package to extract the sender from the path:
# the first "<word>-<word>" match, e.g. "allen-p" in "allen-p/_sent_mail/1.".
# The pattern is a raw string because "\w" inside a normal literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: re.search returns None for a path without such a match, which makes
# .group() raise AttributeError.
enron_sent = (
    enron_sent
    .assign(sender=enron_sent["file"].map(lambda x: re.search(r"([\w]+)-([\w]+)", x).group()).values)
    .drop("file", axis=1)  # the path is no longer needed once the sender is extracted
)
print(enron_sent["sender"].value_counts())

Shape of enron_sent: (126846, 2) 

mann-k          8926
kaminski-v      8644
dasovich-j      5366
germany-c       5128
shackleton-s    4407
                ... 
motley-m          13
meyers-a          11
linder-e           6
phanis-s           4
merriss-s          3
Name: sender, Length: 149, dtype: int64


In [4]:
# Keep only the senders with more than 1000 sent emails
enron_sent = enron_sent.groupby('sender').filter(lambda sender_rows: len(sender_rows) > 1000)

# Number of distinct senders left = number of target classes
classes = enron_sent.groupby('sender').ngroups
print("Number of classes:",classes,"\n")

# Integer label per sender, numbered in value_counts() order (most emails first)
sender_counts = enron_sent["sender"].value_counts()
senders = sender_counts.head(classes).index.values
mapping = {name: label for label, name in enumerate(senders)}
print("Mapping:",mapping,"\n")

print("Shape of enron_sent:",enron_sent.shape,"\n")

Number of classes: 37 

Mapping: {'mann-k': 0, 'kaminski-v': 1, 'dasovich-j': 2, 'germany-c': 3, 'shackleton-s': 4, 'jones-t': 5, 'bass-e': 6, 'lenhart-m': 7, 'beck-s': 8, 'symes-k': 9, 'scott-s': 10, 'taylor-m': 11, 'love-p': 12, 'arnold-j': 13, 'perlingiere-d': 14, 'nemec-g': 15, 'fossum-d': 16, 'sanders-r': 17, 'giron-d': 18, 'lavorato-j': 19, 'kean-s': 20, 'rogers-b': 21, 'delainey-d': 22, 'mcconnell-m': 23, 'farmer-d': 24, 'allen-p': 25, 'sager-e': 26, 'rodrique-r': 27, 'steffes-j': 28, 'stclair-c': 29, 'kitchen-l': 30, 'dorland-c': 31, 'cash-m': 32, 'haedicke-m': 33, 'neal-s': 34, 'shankman-j': 35, 'blair-l': 36} 

Shape of enron_sent: (93590, 2) 



In [5]:
# Printing a random email from the data (column 0 of the row).
# random.randrange(n) draws from 0..n-1; random.randint(0, n) would also allow n,
# which is one past the last valid row position for .iloc and raises IndexError.
print(enron_sent.iloc[random.randrange(enron_sent.shape[0]), 0])

Message-ID: <33483018.1075854160977.JavaMail.evans@thyme>
Date: Tue, 11 Jan 2000 02:57:00 -0800 (PST)
From: daren.farmer@enron.com
To: buylow@wt.net
Subject: Re: Marketing Services/Scheduling
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Daren J Farmer
X-To: "buylow" <buylow@wt.net>
X-cc: 
X-bcc: 
X-Folder: \Darren_Farmer_Dec2000\Notes Folders\Sent
X-Origin: Farmer-D
X-FileName: dfarmer.nsf

Ken, 
Thanks for your input.  I will definitely take this into consideration.

Also, I want you to know that I really do appreciate all that you've done for 
me.  I have greatly benefited from your knowledge of the market and industry; 
and also from your experience of just dealing with people and issues.  It has 
been very nice having someone to discuss ideas with and to learn from (and 
argue with!).  Thanks, also, for your honesty and openness.  I really 
appreciate you telling me when you disagree, it helps me see things from a 
different v

In [6]:
# Using the email package to extract the headers and the plain-text content from the raw email text
def extract_email(raw_email):
    """Parse one raw email string into a flat dict of headers plus body text.

    Parameters
    ----------
    raw_email : str
        Raw text of a single email (header lines followed by the body).

    Returns
    -------
    dict
        One entry per header name, holding the value returned by ``msg[name]``
        (the first occurrence when a header is repeated), plus a ``"content"``
        entry with the concatenated payloads of every ``text/plain`` part.
    """
    parsed = email.message_from_string(raw_email)

    # Header name -> value; indexing the message yields the first matching header
    fields = {header: parsed[header] for header in parsed.keys()}

    # Only plain-text parts contribute to the body; other content types are skipped
    plain_parts = [
        section.get_payload()
        for section in parsed.walk()
        if section.get_content_type() == 'text/plain'
    ]
    fields["content"] = ''.join(plain_parts)
    return fields

# Parse every raw message into a header/content record and tabulate the records;
# headers missing from a given email become NaN in its row.
enron_parsed = pd.DataFrame(
    [extract_email(raw_message) for raw_message in enron_sent.message]
)
enron_parsed.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,Cc,Bcc
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,,
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,,
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,,
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",,
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,,


In [7]:
# Checking the parsed enron data: per-column non-null counts and dtypes.
# DataFrame.info() prints its summary itself, so no print()/display() wrapper is needed.
enron_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93590 entries, 0 to 93589
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Message-ID                 93590 non-null  object
 1   Date                       93590 non-null  object
 2   From                       93590 non-null  object
 3   To                         93338 non-null  object
 4   Subject                    93590 non-null  object
 5   Mime-Version               93590 non-null  object
 6   Content-Type               93590 non-null  object
 7   Content-Transfer-Encoding  93590 non-null  object
 8   X-From                     93590 non-null  object
 9   X-To                       93590 non-null  object
 10  X-cc                       93590 non-null  object
 11  X-bcc                      93590 non-null  object
 12  X-Folder                   93590 non-null  object
 13  X-Origin                   93590 non-null  object
 14  X-File

In [8]:
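# Cleaning the text: replacing every non-letter character (digits, punctuation) with a space, lower-casing, and optionally removing stopwords (nltk stopwords corpus)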
def content_to_wordlist( content, remove_stopwords=False ):
    """Normalise text into lower-case alphabetic tokens joined by single spaces.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, and the tokens are re-joined.

    Parameters
    ----------
    content : str
        Text to normalise.
    remove_stopwords : bool, default False
        When truthy, tokens present in ``stopwords.words("english")`` are dropped.
        NOTE(review): `stopwords` is not imported in any visible cell; presumably
        ``nltk.corpus.stopwords`` -- confirm, otherwise this branch raises NameError.

    Returns
    -------
    str
        The remaining tokens separated by single spaces ('' when none remain).
    """
    letters_only = re.sub("[^a-zA-Z]"," ", content)
    tokens = letters_only.lower().split()

    # Default path: nothing to filter
    if not remove_stopwords:
        return ' '.join(tokens)

    # Set membership keeps the per-token lookup cheap
    stops = set(stopwords.words("english"))
    return ' '.join(token for token in tokens if token not in stops)

# Model input text: "<Subject> <content>" per email, cleaned with content_to_wordlist.
# Vectorised string concatenation replaces a row-wise apply(axis=1) join over every row.
subject_and_body = enron_parsed['Subject'] + ' ' + enron_parsed['content']
data = pd.DataFrame(list(map(content_to_wordlist, subject_and_body)),
                    columns = ["content"])

# Adding the sender column already encoded as its integer label.
# Series.map(mapping) yields the integer labels directly; DataFrame.replace on a
# string column relies on implicit object downcasting, which recent pandas versions
# deprecate and which can leave the labels as object dtype.
# .values drops the index so rows are matched by position: enron_parsed was built
# from enron_sent row by row, so the order is the same.
data = data.assign(sender=enron_sent["sender"].map(mapping).values)
data

Unnamed: 0,content,sender
0,here is our forecast,25
1,re traveling to have a business meeting takes ...,25
2,re test test successful way to go,25
3,randy can you send me a schedule of the salary...,25
4,re hello let s shoot for tuesday at,25
...,...,...
93585,re faq legal here are my faq comments all of t...,11
93586,re candidates nedre we have interviewed four c...,11
93587,re enrononline com revised documents stanford ...,11
93588,re enrononline com revised documents mark here...,11


In [9]:
# Splitting the data into train (75%) and test (25%) sets using sklearn's train_test_split.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same after Restart Kernel -> Run All.
train_features, test_features, train_label, test_label = train_test_split(
    data.content.values,
    data.sender.values,
    test_size=0.25,
    random_state=42,
)
print('Shape of train_features:',train_features.shape)
print('Shape of test_features:',test_features.shape)
print('Shape of train_label:',train_label.shape)
print('Shape of test_label:',test_label.shape)

Shape of train_features: (70192,)
Shape of test_features: (23398,)
Shape of train_label: (70192,)
Shape of test_label: (23398,)


In [10]:
# Text vectorization with TfidfVectorizer: turns each cleaned email string into a
# TF-IDF weighted feature row. The vocabulary and IDF weights are learned from the
# training texts only; the test texts are transformed with that fitted vocabulary.
# NOTE: train_features / test_features are overwritten here, so re-run the split
# cell before re-running this one.
vectorizer = TfidfVectorizer(use_idf=True, sublinear_tf=True)
train_features, test_features = (
    vectorizer.fit_transform(train_features),
    vectorizer.transform(test_features),
)

print(
    'Shape of train_features after vectorization:',
    train_features.shape,
)
print(
    'Shape of test_features after vectorization:',
    test_features.shape,
)

Shape of train_features after vectorization: (70192, 117184)
Shape of test_features after vectorization: (23398, 117184)


In [11]:
# Using the LinearSVC class provided in the svm package of sklearn (default C).
# random_state seeds the solver's internal data shuffling so repeated runs agree.
clf = svm.LinearSVC(random_state=42)

# Training the model (wall-clock time measured around fit)
t0 = time()
clf.fit(train_features, train_label)
print ("Training time is:", round(time()-t0, 3), "seconds")

# Testing the model (wall-clock time measured around predict)
t0 = time()
pred = clf.predict(test_features)
print ("Testing time is:", round(time()-t0, 3), "seconds")

# accuracy_score is documented as (y_true, y_pred): true labels first
acc = accuracy_score(test_label, pred)
print("Accuracy is:",acc)

Training time is: 13.002 seconds

Testing time is: 0.089 seconds

Accuracy is: 0.9319599965809043


In [12]:
# Comparing LinearSVC regularisation strengths on the TF-IDF features.
# NOTE(review): each C is scored on the test split, so the best score is an
# optimistic estimate; GridSearchCV / cross_val_score (already imported) could
# select C on training folds instead.
c_values=[0.1,1,10]
for c_value in c_values:
    # random_state seeds the solver's internal data shuffling so repeated runs agree
    clf = svm.LinearSVC(C=c_value, random_state=42)
    print("When C =",c_value)
    # Training the model
    t0 = time()
    clf.fit(train_features, train_label)
    print ("Training time is:", round(time()-t0, 3), "seconds")

    # Testing the model
    t0 = time()
    pred = clf.predict(test_features)
    print ("Testing time is:", round(time()-t0, 3), "seconds")

    # accuracy_score is documented as (y_true, y_pred): true labels first
    acc = accuracy_score(test_label, pred)
    print("Accuracy is:",acc,"\n")

When C is 0.1
Training time is: 9.554 seconds
Testing time is: 0.085 seconds
Accuracy is: 0.9181126592016412 

When C is 1
Training time is: 12.885 seconds
Testing time is: 0.085 seconds
Accuracy is: 0.9319599965809043 

When C is 10
Training time is: 41.705 seconds
Testing time is: 0.104 seconds
Accuracy is: 0.9252072826737328 



In [17]:
# Reducing the TF-IDF features to 1000 components with TruncatedSVD.
# TruncatedSVD works on the sparse matrix without centering it, so this is a
# truncated SVD (latent semantic analysis) rather than a centred PCA; the
# `_pca` names and messages below are kept unchanged.
# random_state seeds the randomized solver so the components are reproducible.
tsvd = TruncatedSVD(n_components = 1000, random_state = 42)
# Components are fitted on the training features only, then applied to the test features
train_features_pca = tsvd.fit_transform(train_features)
test_features_pca = tsvd.transform(test_features)

print('Shape of train_features after PCA:',train_features_pca.shape)
print('Shape of test_features after PCA:',test_features_pca.shape)

Shape of train_features after PCA: (70192, 1000)
Shape of test_features after PCA: (23398, 1000)


In [18]:
# Comparing LinearSVC regularisation strengths on the SVD-reduced features.
# NOTE(review): each C is scored on the test split, so the best score is an
# optimistic estimate; GridSearchCV / cross_val_score (already imported) could
# select C on training folds instead.
c_values=[0.1,1,10]
for c_value in c_values:
    # random_state seeds the solver's internal data shuffling so repeated runs agree
    clf = svm.LinearSVC(C=c_value, random_state=42)
    print("When C =",c_value)
    # Training on the reduced features
    t0 = time()
    clf.fit(train_features_pca, train_label)
    print ("Training time is:", round(time()-t0, 3), "seconds")

    # Testing the model
    t0 = time()
    pred = clf.predict(test_features_pca)
    print ("Testing time is:", round(time()-t0, 3), "seconds")

    # accuracy_score is documented as (y_true, y_pred): true labels first
    acc = accuracy_score(test_label, pred)
    print("Accuracy is:",acc,"\n")

When C is 0.1
Training time is: 46.889 seconds
Testing time is: 0.097 seconds
Accuracy is: 0.8888366527053594 

When C is 1
Training time is: 60.857 seconds
Testing time is: 0.072 seconds
Accuracy is: 0.9013590905205573 

When C is 10
Training time is: 220.753 seconds
Testing time is: 0.182 seconds
Accuracy is: 0.9039234122574579 

