# Install Dependencies


In [1]:
# works on python 3.9.6
%pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd


In [3]:
import numpy as np


# Read CSV


In [4]:
df = pd.read_csv('~/Desktop/TextClassification/emails.csv')

# currently working on just first 10_000 emails dataset.
df = df.iloc[:10000]

In [5]:
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


# Clean Data


In [6]:
df['message'].iloc[1]

"Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>\nDate: Fri, 4 May 2001 13:51:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: john.lavorato@enron.com\nSubject: Re:\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nTraveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what 

In [7]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['Message-ID', 'Date', 'From', 'X-To', 'Subject', 'X-cc', 'X-bcc']
    for line in lines:
        if ':' not in line:
            message += line.strip() + ' '
            email['Body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0]
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': [email.get('Body', '') for email in emails],
        'subject': [email.get('Subject', '') for email in emails],
        'to': [email.get('X-To', '') for email in emails],
        'from': [email.get('From', '') for email in emails],
        'date': [email.get('Date', '') for email in emails],
        'cc': [email.get('X-cc', '') for email in emails],
        'bcc': [email.get('X-bcc', '') for email in emails],
    }

In [8]:
email_df = pd.DataFrame(parse_into_emails(df.message))

In [9]:
email_df.head()


Unnamed: 0,body,subject,to,from,date,cc,bcc
0,Here is our forecast,,Tim Belden <Tim Belden/Enron@EnronXGate>,phillip.allen@enron.com,"Mon, 14 May 2001 16",,
1,Traveling to have a business meeting takes th...,Re,John J Lavorato <John J Lavorato/ENRON@enronXg...,phillip.allen@enron.com,"Fri, 4 May 2001 13",,
2,test successful. way to go!!!,Re,Leah Van Arsdall,phillip.allen@enron.com,"Wed, 18 Oct 2000 03",,
3,"Randy, Can you send me a schedule of the sal...",,Randall L Gay,phillip.allen@enron.com,"Mon, 23 Oct 2000 06",,
4,,Re,Greg Piper,phillip.allen@enron.com,"Thu, 31 Aug 2000 05",,


In [10]:

import re
def clean_text(text, lenNeeded: int):
    text = re.sub(r'(From:.*?\n)|(Sent:.*?\n)|(To:.*?\n)|(Subject:.*?\n)',
                  '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    if len(text) < lenNeeded:
        return ''
    return text.strip()


In [11]:
cleanedBody = email_df['body'].apply(lambda x: clean_text(x, 10))

In [12]:
cleanedSubject = email_df['subject'].apply(lambda x: clean_text(x, 5))

In [13]:
email_df['body'] = cleanedBody
email_df['subject'] = cleanedSubject

In [14]:
# remove empty rows
email_df = email_df[(email_df['body'] != '') & (email_df['to'] != '') & (email_df['from'] != '')]

In [15]:
# final dataframe
email_df.head()

Unnamed: 0,body,subject,to,from,date,cc,bcc
0,here is our forecast,,Tim Belden <Tim Belden/Enron@EnronXGate>,phillip.allen@enron.com,"Mon, 14 May 2001 16",,
1,traveling to have a business meeting takes the...,,John J Lavorato <John J Lavorato/ENRON@enronXg...,phillip.allen@enron.com,"Fri, 4 May 2001 13",,
2,test successful way to go,,Leah Van Arsdall,phillip.allen@enron.com,"Wed, 18 Oct 2000 03",,
3,randy can you send me a schedule of the salary...,,Randall L Gay,phillip.allen@enron.com,"Mon, 23 Oct 2000 06",,
5,greg how about either next tuesday or thursday...,,Greg Piper,phillip.allen@enron.com,"Thu, 31 Aug 2000 04",,


### This Dataset is Ready to be used. # SUEEE!!!!

# Preprocessing


## Word Vectorization


In [118]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
# vectorization (max_df is remove words occuring more than 80% of times to increase efficiency of vectors exlcuding words coming too frequcntly)
tfidf_vectorizer_body = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=2, max_features=500)


In [132]:
# Fit: Learns the vocabulary of the corpus (all unique words or terms) and computes the inverse document frequency (IDF) for each term.
# Transform: Converts the input text data into TF-IDF vectors based on the learned vocabulary and IDF weights.
tfidf_sparse_body = tfidf_vectorizer_body.fit_transform(email_df['body'] + email_df['subject'])


In [133]:
# sparse array is array we created in one-hot encoding having many zeroes, optimizes spaces having zeroes (takes less space, computation)
# dense array is normal array, easier to work with but stores every value distinctly.
tfidf_dense_body = tfidf_sparse_body.toarray()
tfidf_dense_body = [x for x in tfidf_dense_body if 1 in x]


In [134]:
tfidf_dense_body

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

# Extact Features

In [135]:
def top_feats_in_doc(X, features, row_id, top_n=30):
    row = np.squeeze(X[row_id])
    if top_n > len(row):
        top_n = len(row)
    top_indices = np.argsort(row)[-top_n:][::-1]
    
    top_feats = [(features[i], row[i]) for i in top_indices]
    return pd.DataFrame(top_feats, columns=['features', 'score'])

In [136]:
features = tfidf_vectorizer_body.get_feature_names_out()
topFeatures = top_feats_in_doc(pd.Series(tfidf_dense_body), features, 1, 30)

In [137]:
topFeatures

Unnamed: 0,features,score
0,coming,1.0
1,youre,0.0
2,forward,0.0
3,fax,0.0
4,feedback,0.0
5,feel,0.0
6,ferc,0.0
7,file,0.0
8,final,0.0
9,financial,0.0


In [141]:
def top_tfidf_feats(tfidf_means, features, top_n):
    # Get the indices of the top N features
    top_indices = np.argsort(tfidf_means)[-top_n:][::-1]
    
    # Extract the top features and their corresponding scores
    top_feats = [(features[i], tfidf_means[i]) for i in top_indices]
    
    # Return as a DataFrame
    return pd.DataFrame(top_feats, columns=['features', 'score'])


def top_mean_feats(X, features,
 grp_ids=None, min_tfidf=0.1, top_n=30):
    if grp_ids:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()    
        D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [142]:
top_features = top_mean_feats(X = tfidf_sparse_body, features = features, top_n=30)

In [143]:
top_features

Unnamed: 0,features,score
0,phillip,0.040378
1,message,0.038072
2,john,0.035193
3,original,0.034225
4,thanks,0.028673
5,enron,0.028064
6,email,0.02597
7,gas,0.024555
8,know,0.021059
9,need,0.019445
