## Enron Dataset - Topic Modelling

Enron dataset is having 500k records. For efficient memory usage, I am using sample of 10k records from Enron dataset to get the relevant topics.

### Splitting the data file into smaller files

In [1]:
# Imports
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
#!mkdir split_files

#file_num=1
full_data=pd.read_csv("C:\\Users\\nivey\\Python\\NLP\\email_classification\\emails.csv")
email_subset = full_data.sample(frac=0.02, random_state=1)
#for i in range(0,full_data.shape[0],10000):
#    split_data=full_data.iloc[i:i+10000]
#    split_data.to_csv('split_files/split_file_' + str(file_num) + '.csv',index=False) 
#    file_num+=1
#data=pd.read_csv('C:\\Users\\nivey\\Python\\NLP\\email_classification\\Practice\\split_files\\split_file_1.csv')
#data.head(2)

### Data Cleaning

Cleaning the data to process it using LDA model for getting relevant topics.

In [3]:
email_subset.head()

Unnamed: 0,file,message
186822,jones-t/all_documents/634.,Message-ID: <17820178.1075846925335.JavaMail.e...
308790,mann-k/all_documents/5690.,Message-ID: <29110382.1075845717882.JavaMail.e...
82383,dasovich-j/sent/423.,Message-ID: <6812040.1075843194135.JavaMail.ev...
227299,kaminski-v/var/63.,Message-ID: <21547648.1075856642126.JavaMail.e...
301824,mann-k/_sent_mail/3208.,Message-ID: <12684200.1075846107179.JavaMail.e...


In [4]:
# function to split email text from the message column
def split_message_text(raw_message):
    lines=raw_message.split('\n')
    email_info={}
    keys_needed=['from','to']
    combined_msg=''
    for line in lines:
        if ':' not in line:
            combined_msg += line.strip()
            email_info['email'] = combined_msg
        else:
            pairs=line.split(':')
            keys=pairs[0].lower()
            values=pairs[1].strip()
            if keys in keys_needed:
                email_info[keys]=values
        
    return email_info

In [5]:
def get_email_content(messages):
    email_content = [split_message_text(raw_message) for raw_message in messages]
    return email_content

In [6]:
# Get the required columns from the main dataset
email_df = pd.DataFrame(get_email_content(email_subset.message))

In [7]:
email_df.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,"It would be nice if you could be at my dinner,..."
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,Absolutely.Good point! Can Peter start to dra...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,My apologies. My schedule melted down after w...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...","Vince,UK VAR breached the limit last week.UK t..."
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",Any problems/comments?AM ---------------------...


In [8]:
# Checking for null values
email_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10348 entries, 0 to 10347
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   from    10348 non-null  object
 1   to      9916 non-null   object
 2   email   10348 non-null  object
dtypes: object(3)
memory usage: 242.7+ KB


In [9]:
# creating new dataframe with email text alone
df = pd.DataFrame(email_df['email'])

In [10]:
df.head()

Unnamed: 0,email
0,"It would be nice if you could be at my dinner,..."
1,Absolutely.Good point! Can Peter start to dra...
2,My apologies. My schedule melted down after w...
3,"Vince,UK VAR breached the limit last week.UK t..."
4,Any problems/comments?AM ---------------------...


In [11]:
# Data cleaning - removing punctuations and converting to lower cases
df.email = df.email.apply(lambda x: pd.Series(re.sub(r'[^a-z\s]', ' ',x.lower())))

In [12]:
df.head()

Unnamed: 0,email
0,it would be nice if you could be at my dinner ...
1,absolutely good point can peter start to dra...
2,my apologies my schedule melted down after w...
3,vince uk var breached the limit last week uk t...
4,any problems comments am ...


In [13]:
# Spacy import
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm", disable=['parser','ner'])

In [14]:
# Removing stop words and lemmatizing the tokens of the email text for each document using Spacy
def normalize(doc, lowercase, remove_stopwords):
    if lowercase:
        doc = doc.lower()
    doc = nlp(doc)
    lemmatized = list()
    for token in doc:
        lemma = token.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in STOP_WORDS):
                lemmatized.append(lemma)
    return " ".join(lemmatized)

In [15]:
df['email'] = df['email'].apply(normalize, lowercase=True, remove_stopwords=True)

In [16]:
df.head()

Unnamed: 0,email
0,nice dinner I probably win t knowanyone anytim...
1,absolutely good point peter start draft overri...
2,apology schedule melt talk monday swhere folk ...
3,vince uk var breach limit week uk trader ask r...
4,problem comment dale rasmussen ectmann corp en...


### Document Term Matrix

In [17]:
# Import Count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer(max_df=0.95,min_df=5,stop_words='english')

In [19]:
# document term matrix
dtm = cv.fit_transform(df['email'])

In [20]:
dtm

<10348x14926 sparse matrix of type '<class 'numpy.int64'>'
	with 689372 stored elements in Compressed Sparse Row format>

### Topic modelling -  LDA Model

In [21]:
from sklearn.decomposition import LatentDirichletAllocation

In [22]:
lda = LatentDirichletAllocation(n_components=12,random_state=100)

In [23]:
lda.fit(dtm)

LatentDirichletAllocation(n_components=12, random_state=100)

In [24]:
len(cv.get_feature_names())

14926

In [25]:
# grab the highest probability words from each topic
for index,topic in enumerate(lda.components_):
    print(f'The top 15 words for topic # {index} are:')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic # 0 are:
['com', 'wr', 'sunday', 'good', 'game', 'net', 'mail', 'update', 'yahoo', 'week', 'hotmail', 'aol', 'haas', 'berkeley', 'edu']


The top 15 words for topic # 1 are:
['run', 'like', 'ena', 'play', 'point', 'ut', 'product', 'allow', 'pass', 'texas', 'tx', 'year', 'team', 'game', 'way']


The top 15 words for topic # 2 are:
['rate', 'need', 'deal', 'day', 'storage', 'volume', 'point', 'contract', 'pipeline', 'capacity', 'com', 'image', 'et', 'gas', 'enron']


The top 15 words for topic # 3 are:
['day', 'deal', 'jones', 'million', 'trade', 'trading', 'transaction', 'business', 'year', 'new', 'stock', 'market', 'price', 'company', 'enron']


The top 15 words for topic # 4 are:
['look', 'day', 'good', 'deal', 'want', 'like', 'think', 'work', 'time', 'let', 'need', 'original', 'message', 'thank', 'know']


The top 15 words for topic # 5 are:
['ferc', 'commission', 'rate', 'enron', 'cost', 'plant', 'gas', 'electricity', 'price', 'utility', 'market', 'califor

In [26]:
topic_results = lda.transform(dtm)

In [27]:
# mapping the relevant topics to each document
email_df['Topic'] = topic_results.argmax(axis=1)

In [28]:
email_df.sample(5)

Unnamed: 0,from,to,email,Topic
6349,"Kitchen, Louise","Perlman, Beth; Beck, Sally; Pickering, Mark; P...",I am not sure what New Start is? Any details?-...,4
3260,christie.patrick@enron.com,Lauren Iannarone/NY/ECT@ECT,paula.rieker@enron.com...FYI.. Christie.-----...,4
1109,Milenia I Soto <misoto@juno.com>@ENRON,"Daffin, Margaret",FYIvince-----Original Message-----Here is what...,7
845,eric.bass@enron.com,"""'eric.bass@enron.com'"" <eric.bass@enron.com>","---------------------------AMEric,I'm interest...",4
4933,eddie.sera@enron.com,"rebecca.carter@enron.com, tina.spiller@enron.com,","joannie.williamson@enron.com, tracy.ramsey@enr...",6


In [29]:
# saving the output dataframe with topics to csv file for future use
email_df.to_csv('email_df.csv',index=False)