## Enron Dataset - Topic Modelling

Enron dataset is having 500k records. For efficient memory usage, I am using sample of 10k records from Enron dataset to get the relevant topics.

### Splitting the data file into smaller files

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
#!mkdir split_files

#file_num=1
full_data=pd.read_csv("C:\\Users\\nivey\\Python\\NLP\\email_classification\\emails.csv")
email_subset = full_data.sample(frac=0.02, random_state=1)
#for i in range(0,full_data.shape[0],10000):
#    split_data=full_data.iloc[i:i+10000]
#    split_data.to_csv('split_files/split_file_' + str(file_num) + '.csv',index=False) 
#    file_num+=1
#data=pd.read_csv('C:\\Users\\nivey\\Python\\NLP\\email_classification\\Practice\\split_files\\split_file_1.csv')
#data.head(2)

### Data Cleaning

Cleaning the data to process it using LDA model for getting relevant topics.

In [3]:
email_subset.head()

Unnamed: 0,file,message
186822,jones-t/all_documents/634.,Message-ID: <17820178.1075846925335.JavaMail.e...
308790,mann-k/all_documents/5690.,Message-ID: <29110382.1075845717882.JavaMail.e...
82383,dasovich-j/sent/423.,Message-ID: <6812040.1075843194135.JavaMail.ev...
227299,kaminski-v/var/63.,Message-ID: <21547648.1075856642126.JavaMail.e...
301824,mann-k/_sent_mail/3208.,Message-ID: <12684200.1075846107179.JavaMail.e...


In [4]:
def split_message_text(raw_message):
    lines=raw_message.split('\n')
    email_info={}
    keys_needed=['from','to']
    combined_msg=''
    for line in lines:
        if ':' not in line:
            combined_msg += line.strip()
            email_info['email'] = combined_msg
        else:
            pairs=line.split(':')
            keys=pairs[0].lower()
            values=pairs[1].strip()
            if keys in keys_needed:
                email_info[keys]=values
        
    return email_info

In [5]:
def get_email_content(messages):
    email_content = [split_message_text(raw_message) for raw_message in messages]
    return email_content

In [6]:
email_df = pd.DataFrame(get_email_content(email_subset.message))

In [7]:
email_df.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,"It would be nice if you could be at my dinner,..."
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,Absolutely.Good point! Can Peter start to dra...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,My apologies. My schedule melted down after w...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...","Vince,UK VAR breached the limit last week.UK t..."
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",Any problems/comments?AM ---------------------...


In [8]:
email_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10348 entries, 0 to 10347
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   from    10348 non-null  object
 1   to      9916 non-null   object
 2   email   10348 non-null  object
dtypes: object(3)
memory usage: 242.7+ KB


In [9]:
# cleaning the text by removing the punctuations and making the text lower case
email_df.email = email_df.email.apply(lambda x: pd.Series(re.sub(r'[^a-z\s]', ' ',x.lower())))

In [10]:
email_df.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,it would be nice if you could be at my dinner ...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point can peter start to dra...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,my apologies my schedule melted down after w...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached the limit last week uk t...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",any problems comments am ...


In [11]:
content_clean_tokens = email_df.email.apply(lambda x: nltk.word_tokenize(x))

In [12]:
content_clean_tokens

0        [it, would, be, nice, if, you, could, be, at, ...
1        [absolutely, good, point, can, peter, start, t...
2        [my, apologies, my, schedule, melted, down, af...
3        [vince, uk, var, breached, the, limit, last, w...
4        [any, problems, comments, am, dale, rasmussen,...
                               ...                        
10343    [attached, is, a, redline, with, the, changes,...
10344    [for, your, review, forwarded, by, andy, zippe...
10345    [hi, jerry, do, we, have, a, final, execution,...
10346    [richard, shapiro, enron, com, david, parquet,...
10347    [better, original, message, original, message,...
Name: email, Length: 10348, dtype: object

In [13]:
# More cleaning by tokeinzing, removing stop words, detecting bigrams, lemmatization

from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models.phrases import Phraser,Phrases
from nltk.stem import WordNetLemmatizer

#bigram
# input --> full_content - List(List)
bigram = Phrases(content_clean_tokens,threshold=100,min_count=10)
bigram_mod = Phraser(bigram)

# stop_words function 
# input - list of tokenized words of each content
# output - list of words without stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're','outlook','e','aa','ab','ac','yo','yr','yf','wr','u'])
def stop_words_func(text):
    return [word for word in text if word not in stop_words]

# lemmatize function
lem=WordNetLemmatizer()
def lemmatize(text):
    return [lem.lemmatize(w) for w in text]

# Lemmatize function with pos_tag
def lemmatize_pos(text):
    pos_tag = nltk.pos_tag(text)
    return [word for (word,pos) in pos_tag if pos[0] in ['J','N','V','R']]   

# data_clean function
def data_clean(content_list):
    
    content_list = [word for word in content_list if len(word) > 1]
    content_list_bigram = bigram_mod[content_list]
    content_list_nostop = stop_words_func(content_list_bigram)
    content_list_Lemma = lemmatize(content_list_nostop)
    content_list_pos = lemmatize_pos(content_list_Lemma)

    return ' '.join(content_list_pos)

In [14]:
email_df.email = email_df.email.apply(lambda x: data_clean(nltk.word_tokenize(x)))

In [15]:
email_df.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...


In [16]:
# save to pickle
email_df.to_pickle('corpus.pkl')

In [17]:
email_df.to_csv('email_df.csv',index=False)