## Notebook Imports

In [2]:
from os import walk
from os.path import join

import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup 
from wordcloud import WordCloud
from PIL import Image
import numpy as np

from sklearn.model_selection import train_test_split

%matplotlib inline

ModuleNotFoundError: No module named 'wordcloud'

## Constants

In [None]:
# Single sample email used for the file-reading walkthrough.
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

# Directories that are walked to load the spam and non-spam (ham) emails.
SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'
EASY_NONSPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
EASY_NONSPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

# Class labels stored in the CATEGORY column.
SPAM_CAT = 1
HAM_CAT = 0
# Number of most frequent stemmed words kept in the vocabulary.
VOCAB_SIZE = 2500

# Target of data.to_json: the combined email DataFrame.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'
# NOTE(review): this path is written with vocab.to_csv, so the file content is
# CSV even though the name ends in .json -- confirm the intended extension.
WORD_ID_FILE = 'SpamData/01_Processing/word-by-id.json'

# Targets of np.savetxt: the grouped sparse-matrix rows of each split.
TRAINING_DATA_FILE = 'SpamData/02_Training/train_data.txt'
TESTING_DATA_FILE = 'SpamData/02_Training/test_data.txt'

# Mask images and the font used when drawing the word clouds.
WHALE_FILE = 'SpamData/01_Processing/wordcloud_resources/whale-icon.png'
SKULL_FILE = 'SpamData/01_Processing/wordcloud_resources/skull-icon.png'
THUMBS_UP_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-up.png'
THUMBS_DOWN_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-down.png'
CUSTOM_FONT_FILE = 'SpamData/01_Processing/wordcloud_resources/OpenSansCondensed-bold.ttf'

## Reading Files

In [None]:
# Read the whole sample email. The context manager closes the file even when
# reading raises, which the manual open()/close() pair did not guarantee.
with open(EXAMPLE_FILE, encoding='latin-1') as stream:
    message = stream.read()

print(type(message))
print(message)

In [None]:
import sys
sys.getfilesystemencoding()

In [None]:
# Keep only the email body: every line that follows the first blank line.
is_body = False
lines = []

# The context manager closes the file even if iteration raises.
with open(EXAMPLE_FILE, encoding='latin-1') as stream:
    for line in stream:
        if is_body:
            lines.append(line)
        elif line == '\n':
            # the first empty line separates the headers from the body
            is_body = True

# Each collected line still ends with its own newline, so joining with '\n'
# leaves an empty line between consecutive body lines.
email_body = '\n'.join(lines)
print(email_body)

# Generator Functions

In [None]:
def generate_squares(N):
    """Lazily yield the squares of 0, 1, ..., N - 1 in ascending order."""
    yield from (value ** 2 for value in range(N))

In [None]:
for i in generate_squares(3):
    print(i, end= ' ->')
    

## Email Body Extraction

In [None]:
def email_body_generator(path):
    """
    Yield (file_name, email_body) for every file found below path.

    path: directory that is walked recursively.

    The body is everything after the first empty line of the file. Files are
    decoded as latin-1. Each body line keeps its own trailing newline and the
    lines are joined with '\n', so the returned text has an empty line between
    consecutive body lines. A file without an empty line yields an empty body.
    """
    for root, _dirnames, filenames in walk(path):
        for file_name in filenames:
            filepath = join(root, file_name)

            is_body = False
            lines = []

            # The context manager closes the file even if reading raises.
            with open(filepath, encoding='latin-1') as stream:
                for line in stream:
                    if is_body:
                        lines.append(line)
                    elif line == '\n':
                        # the first empty line separates headers from body
                        is_body = True

            email_body = '\n'.join(lines)

            yield file_name, email_body


In [None]:
def df_from_directory(path, classification):
    """
    Build a DataFrame with one row per email found below path.

    path: directory handed to email_body_generator.
    classification: value written to the CATEGORY column of every row.

    The frame has the columns MESSAGE and CATEGORY and is indexed by file name.
    """
    extracted = list(email_body_generator(path))

    records = [{'MESSAGE': body, 'CATEGORY': classification} for _, body in extracted]
    file_names = [name for name, _ in extracted]

    return pd.DataFrame(records, index=file_names)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way. SPAM_CAT replaces the literal label 1.
spam_emails = pd.concat([
    df_from_directory(SPAM_1_PATH, SPAM_CAT),
    df_from_directory(SPAM_2_PATH, SPAM_CAT),
])
spam_emails.head()

In [None]:
spam_emails.shape

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
ham_emails = pd.concat([
    df_from_directory(EASY_NONSPAM_1_PATH, HAM_CAT),
    df_from_directory(EASY_NONSPAM_2_PATH, HAM_CAT),
])
ham_emails.shape

In [None]:
data = pd.concat([spam_emails, ham_emails])
print(data.shape)
data.tail()

## Data Cleaning: Checking For Missing Values

In [None]:
# check if any message bodies are null
data.MESSAGE.isnull().values.any()

In [None]:
# check if there are empty emails (string length zero)
(data.MESSAGE.str.len() == 0).any()

In [None]:
(data.MESSAGE.str.len() == 0).sum()

### Locate empty emails

In [None]:
type(data.MESSAGE.str.len() == 0)

In [None]:
data[data.MESSAGE.str.len() == 0].index

## Remove system file entries from Dataframe

In [None]:
# The DataFrame index holds the source file names, so this drops every row
# whose file name is 'cmds'.
# NOTE(review): presumably these are the empty, non-email entries located in
# the cells above -- confirm. Re-running the cell raises KeyError once the
# label is gone.
data = data.drop('cmds')
print(data.shape)

## Add document IDs to track emails in dataset

In [None]:
document_ids = range(0, len(data.index))
data['DOC_ID'] = document_ids

In [None]:
data['FILE_NAME'] = data.index
data.set_index('DOC_ID', inplace = True)
data.tail()

# Save to file using pandas

In [None]:
data.to_json(DATA_JSON_FILE)

# Number of spam messages visualized (Pie charts)

In [None]:
data.CATEGORY.value_counts()

In [None]:
# Count once and look the classes up by their named labels instead of the
# magic numbers 1 and 0.
category_counts = data.CATEGORY.value_counts()
amount_of_spam = category_counts[SPAM_CAT]
amount_of_ham = category_counts[HAM_CAT]


In [None]:
category_names = ['Spam', 'Legit Mail']
sizes = [amount_of_spam, amount_of_ham]

plt.figure(figsize= (2,2), dpi= 227)
plt.pie(sizes, labels = category_names, textprops= {'fontsize': 6}, startangle= 0, autopct = '%1.0f%%')
plt.show()

In [None]:
category_names = ['Spam', 'Legit Mail']
sizes = [amount_of_spam, amount_of_ham]
custom_colors = ['blue', 'red']

plt.figure(figsize= (2,2), dpi= 227)
plt.pie(sizes, labels = category_names, textprops= {'fontsize': 6}, startangle= 0, 
        autopct = '%1.0f%%', colors = custom_colors, explode = (0, 0.1))
plt.show()

In [None]:
category_names = ['Spam', 'Legit Mail']
sizes = [amount_of_spam, amount_of_ham]
custom_colors = ['blue', 'red']

plt.figure(figsize= (2,2), dpi= 227)
plt.pie(sizes, labels = category_names, textprops= {'fontsize': 6}, startangle= 0, 
        autopct = '%1.0f%%', colors = custom_colors, pctdistance = 0.8)
centre_circle = plt.Circle((0, 0), radius = 0.6, fc = 'white')
plt.gca().add_artist(centre_circle)
plt.show()

In [None]:
category_names = ['Spam', 'Legit Mail', 'updates', 'promotions']
sizes = [25, 32, 19, 12]
custom_colors = ['blue', 'red', 'green', 'yellow']
offset = [0.05, 0.05, 0.05, 0.05]

plt.figure(figsize= (2,2), dpi= 227)
plt.pie(sizes, labels = category_names, textprops= {'fontsize': 6}, startangle= 0, 
        autopct = '%1.0f%%', colors = custom_colors, pctdistance = 0.8, explode = offset)
centre_circle = plt.Circle((0, 0), radius = 0.6, fc = 'white')
plt.gca().add_artist(centre_circle)
plt.show()

## Natural Language Processing

### Text Pre-Processing

In [None]:
msg = 'John is a good boy.'
msg.lower()

### Download the NLTK Resources (Tokenizer and Stop Words)

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

In [None]:
nltk.download('gutenberg')
nltk.download('shakespeare')

### Tokenising

In [None]:
msg = 'THE John is a good boy.'
word_tokenize(msg.lower())

### Removing stop words

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
if 'this' in stop_words: print('found it')

In [None]:
msg = 'THE John is a good boy. to be not to be'
words = word_tokenize(msg.lower())

# keep only the tokens that are not English stop words
filtered_words = [token for token in words if token not in stop_words]

print(filtered_words)

## Word stems and stemming

In [None]:
msg = 'THE John is a good boy. to be not to be! Nobody expects the Spanish inquisition.'
words = word_tokenize(msg.lower())
stemmer = PorterStemmer()

# drop the stop words and reduce every remaining token to its stem
filtered_words = [stemmer.stem(token) for token in words if token not in stop_words]

print(filtered_words)

## Removing Punctuation

In [None]:
msg = 'THE John is a good boy. to be not to be! Nobody expects the Spanish inquisition.'
words = word_tokenize(msg.lower())
stemmer = PorterStemmer()

# isalpha() rejects punctuation tokens; stop words are dropped as before
filtered_words = [
    stemmer.stem(token)
    for token in words
    if token not in stop_words and token.isalpha()
]

print(filtered_words)

## Removing HTML Tags from Emails

In [None]:
# Parse one email body as HTML and pretty-print the parsed markup. The former
# first statement only evaluated another row and discarded the result.
soup = BeautifulSoup(data.at[214, 'MESSAGE'], 'html.parser')
print(soup.prettify())

In [None]:
soup.get_text()

## Functions for EMAIL processing

In [None]:
def clean_message(message, stemmer = PorterStemmer(),
                 stop_words = set(stopwords.words('english'))):
    """
    Return the stemmed, purely alphabetic, non-stop-word tokens of message.

    message: text to clean; it is lower-cased before tokenising.
    stemmer: object whose stem() method is applied to every kept token.
    stop_words: collection of tokens to discard.
    """
    tokens = word_tokenize(message.lower())

    return [stemmer.stem(token) for token in tokens
            if token not in stop_words and token.isalpha()]

In [None]:
clean_message(email_body)

In [None]:
def clean_msg_no_html(message, stemmer = PorterStemmer(),
                 stop_words = set(stopwords.words('english'))):
    """
    Strip HTML from message, then return its stemmed word tokens.

    message: raw email body; parsed with BeautifulSoup's 'html.parser'.
    stemmer: object whose stem() method is applied to every kept token.
    stop_words: collection of tokens to discard.

    Only purely alphabetic tokens that are not stop words are kept.
    """
    plain_text = BeautifulSoup(message, 'html.parser').get_text()
    tokens = word_tokenize(plain_text.lower())

    return [stemmer.stem(token) for token in tokens
            if token not in stop_words and token.isalpha()]

In [None]:
clean_msg_no_html(data.at[2, 'MESSAGE'])

# Apply cleaning and tokenising to all messages

### Slicing dataframes and series and cleaning subsets

In [None]:
data.iat[1,2]

In [None]:
data.iloc[5:11]

In [None]:
data.MESSAGE.iloc[0:3]

In [None]:
first_emails = data.MESSAGE.iloc[0:3]

nested_list = first_emails.apply(clean_message)

In [None]:
# Flatten the per-email word lists into one list. The comprehension is the
# compact form of two nested for loops that append every item of every
# sublist to flat_list.

flat_list = [item for sub_list in nested_list for item in sub_list]
        
len(flat_list)        

In [None]:
%%time

nested_list = data.MESSAGE.apply(clean_msg_no_html)

### Using logic to slice Dataframes

In [None]:
# Select the document ids of each class using the named category labels
# instead of the magic numbers 1 and 0.
doc_ids_spam = data[data.CATEGORY == SPAM_CAT].index
doc_ids_ham = data[data.CATEGORY == HAM_CAT].index

In [None]:
nested_list_ham = nested_list.loc[doc_ids_ham]
nested_list_spam = nested_list.loc[doc_ids_spam]
nested_list_ham.tail()

In [None]:
nested_list_ham.count()

In [None]:
flat_list_ham = [item for sublist in nested_list_ham for item in sublist]
normal_words = pd.Series(flat_list_ham).value_counts()

print(normal_words.shape[0])
normal_words[:10]

In [None]:
flat_list_spam = [item for sublist in nested_list_spam for item in sublist]
spammy_words = pd.Series(flat_list_spam).value_counts()

print(spammy_words.shape[0])
spammy_words[:10]

## Creating a Word Cloud

In [None]:
word_cloud = WordCloud().generate(email_body)
plt.imshow(word_cloud)
plt.show()

In [None]:
example_corpus = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
type(example_corpus)

In [None]:
# Materialise the corpus view as a plain list of tokens. Joining the
# characters of each token with '' only rebuilt the same string.
word_list = list(example_corpus)
type(word_list)

In [None]:
novel_as_string = ' '.join(word_list)
# novel_as_string

In [None]:
# Paste the icon onto a white RGB canvas, using the icon itself as the paste
# mask. The mask is passed explicitly through the `mask` argument.
icon = Image.open(WHALE_FILE)
image_mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))
image_mask.paste(icon, mask=icon)

rgb_array = np.array(image_mask)  # converts an image object to an array

plt.figure(figsize=(16, 8))
word_cloud = WordCloud(mask=rgb_array, background_color='white', max_words=400, colormap='ocean')
word_cloud.generate(novel_as_string)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
example_corpus2 = nltk.corpus.shakespeare.words('hamlet.xml')
type(example_corpus2)

In [None]:
novel_hamlet = ' '.join(example_corpus2)
# novel_hamlet

In [None]:
# Paste the icon onto a white RGB canvas, using the icon itself as the paste
# mask. The mask is passed explicitly through the `mask` argument.
icon = Image.open(SKULL_FILE)
image_mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))
image_mask.paste(icon, mask=icon)

rgb_array = np.array(image_mask)  # converts an image object to an array

plt.figure(figsize=(16, 8))
word_cloud = WordCloud(mask=rgb_array, background_color='white', max_words=400, colormap='bone')
word_cloud.generate(novel_hamlet)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Word Cloud of ham and spam messages

In [None]:
# Paste the icon onto a white RGB canvas, using the icon itself as the paste
# mask. The mask is passed explicitly through the `mask` argument.
icon = Image.open(THUMBS_UP_FILE)
image_mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))
image_mask.paste(icon, mask=icon)

rgb_array = np.array(image_mask)  # converts an image object to an array

# one space-separated string of all cleaned ham words
ham_str = ' '.join(flat_list_ham)

plt.figure(figsize=(16, 8))
word_cloud = WordCloud(mask=rgb_array, background_color='white', max_words=2000, colormap='winter')
word_cloud.generate(ham_str.upper())
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Paste the icon onto a white RGB canvas, using the icon itself as the paste
# mask. The mask is passed explicitly through the `mask` argument.
icon = Image.open(THUMBS_DOWN_FILE)
image_mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))
image_mask.paste(icon, mask=icon)

rgb_array = np.array(image_mask)  # converts an image object to an array

# one space-separated string of all cleaned spam words
spam_str = ' '.join(flat_list_spam)

plt.figure(figsize=(16, 8))
word_cloud = WordCloud(mask=rgb_array, background_color='white', max_words=2000, colormap='gist_heat',
                       max_font_size=300, font_path=CUSTOM_FONT_FILE)
word_cloud.generate(spam_str.upper())
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Generate Vocabulary and Dictionary

In [None]:
stemmed_nested_list = data.MESSAGE.apply(clean_msg_no_html)
flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]

In [None]:
unique_words = pd.Series(flat_stemmed_list).value_counts()
print('Nr of unique words', unique_words.shape[0])

In [None]:
frequent_words = unique_words[0:VOCAB_SIZE]
print('most comman words: \n', frequent_words[:10])

## Create Vocabulary Dataframe with a WORD_ID

In [None]:
# Size the id range from the words actually selected. A corpus with fewer
# than VOCAB_SIZE unique words would otherwise give an index longer than the
# word column and make the DataFrame constructor fail.
word_ids = list(range(len(frequent_words)))
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index=word_ids)
vocab.index.name = 'WORD_ID'
vocab.head()

## Save the vocabulary as a CSV

In [None]:
vocab.to_csv(WORD_ID_FILE, index_label= vocab.index.name, header=vocab.VOCAB_WORD.name)

In [None]:
'machin' in set(vocab.VOCAB_WORD)

## Email with the Most Words

In [None]:
# for loop
clean_email_lengths = []
for sublist in stemmed_nested_list:
    clean_email_lengths.append(len(sublist))
    
print('Nr of words in the longest email: ', max(clean_email_lengths))    

In [None]:
print('Email position in the list: ', np.argmax(clean_email_lengths))

In [None]:
# stemmed_nested_list[np.argmax(clean_email_lengths)]
data.at[np.argmax(clean_email_lengths), 'MESSAGE']

## Generate features and a sparse matrix

### Creating a DataFrame with one word per column

In [None]:
type(stemmed_nested_list)

In [None]:
type(stemmed_nested_list.tolist())

In [None]:
word_columns_df = pd.DataFrame.from_records(stemmed_nested_list.tolist())
word_columns_df

## Splitting the data into training and testing datasets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, data.CATEGORY, test_size = 0.3, random_state = 42)

In [None]:
X_train.index.name = y_train.index.name = 'DOC_ID'
X_train.head()

### Create a Sparse matrix for the training data

In [None]:
word_index = pd.Index(vocab.VOCAB_WORD)
type(word_index)

In [None]:
def make_sparse_matrix(df, indexed_words, labels):
    """
    Returns Sparse matrix as a Data Frame.

    Every cell of df that holds a vocabulary word produces one output row with
    OCCURENCE set to 1; repeated words are not aggregated here.

    df: A dataframe with words in the columns with a document id as an index (X_train or X_test)
    indexed_words: index of words ordered by word_id (expected to hold unique words).
    labels: category as a series(y_train), indexed by document id.

    Returns a DataFrame with the columns LABEL, DOC_ID, OCCURENCE and WORD_ID.
    The columns are present even when no cell matched a vocabulary word.
    Raises KeyError when a document with a matching word has no entry in labels.
    """
    columns = ['LABEL', 'DOC_ID', 'OCCURENCE', 'WORD_ID']

    # Build both lookups once so the inner loop only does dict lookups
    # instead of a pandas accessor call per cell.
    word_ids = {word: position for position, word in enumerate(indexed_words)}
    label_by_doc = labels.to_dict()
    dict_list = []

    for doc_id, row in zip(df.index, df.to_numpy()):
        for word in row:
            word_id = word_ids.get(word)
            if word_id is None:
                # padding (None/NaN) or a word outside the vocabulary
                continue

            dict_list.append({'LABEL': label_by_doc[doc_id], 'DOC_ID': doc_id,
                              'OCCURENCE': 1, 'WORD_ID': word_id})

    return pd.DataFrame(dict_list, columns=columns)

In [None]:
%%time
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

In [None]:
sparse_train_df.head()

### Combine occurrences with the pandas groupby() method

In [None]:
train_grouped = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
train_grouped.head()

In [None]:
vocab.at[0, 'VOCAB_WORD']

### Save Training data as a .txt file

In [None]:
np.savetxt(TRAINING_DATA_FILE, train_grouped, fmt = '%d')

## Create a sparse matrix for testing data

In [None]:
%%time
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test )

In [None]:
test_grouped = sparse_test_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
test_grouped.head()

In [None]:
np.savetxt(TESTING_DATA_FILE, test_grouped, fmt='%d')

## Pre-Processing Subtleties

In [None]:
train_doc_ids = set(train_grouped.DOC_ID)
test_doc_ids = set(test_grouped.DOC_ID)

In [None]:
len(test_doc_ids)

In [None]:
len(X_test)

In [None]:
set(X_test.index.values) - test_doc_ids

In [None]:
data.MESSAGE[134]

In [None]:
clean_msg_no_html(data.MESSAGE[134])