In [22]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import *
import random

In [49]:
def load_data():
    spam_paths = []
    ham_paths = []
    for subdir, dirs, files in os.walk('data'):
        for file in files:
            if file != 'Summary.txt' and file[-4:] == '.txt':
                file_path = os.path.join(subdir, file)
                if file_path[-8:] == 'spam.txt':
                    spam_paths.append(file_path)
                elif file_path[-7:] == 'ham.txt':
                    ham_paths.append(file_path)
                    
    dataset = [(path,1) for path in spam_paths] + [(path,0) for path in ham_paths]
    random.shuffle(dataset)
    
    
    train_ratio = 0.8
    validation_ratio = 0.1
    num_train = int(train_ratio * len(dataset))
    num_validation = int(validation_ratio * len(dataset))
    
    train_set = dataset[:num_train]
    validation_set = dataset[num_train : num_train + num_validation]
    test_set = dataset[num_train + num_validation:]
    
    train_X, train_y = zip(*train_set)
    validation_X, validation_y = zip(*validation_set)
    test_X, test_y = zip(*test_set)
    
    
    print(f'Number of files: {len(spam_paths)+len(ham_paths)}')
    print(f'Number of spam: {len(spam_paths)}')
    print(f'Number of ham: {len(ham_paths)}')
    print(f"Size of train_set: {len(train_set)}")
    print(f"Size of validation_set: {len(validation_set)}")
    print(f"Size of test_set: {len(test_set)}")
    
    return train_X, train_y, validation_X, validation_y, test_X, test_y

In [50]:
train_X, train_y, validation_X, validation_y, test_X, test_y = load_data()

Number of files: 33716
Number of spam: 17171
Number of ham: 16545
Size of train_set: 26972
Size of validation_set: 3371
Size of test_set: 3373


In [55]:
print(train_X[:20])

('data/enron4/spam/3202.2004-11-20.GP.spam.txt', 'data/enron1/spam/3183.2004-12-15.GP.spam.txt', 'data/enron3/ham/4437.2001-12-05.kitchen.ham.txt', 'data/enron5/spam/1732.2005-06-22.SA_and_HP.spam.txt', 'data/enron3/spam/0453.2004-09-12.BG.spam.txt', 'data/enron6/ham/5754.2002-02-28.lokay.ham.txt', 'data/enron4/spam/4004.2005-01-23.GP.spam.txt', 'data/enron6/spam/0167.2004-08-12.BG.spam.txt', 'data/enron5/spam/4475.2005-07-19.SA_and_HP.spam.txt', 'data/enron2/spam/3854.2005-07-04.SA_and_HP.spam.txt', 'data/enron6/spam/4448.2005-04-18.BG.spam.txt', 'data/enron1/ham/5034.2001-11-01.farmer.ham.txt', 'data/enron1/spam/1313.2004-06-09.GP.spam.txt', 'data/enron4/spam/0110.2004-01-03.GP.spam.txt', 'data/enron3/ham/0527.2001-04-06.kitchen.ham.txt', 'data/enron2/spam/2053.2005-06-22.SA_and_HP.spam.txt', 'data/enron3/ham/2088.2001-08-21.kitchen.ham.txt', 'data/enron6/spam/1005.2004-10-10.BG.spam.txt', 'data/enron6/ham/0019.2000-06-09.lokay.ham.txt', 'data/enron5/spam/1704.2005-06-22.SA_and_HP.sp

In [57]:
train_y.count(0)
train_y.count()

13225

In [19]:
f = open(spam_paths[100], 'r')
words = []
for line in f:
    words += line.split()
print(words)

['Subject:', 'new', 'product', '!', 'cialis', 'soft', 'tabs', '.', 'hi', '!', 'we', 'have', 'a', 'new', 'product', 'that', 'we', 'offer', 'to', 'you', ',', 'c', '_', 'i', '_', 'a', '_', 'l', '_', 'i', '_', 's', 'soft', 'tabs', ',', 'cialis', 'soft', 'tabs', 'is', 'the', 'new', 'impotence', 'treatment', 'drug', 'that', 'everyone', 'is', 'talking', 'about', '.', 'soft', 'tabs', 'acts', 'up', 'to', '36', 'hours', ',', 'compare', 'this', 'to', 'only', 'two', 'or', 'three', 'hours', 'of', 'viagra', 'action', '!', 'the', 'active', 'ingredient', 'is', 'tadalafil', ',', 'same', 'as', 'in', 'brand', 'cialis', '.', 'simply', 'disolve', 'half', 'a', 'pill', 'under', 'your', 'tongue', ',', '10', 'min', 'before', 'sex', ',', 'for', 'the', 'best', 'erections', 'you', "'", 've', 'ever', 'had', '!', 'soft', 'tabs', 'also', 'have', 'less', 'sidebacks', '(', 'you', 'can', 'drive', 'or', 'mix', 'alcohol', 'drinks', 'with', 'them', ')', '.', 'you', 'can', 'get', 'it', 'at', ':', 'http', ':', '/', '/', 'th

In [9]:
stemmer = PorterStemmer()
stemmedWords = [stemmer.stem(word) for word in words]

stemmedWords

In [11]:
print(stemmedWords)

['subject:', 'new', 'product', '!', 'ciali', 'soft', 'tab', '.', 'hi', '!', 'we', 'have', 'a', 'new', 'product', 'that', 'we', 'offer', 'to', 'you', ',', 'c', '_', 'i', '_', 'a', '_', 'l', '_', 'i', '_', 's', 'soft', 'tab', ',', 'ciali', 'soft', 'tab', 'is', 'the', 'new', 'impot', 'treatment', 'drug', 'that', 'everyon', 'is', 'talk', 'about', '.', 'soft', 'tab', 'act', 'up', 'to', '36', 'hour', ',', 'compar', 'thi', 'to', 'onli', 'two', 'or', 'three', 'hour', 'of', 'viagra', 'action', '!', 'the', 'activ', 'ingredi', 'is', 'tadalafil', ',', 'same', 'as', 'in', 'brand', 'ciali', '.', 'simpli', 'disolv', 'half', 'a', 'pill', 'under', 'your', 'tongu', ',', '10', 'min', 'befor', 'sex', ',', 'for', 'the', 'best', 'erect', 'you', "'", 've', 'ever', 'had', '!', 'soft', 'tab', 'also', 'have', 'less', 'sideback', '(', 'you', 'can', 'drive', 'or', 'mix', 'alcohol', 'drink', 'with', 'them', ')', '.', 'you', 'can', 'get', 'it', 'at', ':', 'http', ':', '/', '/', 'the', '-', 'rxsite', '.', 'com', '/'

In [None]:
def tfidf(corpus)