In [43]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
import random
from collections import OrderedDict
import torch

In [44]:
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

In [57]:
def load_data():
    spam_paths = np.array([])
    ham_paths = np.array([])
    for subdir, dirs, files in os.walk('data'):
        for file in files:
            if file != 'Summary.txt' and file[-4:] == '.txt':
                file_path = os.path.join(subdir, file)
                if file_path[-8:] == 'spam.txt':
#                     spam_paths.append(file_path)
                    spam_paths = np.append(spam_paths, file_path)
                elif file_path[-7:] == 'ham.txt':
#                     ham_paths.append(file_path)
                    ham_paths = np.append(ham_paths, file_path)
                    
    dataset = [(path,1) for path in spam_paths] + [(path,0) for path in ham_paths]
    random.shuffle(dataset)
    
    train_ratio = 0.8
    validation_ratio = 0.1
    num_train = int(train_ratio * len(dataset))
    num_validation = int(validation_ratio * len(dataset))
    
    train_set = dataset[:num_train]
    validation_set = dataset[num_train : num_train + num_validation]
    test_set = dataset[num_train + num_validation:]
    
    train_X, train_y = zip(*train_set)
    validation_X, validation_y = zip(*validation_set)
    test_X, test_y = zip(*test_set)
    
    
    print(f'Number of files: {len(spam_paths)+len(ham_paths)}')
    print(f'Number of spam: {len(spam_paths)}')
    print(f'Number of ham: {len(ham_paths)}')
    print(f"Size of train_set: {len(train_set)}")
    print(f"Size of validation_set: {len(validation_set)}")
    print(f"Size of test_set: {len(test_set)}")
    
    return train_X, train_y, validation_X, validation_y, test_X, test_y

In [48]:
def iter_dataset(dataset_X):
    
    for filePath in dataset_X:
        file = open(filePath, 'r', errors='ignore')
        yield file
        file.close()
        

In [49]:
def get_tfs(file):
    result = {}
    num_words = 0
#     print(file)
    for line in file:
#         print(line)
        words = tokenizer.tokenize(line)
        words = [stemmer.stem(word) for word in words]
        for word in words:
            if word in result.keys():
                result[word] += 1
            else:
                result[word] = 1
                
            num_words += 1
            
    for key in result:
        result[key] = result[key] / num_words
    
    return result

In [50]:
def dict_stats(d):
    """
    Return sorted dictionary by value.
    """
    result = {k: v for k, v in sorted(d.items(), key=lambda item: item[1])}
    return result

In [51]:
def get_idfs(corpus):
    """
    Calculates the idf for every term in the document
    """
    DF = {} #dictionary --> {word: set() of documents in which word present} key:word, value: set
    num_documents = len(corpus)
    for file in iter_dataset(corpus):
#         print(file)
        for line in file:
#             print(line)
            words = tokenizer.tokenize(line)
            words = [stemmer.stem(word) for word in words]
            for word in words:
                try:
                    DF[word].add(file)
                except:
                    DF[word] = {file}
    
    for key in DF.keys():
#         print(len(DF[key]))
        DF[key] = np.log(num_documents/(len(DF[key])+1))
    
    return DF
                    
            

In [52]:
def get_tfidf(file, idfs):
    tfs = get_tfs(file)
    tfidfs = {}
    for key in tfs:
        tfidfs[key] = tfs[key] * idfs[key]
        
    return tfidfs

In [53]:
def train(idfs):
    for file in iter_dataset(train_X[:3]):
        tfidfs = get_tfidf(file, idfs)
        top20Words = [*tfidfs.keys()][-20:]
#         print(dict_stats(top20Words))
        print(top20Words)
        input()
        #pass it to the logistic regression unit


In [58]:
train_X, train_y, validation_X, validation_y, test_X, test_y = load_data()

Number of files: 33716
Number of spam: 17171
Number of ham: 16545
Size of train_set: 26972
Size of validation_set: 3371
Size of test_set: 3373


In [None]:
idfs = get_idfs(train_X)

In [36]:
train(idfs)

['subject', 'news', 'articl', 'on', 'enron', 'india', 'aftermath', 'busi', 'line', '02', '21', '2000', 'abhay', 'mehta', 'copyright', 'c', 'kasturi', 'sourc', 'world', 'report']

['subject', 're', 'industri', 'robert', 'pleas', 'make', 'sure', 'that', 'gari', 'get', 'a', 'copi', 'of', 'the', 'spreadsheet', 'nobodi', 'ha', 'seen', 'it', 'in']

['subject', 'largest', 'pornstar', 'collect', 'of', 'download', 'porn', 'd', 'movl', 'x', '881', 'cum', 'wit', 'the', 'most', 'extrem', 'sexual', 'achiev', 'ever', 'to']



In [76]:
def sigmoid(z):
    """ Returns sigmoid of z """
    return 1/(1+np.exp(-z))

In [77]:
def initialize_with_zeros(dim):
    """ Initialize w array with dim*1 zeros and b as 0"""
    w = np.zeros(dim)
    b = 0
    
    return w, b

In [93]:
def propagate(w, b, X, y):
    """
    Apply forward and backward propagation.
    
    @param w: weights 
    @precond: np array shape n*1
    
    @param b: bias 
    @precond: float
    
    @param X: the features 
    @precond: np array shape n*m n is the number of features and m the number of training examples
    
    @param y: weights 
    @precond: np array shape 1*m the true values corresponding to X i.e. X(:, i) --> y(i)
    """
    m = X.shape[1]
    
    #Forward prop
    z = np.dot(np.transpose(w), X) + b # z: [1*m] matrix
    y_hat = sigmoid(z) #y_hat: [1*m] matrix
    cost = (-1/m) * np.sum(y * np.log(y_hat) + (1-y) * np.log(1-y_hat)) #number
    cost = np.squeeze(cost)
    #Backward prop
    dw = (1/m) * np.dot(X,np.transpose(y_hat - y))
    db = (1/m) * np.sum(y_hat - y)
    
    grads = {"dw": dw,
             "db": db}
    
    return grads, cost
    

In [94]:
w, b, X, Y = np.array([[1.],[2.]]), 2., np.array([[1.,2.,-1.],[3.,4.,-3.2]]), np.array([[1,0,1]])
grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))

dw = [[0.99845601]
 [2.39507239]]
db = 0.001455578136784208
cost = 5.801545319394553


In [None]:
# GRADED FUNCTION: optimize

def optimize(w, b, X, y, epochs, learning_rate, print_cost = False):
    """
    Optimizes w, b using gradient descent
    
    Arguments:
    @param w: weights
    @precond: np array shape n * 1

    @param b: bias 
    @precond: -- float
    
    @param X - the features
    @precond: np array shape (n, m)
    
    @param y - the true labels
    @precond: np array (1, m) containing 1 if spam, 0 if ham.

    @param epochs
    @precond: integer >= 1
    
    @param learning_rate - lr for gradient descent
    @precond: integer > 0
    
    
    Return:
    params: dictionary containing the weights w and bias b
    grads -- dictionary containing dw and db (the gradients)
    costs -- list of costs produced every 100 iterations.
    """
    
    costs = []
    
    for i in range(num_iterations):
        
        grads, cost = propagate(w, b, X, y)
        ### END CODE HERE ###
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule (≈ 2 lines of code)
        ### START CODE HERE ###
        w = w - learning_rate*dw
        b = b - learning_rate*db
        ### END CODE HERE ###
        
        # Record the costs
        if i % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs