In [43]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
import random
from collections import OrderedDict
import torch

In [44]:
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

In [None]:
def load_data():
    spam_paths = np.array([])
    ham_paths = np.array([])
    for subdir, dirs, files in os.walk('data'):
        for file in files:
            if file != 'Summary.txt' and file[-4:] == '.txt':
                file_path = os.path.join(subdir, file)
                if file_path[-8:] == 'spam.txt':
#                     spam_paths.append(file_path)
                    spam_paths = np.append(spam_paths, file_path)
                elif file_path[-7:] == 'ham.txt':
#                     ham_paths.append(file_path)
                    ham_paths = np.append(ham_paths, file_path)
                    
    dataset = [(path,1) for path in spam_paths] + [(path,0) for path in ham_paths]
    random.shuffle(dataset)
    
    train_ratio = 0.8
    validation_ratio = 0.1
    num_train = int(train_ratio * len(dataset))
    num_validation = int(validation_ratio * len(dataset))
    
    train_set = dataset[:num_train]
    validation_set = dataset[num_train : num_train + num_validation]
    test_set = dataset[num_train + num_validation:]
    
    train_filePaths_X, train_y = zip(*train_set)
    validation_filePaths_X, validation_y = zip(*validation_set)
    test_filePaths_X, test_y = zip(*test_set)
    
    
    print(f'Number of files: {len(spam_paths)+len(ham_paths)}')
    print(f'Number of spam: {len(spam_paths)}')
    print(f'Number of ham: {len(ham_paths)}')
    print(f"Size of train_set: {len(train_set)}")
    print(f"Size of validation_set: {len(validation_set)}")
    print(f"Size of test_set: {len(test_set)}")
    
    return train_filePaths_X, train_y, validation_filePaths_X, validation_y, test_filePaths_X, test_y

In [163]:
def iter_dataset(dataset_X):
    
    for filePath in dataset_X:
        file = open(filePath, 'r', errors='ignore')
        yield file
        file.close()
        

In [164]:
def get_tfs(file):
    result = {}
    num_words = 0
#     print(file)
    for line in file:
#         print(line)
        words = tokenizer.tokenize(line)
        words = [stemmer.stem(word) for word in words]
        for word in words:
            if word in result.keys():
                result[word] += 1
            else:
                result[word] = 1
                
            num_words += 1
            
    for key in result:
        result[key] = result[key] / num_words
    
    return result

In [165]:
def get_words_with_largest_tfidfs(d, num_features):
    """
    Return num_features words with largest tf_idfs in dictionary d
    
    #HAVE TO DECIDE WHAT TO DO IF LESS THAN NUM_FEATURES FEATURES PRESENT
    
    @param d: the dictionary containing tf-idfs
    @param num_features: integer>0 of the number of features to get
    """
    result = {k: v for k, v in sorted(d.items(), key=lambda item: item[1])}
    return list(result.keys())[-num_features:]

In [166]:
def get_idfs(corpus):
    """
    Calculates the idf for every term in the document
    """
    DF = {} #dictionary --> {word: set() of documents in which word present} key:word, value: set
    num_documents = len(corpus)
    for file in iter_dataset(corpus):
#         print(file)
        for line in file:
#             print(line)
            words = tokenizer.tokenize(line)
            words = [stemmer.stem(word) for word in words]
            for word in words:
                try:
                    DF[word].add(file)
                except:
                    DF[word] = {file}
    
    for key in DF.keys():
#         print(len(DF[key]))
        DF[key] = np.log(num_documents/(len(DF[key])+1))
    
    return DF
                    
            

In [167]:
def get_feature_matrix(corpus, idfs, num_features):
    """
    calculates the tf_idf for each word for each file in the corpus
    
    Returns the feature matrix - top num_features words by tf-idf for each file - [num_features * len(corpus)] numpy array
    
    
    @param corpus: np Array [1*m] of filePaths
    @param idfs: the idfs dictionary for all words in the corpus
    @param num_features: the number of t to extract
    """
    #using for loop instead of vectorization because limit on how many files can be opened
    feature_matrix = np.array([])
    for file in iter_dataset(corpus):
        tfs = get_tfs(file)
        tfidfs = {}
    
        for key in tfs:
#             print(tfs[key])
            tfidfs[key] = tfs[key] * idfs[key]
            
        features_file = get_words_with_largest_tfidfs(tfidfs, num_features)
        
        feature_matrix = np.append(feature_matrix, np.transpose(features_file))

    return tfidfs



In [157]:
train_X_filePaths, train_y, validation_X_filePaths, validation_y, test_X_filePaths, test_y = load_data()

Number of files: 33716
Number of spam: 17171
Number of ham: 16545
Size of train_set: 26972
Size of validation_set: 3371
Size of test_set: 3373


In [159]:
idfs = get_idfs(train_X_filePaths+validation_X_filePaths+test_X_filePaths)

In [168]:
num_features = 20
train_X = get_feature_matrix(train_X_filePaths, idfs, num_features)
validation_X = get_feature_matrix(validation_X_filePaths, idfs, num_features)
test_X = get_feature_matrix(test_X_filePaths, idfs, num_features)

In [170]:
validation_X

{'subject': 0.0,
 'on': 0.005686066452973173,
 'the': 0.006144005175132828,
 'front': 0.016173318935663164,
 'page': 0.028456815709052567,
 'of': 0.006267230828937276,
 'googl': 0.10508519865386344,
 'for': 0.005199025062960624,
 'free': 0.009280013507806936,
 'hi': 0.008441180895267367,
 'john': 0.012060488893973235,
 'i': 0.0038009287631351504,
 'hope': 0.011354177068828557,
 'all': 0.00496093270475108,
 'is': 0.011875026931017468,
 'well': 0.009745191462983065,
 'weiguarante': 0.0403841518695329,
 'to': 0.0077084564880209394,
 'putiyour': 0.0403841518695329,
 'webisiteon': 0.0403841518695329,
 'firstipag': 0.11252406746884833,
 'ofi': 0.0403841518695329,
 'and': 0.010135952402010127,
 'we': 0.02332210295052366,
 'willido': 0.0403841518695329,
 'it': 0.007336691283563053,
 'forifre': 0.0403841518695329,
 'whi': 0.013321526150984543,
 'weibeliev': 0.0403841518695329,
 'that': 0.0037162881461659646,
 'you': 0.013177837613375987,
 'should': 0.007990376549942347,
 'see': 0.00768371890407

NameError: name 'train_X_filepaths' is not defined

In [76]:
def sigmoid(z):
    """ Returns sigmoid of z """
    return 1/(1+np.exp(-z))

In [114]:
def initialize_with_zeros(dim):
    """ Initialize w array with dim*1 zeros and b as 0"""
    w = np.zeros((dim))
    b = 0
    
    return w, b

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 0)

In [93]:
def propagate(w, b, X, y):
    """
    Apply forward and backward propagation.
    
    @param w: weights 
    @precond: np array shape n*1
    
    @param b: bias 
    @precond: float
    
    @param X: the features 
    @precond: np array shape n*m n is the number of features and m the number of training examples
    
    @param y: weights 
    @precond: np array shape 1*m the true values corresponding to X i.e. X(:, i) --> y(i)
    """
    m = X.shape[1]
    
    #Forward prop
    z = np.dot(np.transpose(w), X) + b # z: [1*m] matrix
    y_hat = sigmoid(z) #y_hat: [1*m] matrix
    cost = (-1/m) * np.sum(y * np.log(y_hat) + (1-y) * np.log(1-y_hat)) #number
    cost = np.squeeze(cost)
    
    #Backward prop
    dw = (1/m) * np.dot(X,np.transpose(y_hat - y))
    db = (1/m) * np.sum(y_hat - y)
    
    grads = {"dw": dw,
             "db": db}
    
    return grads, cost
    

In [94]:
w, b, X, Y = np.array([[1.],[2.]]), 2., np.array([[1.,2.,-1.],[3.,4.,-3.2]]), np.array([[1,0,1]])
grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))

dw = [[0.99845601]
 [2.39507239]]
db = 0.001455578136784208
cost = 5.801545319394553


In [98]:
# GRADED FUNCTION: optimize

def optimize(w, b, X, y, epochs, learning_rate, print_cost = False):
    """
    Optimizes w, b using gradient descent
    
    Arguments:
    @param w: weights
    @precond: np array shape n * 1

    @param b: bias 
    @precond: -- float
    
    @param X - the features
    @precond: np array shape (n, m)
    
    @param y - the true labels
    @precond: np array (1, m) containing 1 if spam, 0 if ham.

    @param epochs
    @precond: integer >= 1
    
    @param learning_rate - lr for gradient descent
    @precond: integer > 0
    
    Return:
    params: dictionary containing the weights w and bias b
    grads: dictionary containing dw and db (the gradients)
    costs: list of costs produced every 100 iterations.
    """
    
    costs = []
    
    for i in range(epochs):
        
        grads, cost = propagate(w, b, X, y)

        dw = grads["dw"]
        db = grads["db"]
        
        w = w - learning_rate*dw
        b = b - learning_rate*db

        if i % 100 == 0:
            costs.append(cost)
        
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs

In [99]:
params, grads, costs = optimize(w, b, X, Y, epochs= 100, learning_rate = 0.009, print_cost = False)

print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))

w = [[0.19033591]
 [0.12259159]]
b = 1.9253598300845747
dw = [[0.67752042]
 [1.41625495]]
db = 0.21919450454067657


In [134]:
def predict(w, b, X):
    """
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)
    
    Arguments:
    @param w: weights
    @precond: np array shape n * 1

    @param b: bias 
    @precond: -- float
    
    @param X - the features
    @precond: np array shape (n, m)
    
    Returns:
    y_preds -- a numpy array (vector) containing predictions (0/1)
    """
    
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    
    y_hat = sigmoid(np.dot(np.transpose(w), X) + b)
    y_preds = (y_hat>0.5).astype(int)
    
    return y_preds

In [135]:
w = np.array([[0.1124579],[0.23106775]])
b = -0.3
X = np.array([[1.,-1.1,-3.2],[1.2,2.,0.1]])
print ("predictions = " + str(predict(w, b, X)))

predictions = [[1 1 0]]


In [138]:
def model(train_X, train_Y, validation_X, validation_y, test_X, test_Y,\
          epochs = 2000, learning_rate = 0.5, print_cost=True):
    """
    Builds the logistic regression model by calling the function you've implemented previously
    
    Arguments:
    @params train_X, train_Y, validation_X, validation_Y, test_X, test_Y: training X & Y, test X & Y
    @preconds: represented by a numpy array of shape (n, m_train), (1, m_train), (n, m_val), (1, m_val), 
    (n, m_test) and (1,m_test) respectively
    

    @param epochs: number of iterations to optimize the parameters
    @precond: integer > 0
    
    @param learning_rate -- the learning rate used in the update rule of optimize()
    @precond: float > 0
    
    Returns:
    d -- dictionary containing information about the model.
    """
    
    m_train = train_X.shape[1]
    print(m_train)
    w, b = initialize_with_zeros(m_train)

    parameters, grads, costs = optimize(w, b, X, y, epochs, learning_rate)
    
    w = parameters["w"]
    b = parameters["b"]
        
    train_preds = predict(w, b, X_train)
    validation_preds = predict(w, b, X_validation)
    test_preds = predict(w, b, X_test)

    print("train accuracy: {} %".format(100 - np.mean(np.abs(train_preds - train_y)) * 100))
    print("validation accuracy: {} %".format(100 - np.mean(np.abs(validation_preds - validation_y)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(test_preds - test_y)) * 100))

    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return d

In [139]:
d = model(train_X, train_y, validation_X, validation_y, test_X, test_y, epochs = 2000, learning_rate = 0.005, print_cost = True)

AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
def train(idfs):
    for file in iter_dataset(train_X[:3]):
        tfidfs = get_tfidf(file, idfs)
        top20Words = [*tfidfs.keys()][-20:]
#         print(dict_stats(top20Words))
        print(top20Words)
        input()
        #pass it to the logistic regression unit
