In [1]:
#pip install cleantext  # one-time setup: a package only has to be installed once per environment, not on every run

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import cleantext

If our first author-independent goal is to build a giant vector of possible words from the training set, then maybe the correct approach is to sequentially read through all of the different word files and make a set or dictionary of all terms we encounter.  If we build a dictionary, then the word itself can be the key, and each time the word is encountered its value is incremented by 1.

The advantage of doing a dictionary like that would be that we could do a dimensionality reduction on those elements that have entries across many different authors.  We might not want to get rid of text entries that are common for one single author.  So, perhaps the value of each dictionary key would be a 50-tuple with the number of times each word appeared for each author.

In [11]:
#These functions are from the class activity nn-classication
def build_matrix(docs):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of words/terms in the document.

    Row i corresponds to docs[i]. Column ids are handed out in order of
    first appearance of each term while walking the documents in order.
    Entry (i, j) is the number of times term j occurs in document i.

    Returns a scipy.sparse.csr_matrix of shape
    (len(docs), number of distinct terms) with dtype double and
    sorted column indices.
    """
    nrows = len(docs)  # a "doc" here is a list of words.

    # Count every document once; the counters are reused below both to
    # size the arrays and to fill them.
    counts = [Counter(d) for d in docs]

    # Counter keeps keys in first-seen order, so walking its keys assigns
    # the same column ids as walking the raw word list would.
    idx = {}
    for cnt in counts:
        for w in cnt:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)
    nnz = sum(len(cnt) for cnt in counts)

    # set up memory
    # NOTE: the builtin `int` replaces `np.int`, which was only an alias for
    # it and no longer exists in current NumPy releases.
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)

    # transfer values
    n = 0  # non-zero counter
    for i, cnt in enumerate(counts):
        for k, c in cnt.items():
            ind[n] = idx[k]
            val[n] = c
            n += 1
        ptr[i + 1] = n  # row i ends where the next row starts

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # Entries were written in first-seen order per row; put them in
    # ascending column order.
    mat.sort_indices()

    return mat

def csr_info(mat, name="", non_empty=False):
    r""" Print a one-line summary of a CSR matrix: row count, column
    count and number of stored values, prefixed by `name`.
    With non_empty set, the summary also reports how many rows and
    columns hold at least one stored entry.
    """
    n_rows, n_cols = mat.shape
    stored = len(mat.data)

    if not non_empty:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored) )
        return

    # A row is non-empty when its slice of the index pointer is not empty.
    filled_rows = 0
    for r in range(n_rows):
        if mat.indptr[r + 1] > mat.indptr[r]:
            filled_rows += 1
    # A column is non-empty when its id occurs among the stored indices.
    filled_cols = len(np.unique(mat.indices))

    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, n_rows, filled_rows, n_cols, filled_cols, stored))

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Scale every row of a CSR matrix to unit L-2 norm.
    By default the matrix is modified in place and nothing is returned;
    with copy=True the input is left alone and a normalized copy is
    returned instead. Rows whose squared sum is zero are left as they are.
    """
    target = mat.copy() if copy is True else mat
    data, indptr = target.data, target.indptr

    for row in range(target.shape[0]):
        start, stop = indptr[row], indptr[row + 1]

        # squared length of this row
        sq_total = 0.0
        for pos in range(start, stop):
            sq_total += data[pos]**2

        # only rows with a non-zero length get rescaled
        if sq_total != 0.0:
            scale = 1.0/np.sqrt(sq_total)
            for pos in range(start, stop):
                data[pos] *= scale

    if copy is True:
        return target
        
def namesToMatrix(names, c):
    r""" Turn each entry of `names` into a document via cmer(name, c)
    and build the sparse count matrix over those documents.
    """
    # NOTE(review): `cmer` is not defined in the visible part of this
    # notebook; it must be in scope before this function is called.
    docs = []
    for name in names:
        docs.append(cmer(name, c))
    return build_matrix(docs)

In [3]:
import glob
# Collect the path of every .txt file one directory level below C50train.
# For a quick single-directory run, use the narrower pattern
# "authorship_data/C50/C50train/AaronPressman/*.txt" instead.
txtfiles = list(glob.glob("authorship_data/C50/C50train/*/*.txt"))
print(txtfiles)

['authorship_data/C50/C50train\\AaronPressman\\106247newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\120600newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\120683newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\136958newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\137498newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\14014newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\156814newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\182596newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\186392newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\193495newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\196805newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\197734newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\206838newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\231479newsML.txt', 'authorship_data/C50/C50train\\AaronPressman\\233150newsML.txt', 'authorship_data/C50/C50t

In [4]:
#this is a test of the sparse matrix generation.  I would like to run it on cleaned up lists of words in each document 

In [10]:

# Build the vocabulary: the set of all cleaned tokens seen in any file
# listed in txtfiles.
allWords = set()
# read in the dataset
for tf in txtfiles:
    with open(tf, "r", encoding="utf8") as fh:
        # `lines` is kept as a list because a later cell prints lines[0].
        lines = fh.readlines()
    for l in lines:
        # Grow the set in place; rebuilding it with union() for every line
        # copies the whole vocabulary each time.
        allWords.update(cleantext.clean_words(l, all=True))

# Sorted vocabulary without the empty token.
sortedWords = sorted(w for w in allWords if w != '')
print(len(sortedWords))

21967


In [9]:
# Dump the entire sorted vocabulary (this prints one very long list).
print(sortedWords)

['', 'aa', 'aaa', 'aah', 'aampa', 'aaron', 'aart', 'ab', 'aba', 'abadon', 'abandon', 'abat', 'abb', 'abbey', 'abbott', 'abbrevi', 'abc', 'abcd', 'abdelkad', 'abdic', 'abdidjan', 'abduct', 'abdulaziz', 'abdureschit', 'abel', 'abelardo', 'abercrombi', 'aberdeen', 'abettor', 'abf', 'abhishek', 'abhor', 'abhorr', 'abi', 'abid', 'abidjan', 'abidjanbas', 'abil', 'abilityto', 'abillion', 'abishek', 'abject', 'abl', 'ablait', 'ablaz', 'abn', 'abnamro', 'abnom', 'abnorm', 'abnorn', 'aboard', 'abod', 'abolish', 'abolit', 'abominablei', 'aboput', 'aborigin', 'abort', 'abound', 'aboutar', 'aboutfac', 'aboutturn', 'aboveaverag', 'aboveboard', 'abovebudget', 'aboveground', 'aboveinfl', 'abovemarket', 'abraham', 'abril', 'abroad', 'abrupt', 'abruptli', 'abseil', 'absenc', 'absent', 'absentit', 'absolut', 'absorb', 'abstain', 'abstrus', 'absurd', 'abu', 'abund', 'abundantli', 'abus', 'abut', 'abuzz', 'abysm', 'ac', 'acacia', 'academ', 'academi', 'academia', 'acapulco', 'acc', 'accc', 'acccess', 'accel

In [11]:
# `lines` is whatever the vocabulary loop left behind, i.e. the lines of the
# last file it read; show the first line of that file.
print(lines[0])


A break-in at the U.S. Justice Department's World Wide Web site last week highlighted the Internet's continued vulnerability to hackers.

