In [1]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

In [2]:
def tokenizeArticle(filename, encoding=None):
    """Read a text file and return its words as a list of stemmed tokens.

    The text is tokenized with NLTK's ``word_tokenize``; tokens that are not
    purely alphabetic are dropped, the rest are lower-cased, English stop
    words are removed, and each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform default encoding.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in the file.
    """
    # Load text; the context manager closes the file even if read() raises
    with open(filename, 'rt', encoding=encoding) as file:
        text = file.read()

    # Tokenize text string
    word_tokens = word_tokenize(text)

    # Remove all punctuation and numbers (keep alphabetic tokens only)
    words = [word.lower() for word in word_tokens if word.isalpha()]

    # Remove stop words (checked after lower-casing)
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # Stem all words
    ps = PorterStemmer()
    return [ps.stem(w) for w in words]

In [3]:
# Initialize variables to store author for each article and word tokenization matrix
authorCount = 0
authorID = []
articleDB = []

trainDir = 'data/C50train'

# Loop through each author directory within training set.
# os.listdir returns entries in arbitrary order, so sort them to make the
# author ids and the article order reproducible between runs and machines.
dataDir = sorted(os.listdir(trainDir))
for dDir in dataDir:

    authorPath = os.path.join(trainDir, dDir)
    if not os.path.isdir(authorPath):
        # Skip stray files sitting next to the author directories;
        # listing them as a directory would raise.
        continue

    # Loop through each article within author directory
    authorDir = sorted(os.listdir(authorPath))
    for aDir in authorDir:

        # Tokenize each article and store author ID
        articleDB.append(tokenizeArticle(os.path.join(authorPath, aDir)))
        authorID.append(authorCount)

    # Increment author id for next author
    authorCount = authorCount + 1

In [4]:
# Show the token lists stored at positions 0 and 2499 of articleDB
for articleIndex in (0, 2499):
    print(articleDB[articleIndex])

['internet', 'may', 'overflow', 'new', 'technolog', 'crime', 'cyberspac', 'still', 'varieti', 'nation', 'consum', 'leagu', 'said', 'wednesday', 'popular', 'scam', 'internet', 'pyramid', 'scheme', 'earli', 'investor', 'bogu', 'fund', 'paid', 'deposit', 'later', 'investor', 'leagu', 'consum', 'advocaci', 'group', 'track', 'web', 'scam', 'site', 'set', 'world', 'wide', 'web', 'februari', 'call', 'internet', 'fraud', 'watch', 'http', 'site', 'collect', 'report', 'directli', 'consum', 'wide', 'prais', 'law', 'enforc', 'agenc', 'consum', 'suspect', 'scam', 'internet', 'critic', 'inform', 'said', 'jodi', 'bernstein', 'director', 'feder', 'trade', 'commiss', 'bureau', 'consum', 'protect', 'internet', 'fraud', 'watch', 'major', 'help', 'ftc', 'identifi', 'particular', 'scam', 'infanc', 'may', 'exampl', 'commiss', 'use', 'internet', 'report', 'shut', 'site', 'run', 'fortuna', 'allianc', 'taken', 'million', 'promis', 'investor', 'could', 'earn', 'month', 'initi', 'deposit', 'instead', 'fortuna', 

In [5]:
from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  

    Row i of the result corresponds to docs[i]. Each distinct term gets a
    column id in order of first appearance across docs, and each stored
    value is the number of times the term occurs in that document.

    Returns a scipy.sparse.csr_matrix of shape (len(docs), number of
    distinct terms) with double-precision values and sorted column indices.
    """
    nrows = len(docs)

    # First pass: assign column ids and count the non-zeros to allocate
    idx = {}
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)

    # set up memory
    # (builtin int replaces np.int, an alias for it that NumPy 1.24 removed)
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=int)

    # Second pass: transfer values, one CSR row per document
    n = 0  # non-zero counter
    for i, d in enumerate(docs):
        cnt = Counter(d)
        # Order within a row does not matter here: sort_indices() below
        # puts every row's column indices into ascending order.
        for j, (k, c) in enumerate(cnt.items()):
            ind[n+j] = idx[k]
            val[n+j] = c
        n += len(cnt)
        ptr[i+1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: row count, column
    count and number of stored values, prefixed by name. With non_empy
    set, the line also reports how many rows hold at least one stored
    entry and how many distinct column indices are stored.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)

    if not non_empy:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored) )
        return

    # A row is non-empty when its indptr slice has positive length
    filled_rows = 0
    for r in range(n_rows):
        if mat.indptr[r+1] > mat.indptr[r]:
            filled_rows += 1

    # Columns that appear at least once among the stored indices
    filled_cols = len(np.unique(mat.indices))

    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, n_rows, filled_rows, n_cols, filled_cols, stored))

In [6]:
# Turn the tokenized articles into a sparse term-count matrix
mat = build_matrix(articleDB)

# Summarise its dimensions, then show the first row in dense form
csr_info(mat)
print(mat[0, :].toarray())

 [nrows 2500, ncols 18101, nnz 437259]
[[9. 3. 1. ... 0. 0. 0.]]


In [7]:
# Scale every stored count by the inverse document frequency of its term.
# NOTE: val is mat.data itself, so mat is modified in place; running this
# cell a second time applies the scaling again.
nrows = mat.shape[0]
nnz = mat.nnz
ind = mat.indices
val = mat.data
ptr = mat.indptr

# document frequency: how many stored entries carry each column index
df = defaultdict(int)
for col in ind:
    df[col] += 1

# inverse document frequency, overwriting the counts held in df
for col in list(df.keys()):
    df[col] = np.log(nrows / float(df[col]))

# scale by idf
for pos in range(nnz):
    val[pos] = val[pos] * df[ind[pos]]

In [8]:
# Rescale each row of mat to unit Euclidean length.
# val is mat.data itself, so the matrix is modified in place.
nrows = mat.shape[0]
nnz = mat.nnz
ind = mat.indices
val = mat.data
ptr = mat.indptr

for i in range(nrows):
    rowStart, rowEnd = ptr[i], ptr[i+1]

    # Sum of squared stored values in this row
    rsum = 0.0
    for j in range(rowStart, rowEnd):
        rsum = rsum + val[j]**2

    # Rows with a zero sum (e.g. empty rows) are left untouched
    if rsum != 0.0:
        rsum = 1.0/np.sqrt(rsum)
        for j in range(rowStart, rowEnd):
            val[j] = val[j] * rsum