In [1]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer 
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

In [2]:
def ngram(txt, n, ngrams):
    
    for i in range(0,len(txt)-n+1):
        ngram = txt[i:i+n]
        ngrams.append(" ".join(ngram))

    return ngrams

In [433]:
def tokenizeArticle(filename):
    
    
    # load text
    file = open(filename, 'rt')
    text = file.read()
    file.close()

    # Tokenize text string
    #word_tokens = word_tokenize(text) 
    word_tokens = re.findall(r"[A-Za-z '-]",text)
    
    # Remove all punctunation and numbers
    Words = [word.lower() for word in word_tokens]

    words = []
    for i in range(3,6):
        w = ngram(Words,i,words)
    
    return words

In [434]:
# Initialize variables to store author for each article and word tokenization matrix
authorCount = 0
authorID = []
articleDB = []

# Loop through each author directory within training set
dataDir = os.listdir('data/C50train')
for dDir in dataDir:

    # Loop through each article within author directory
    authorDir = os.listdir('data/C50train/' + dDir)
    for aDir in authorDir:

        # Tokenize each article and store author ID
        articleDB.append(tokenizeArticle('data/C50train/' + dDir + '/' + aDir))
        authorID.append(authorCount)
        
    # Increment author id for next author
    authorCount = authorCount + 1

# Loop through each author directory within testing set
authorCount = 0
dataDir = os.listdir('data/C50test')
for dDir in dataDir:

    # Loop through each article within author directory
    authorDir = os.listdir('data/C50test/' + dDir)
    for aDir in authorDir:

        # Tokenize each article and store author ID
        articleDB.append(tokenizeArticle('data/C50test/' + dDir + '/' + aDir))
        authorID.append(authorCount)
        
    # Increment author id for next author
    authorCount = authorCount + 1

In [None]:
print(articleDB[0])

In [507]:
def filterLen(DF, docs, val1, val2):
    r""" filter out terms that are too short. 
    docs is a list of lists, each inner list is a document represented as a list of words
    minlen is the minimum length of the word to keep
    """
    return [ [t for t in d if val2 <= DF[t] <= val1] for d in docs ]

df = defaultdict(int)
for d in articleDB:
    for w in set(d):
        df[w] += 1

DB = filterLen(df, articleDB, 5000, 1000)

In [474]:
from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [521]:
mat = build_matrix(DB)
csr_info(mat)

 [nrows 5000, ncols 3000, nnz 8243491]


In [522]:
nrows = mat.shape[0]
nnz = mat.nnz
ind, val, ptr = mat.indices, mat.data, mat.indptr
# document frequency
df = defaultdict(int)
for i in ind:
    df[i] += 1

# inverse document frequency
for k,v in df.items():
    df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    
# scale by idf
for i in range(0, nnz):
    val[i] *= df[ind[i]]

In [523]:
nrows = mat.shape[0]
nnz = mat.nnz
ind, val, ptr = mat.indices, mat.data, mat.indptr
for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum

In [524]:
from sklearn.model_selection import train_test_split

# Split the train data into X_train and y_train datasets in 80:20 ratio.
X_train, X_test, y_train, y_test = train_test_split(
    mat, authorID, test_size=0.2, random_state=42)

In [525]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='rbf',C=10)
svclassifier.fit(X_train, y_train)
pr = svclassifier.predict(X_test)

errors = (y_test != pr).sum()
total = X_test.shape[0]
error_rate_without_dr = (errors/float(total)) * 100
print("Error rate without dimensionality reduction: %d/%d * 100 = %f" % (errors, total, error_rate_without_dr))

Error rate without dimensionality reduction: 176/1000 * 100 = 17.600000


In [492]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

print('precision score = ', precision_score(y_test, pr, average = 'weighted'))
print('recall score = ', recall_score(y_test, pr, average = 'weighted'))
print('f1 score = ', f1_score(y_test, pr, average = 'weighted'))

precision score =  0.8312432357854997
recall score =  0.827
f1 score =  0.8248527274873307


In [501]:
from sklearn.model_selection import KFold, cross_val_score
svc = SVC(kernel='rbf',C=10)
k_fold = KFold(n_splits=10, shuffle=True, random_state=20)


In [502]:
print(k_fold)

KFold(n_splits=10, random_state=20, shuffle=True)


In [503]:
cross_val_score(svc, mat, authorID, cv=k_fold, n_jobs=-1, scoring='f1_weighted')

array([0.87495964, 0.84416594, 0.81724579, 0.82454249, 0.85847478,
       0.83693481, 0.86419569, 0.84608703, 0.86882243, 0.85784559])