## Import Needed Libraries

In [1]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
from scipy.sparse import csr_matrix
import os
import re
import time
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Define Functions to Produce Character N-grams 

In [2]:
def ngram(txt, n, ngrams):
    r""" Takes in a list of characters and combines them into character
    n-grams of specified length
    """
    
    for i in range(0,len(txt)-n+1):
        ngram = txt[i:i+n]
        ngrams.append(" ".join(ngram))

    return ngrams

In [3]:
def tokenizeArticle(filename, ngramLen):
    r""" Takes in file and outputs a set of character n-grams
    """
    
    # load text
    file = open(filename, 'rt')
    text = file.read()
    file.close()

    # Extract characters from text (alphabetical, ' and -)
    word_tokens = re.findall(r"[A-Za-z '-]",text)
    
    # Convert all characters to lowercase
    Words = [word.lower() for word in word_tokens]

    # Extract character n-grams
    words = []
    for n in ngramLen:
        w = ngram(Words,n,words)
    
    return words

## Define Functions to Reduce and Normalize Data 

In [4]:
def filterSize(DB, arraySize):
    r""" Filters dataset down to the N most frequent character n-grams
    """
    
    # Calculate Document Frequency
    df = defaultdict(int)
    for d in DB:
        for w in set(d):
            df[w] += 1
    
    # Determine word of the most common occurrence
    count = 1
    dnew = defaultdict(int)
    for key in sorted(df.items(), key=lambda item: item[1], reverse=True):
        dnew[key[0]] = count
        count = count + 1
    
    return [ [t for t in d if dnew[t] <= arraySize] for d in DB ]

In [5]:
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [6]:
def td_idf(dbMat):
    r""" Normalizes the sparse matrix based on Term Frequency, Inverse Document
    Frequency normalization
    """
    # Get Data from Sparse Matrix
    nrows = dbMat.shape[0]
    nnz = dbMat.nnz
    ind, val, ptr = dbMat.indices, dbMat.data, dbMat.indptr
    
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1

    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
    
    return dbMat

In [7]:
def l2norm(dbMat):
    r""" Normalizes the sparse matrix based on L2 Normalization
    """
    
    # Get Data from Sparse Matrix
    nrows = dbMat.shape[0]
    nnz = dbMat.nnz
    ind, val, ptr = dbMat.indices, dbMat.data, dbMat.indptr
    
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
    
    return dbMat

## Convert Articles into Charcter N-grams 

In [8]:
# Initialize variables to store author for each article and word tokenization matrix
authorCount = 0
authorID = []
articleDB = []

# Loop through each author directory within training set
dataDir = os.listdir('data/C50train')
for dDir in dataDir:

    # Loop through each article within author directory
    authorDir = os.listdir('data/C50train/' + dDir)
    for aDir in authorDir:

        # Tokenize each article and store author ID
        articleDB.append(tokenizeArticle('data/C50train/' + dDir + '/' + aDir, [5]))
        authorID.append(authorCount)
        
    # Increment author id for next author
    authorCount = authorCount + 1

# Loop through each author directory within testing set
authorCount = 0
dataDir = os.listdir('data/C50test')
for dDir in dataDir:

    # Loop through each article within author directory
    authorDir = os.listdir('data/C50test/' + dDir)
    for aDir in authorDir:

        # Tokenize each article and store author ID
        articleDB.append(tokenizeArticle('data/C50test/' + dDir + '/' + aDir, [5]))
        authorID.append(authorCount)
        
    # Increment author id for next author
    authorCount = authorCount + 1

In [9]:
# Display character n-grams from first document
print(articleDB[0])

['t h e   i', 'h e   i n', 'e   i n t', '  i n t e', 'i n t e r', 'n t e r n', 't e r n e', 'e r n e t', 'r n e t  ', 'n e t   m', 'e t   m a', 't   m a y', '  m a y  ', 'm a y   b', 'a y   b e', 'y   b e  ', '  b e   o', 'b e   o v', 'e   o v e', '  o v e r', 'o v e r f', 'v e r f l', 'e r f l o', 'r f l o w', 'f l o w i', 'l o w i n', 'o w i n g', 'w i n g  ', 'i n g   w', 'n g   w i', 'g   w i t', '  w i t h', 'w i t h  ', 'i t h   n', 't h   n e', 'h   n e w', '  n e w  ', 'n e w   t', 'e w   t e', 'w   t e c', '  t e c h', 't e c h n', 'e c h n o', 'c h n o l', 'h n o l o', 'n o l o g', 'o l o g y', 'l o g y  ', 'o g y   b', 'g y   b u', 'y   b u t', '  b u t  ', 'b u t   c', 'u t   c r', 't   c r i', '  c r i m', 'c r i m e', 'r i m e  ', 'i m e   i', 'm e   i n', 'e   i n  ', '  i n   c', 'i n   c y', 'n   c y b', '  c y b e', 'c y b e r', 'y b e r s', 'b e r s p', 'e r s p a', 'r s p a c', 's p a c e', 'p a c e  ', 'a c e   i', 'c e   i s', 'e   i s  ', '  i s   s', 'i s   s t'

##  Reduce and Normalize Data

In [10]:
# Down select features
DB = filterSize(articleDB, 7000)
   
# Create CSR matrix
mat = build_matrix(DB)
csr_info(mat)
    
# Normalize Data
mat = td_idf(mat)
mat = l2norm(mat)

 [nrows 5000, ncols 7000, nnz 5732172]


## Classify using SVM 

In [11]:
# Construct SVM and K-folds
svc = SVC(kernel='rbf', C=10)
k_fold = KFold(n_splits=10, shuffle=True, random_state=20)
    
# Run Cross Validation
f1_sc = cross_val_score(svc, mat, authorID, cv=k_fold, n_jobs=-1, scoring='f1_weighted')
print('F1 Score for each Fold:', f1_sc)
print('Average F1 Score: ', np.mean(f1_sc))

F1 Score for each Fold: [0.89050891 0.85098474 0.80616118 0.81690668 0.85325067 0.83738318
 0.85267429 0.83759586 0.86746834 0.85187126]
Average F1 Score:  0.8464805097178731
