In [35]:
import time
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score

# Load both data files. The separator 'delimiter' is a literal token that is
# presumably never present in the data (TODO confirm), so every input line
# ends up as a single string in column 0.
train = pd.read_csv('train.dat', sep='delimiter', header=None, engine ='python')
test = pd.read_csv('test.dat', sep='delimiter', header=None, engine ='python')

# Training lines: the first character is taken as the class label and the
# document text starts at offset 2 (the character at offset 1 is skipped).
train_vals = train.iloc[:, :].values
train_cls = [row[0][0] for row in train_vals]
train_docs = [row[0][2:] for row in train_vals]

# Test lines: the whole line is used as the document text.
test_vals = test.iloc[:, :].values
test_docs = [row[0][:] for row in test_vals]
test_cls = []  # starts out empty

# corpus = pd.DataFrame({'doc': train_docs, 'class': train_cls})

In [36]:
def cmer(doc, c=3):
    r""" Split a document into its overlapping c-mers.

    The text is lower-cased first. A document shorter than c characters is
    returned whole as a one-element list; otherwise every substring of
    length c is returned, in left-to-right order.
    """
    text = doc.lower()
    if len(text) < c:
        return [text]
    return [text[start:start + c] for start in range(len(text) - c + 1)]

In [37]:
def build_matrix(docs, idx):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of word/terms in the document.

    docs: list of documents (it is iterated twice, so it must not be a
        one-shot generator); each document is a list of hashable terms.
    idx: dict mapping term -> column ID. Terms missing from idx are ignored.

    Returns a CSR matrix of shape (len(docs), len(idx)) in which entry (i, j)
    is the number of times the term with ID j occurs in document i. Column
    indices are sorted within each row.
    """
    nrows = len(docs)
    ncols = len(idx)
    # One stored value per *distinct* known term of each document; counting
    # every token occurrence would over-allocate the arrays below.
    nnz = 0
    for d in docs:
        nnz += sum(1 for w in set(d) if w in idx)

    # set up memory
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=int)
    n = 0  # non-zero counter
    # transfer values
    for i, d in enumerate(docs):
        for term, count in Counter(d).items():
            if term in idx:
                ind[n] = idx[term]
                val[n] = count
                n += 1
        # row i occupies positions ptr[i]..ptr[i+1]-1 of ind/val
        ptr[i+1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # terms were written in first-seen order; put each row's columns in order
    mat.sort_indices()

    return mat

def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: its name, its shape and
    the number of stored values. When non_empy is set, the number of
    non-empty rows and of distinct column indices in use is printed as well.
    """
    nrows, ncols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)
    if not non_empy:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, nrows, ncols, stored) )
        return
    # a row is non-empty when its slice of indptr has positive length
    filled_rows = int(np.count_nonzero(np.diff(mat.indptr) > 0))
    filled_cols = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, nrows, filled_rows, ncols, filled_cols, stored))

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    mat: scipy CSR matrix; its data is expected to be floating point, since
        values are scaled in place.
    copy: when exactly True, a copy is normalized and returned and mat is
        left untouched; otherwise mat.data is modified in place and the
        function returns None.
    kargs: accepted for call compatibility and ignored.

    Rows whose sum of squares is zero (e.g. empty rows) are left unchanged.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ptr = mat.indptr
    # view onto the stored values covered by indptr; edits land in mat.data
    val = mat.data[ptr[0]:ptr[-1]]
    row_len = np.diff(ptr)
    # row number of every stored value, used to sum squares per row
    row_of = np.repeat(np.arange(nrows), row_len)
    sq_sum = np.bincount(row_of, weights=val**2, minlength=nrows)
    # scale factor 1/||row||; rows with zero norm keep a factor of 1
    scale = np.ones(nrows, dtype=np.double)
    nonzero = sq_sum != 0.0
    scale[nonzero] = 1.0/np.sqrt(sq_sum[nonzero])
    val *= np.repeat(scale, row_len)

    if copy is True:
        return mat
    
def build_idx(train_mat):
    r""" Build a mapping from term to integer ID.

    train_mat: iterable of documents, each an iterable of hashable terms.

    Returns a dict that assigns consecutive IDs (0, 1, 2, ...) to the terms
    in order of first appearance. Terms are used exactly as given: no case
    folding happens here, so callers must normalize terms beforehand.
    """
    idx = {}
    for d in train_mat:
        for w in d:
            if w not in idx:
                # next free ID is the number of terms seen so far
                idx[w] = len(idx)
    return idx
    
    # Note: the term index is built from the training set only, not from both sets.
def textToMatrix(train_docs, test_docs, c=3):
    r""" Tokenize the training and the test documents into c-mers.

    Returns the pair (train_tokens, test_tokens); each is a list holding one
    c-mer list per document, as produced by cmer. Despite the name, no
    matrix is built here.
    """
    tokenized = []
    for docs in (train_docs, test_docs):
        tokenized.append([cmer(text, c) for text in docs])
    return tokenized[0], tokenized[1]

In [38]:
# Tokenize both sets into 6-mers and index the terms of the training set.
train_mat, test_mat = textToMatrix(train_docs, test_docs, c=6)
idx = build_idx(train_mat)

# Replace the token lists with L2-normalized sparse count matrices that
# share the training vocabulary as columns.
train_mat = build_matrix(train_mat, idx)
train_mat = csr_l2normalize(train_mat, copy=True)
test_mat = build_matrix(test_mat, idx)
test_mat = csr_l2normalize(test_mat, copy=True)

csr_info(train_mat, name="train_mat")
csr_info(test_mat, name="test_mat")

train_mat [nrows 102080, ncols 1335522, nnz 22120884]
test_mat [nrows 25520, ncols 1335522, nnz 5379210]


In [39]:
def classify(x, train, clstr, k=3):
    r""" Classify vector x using kNN and majority vote rule given training data and associated classes

    x: single-row sparse matrix holding the sample to classify.
    train: sparse matrix with one row per training sample.
    clstr: sequence of class labels, clstr[i] being the label of train row i.
    k: number of nearest neighbors that vote.

    Returns the winning class label. Neighbors are the training rows with the
    largest non-zero dot product with x. If the two most frequent classes are
    tied, the class with the largest summed dot product among the k neighbors
    wins. If x has no neighbor at all, a label is drawn at random from the
    classes present in clstr.

    Raises ValueError if x has no neighbor and clstr is empty.
    """
    # dot product of x with every training row; only non-zero ones are stored
    dots = x.dot(train.T)
    if len(dots.data) == 0:
        # could not find any neighbors: guess among the known classes
        classes = list(dict.fromkeys(clstr))
        if not classes:
            raise ValueError("cannot classify: no training classes given")
        return classes[np.random.randint(len(classes))]
    # positions of the k largest values; the stable sort keeps the original
    # order among equal values, exactly like a full descending sort would
    top = np.argsort(-dots.data, kind='stable')[:k]
    neighbors = [(clstr[dots.indices[j]], dots.data[j]) for j in top]
    tc = Counter(label for label, _ in neighbors).most_common(2)
    if len(tc) < 2 or tc[0][1] > tc[1][1]:
        # majority vote
        return tc[0][0]
    # tie break: weigh every neighbor's vote by its dot product
    weights = defaultdict(float)
    for label, sim in neighbors:
        weights[label] += sim
    return max(weights.items(), key=lambda kv: kv[1])[0]

In [40]:
# Predict a class for every test row using the 9 nearest training rows.
test_cls = [ classify(test_mat[i,:], train_mat, train_cls, 9) for i in range(test_mat.shape[0]) ]
predictions = pd.Series(test_cls)

# Write one prediction per line, without index or header. The file is opened
# only once the predictions exist, and the with-block closes (and flushes) it.
with open('output.dat', 'w', newline='') as output_file:
    predictions.to_csv(output_file, index=False, header=False)

predictions

0        4
1        1
2        1
3        3
4        4
        ..
25515    3
25516    2
25517    1
25518    4
25519    3
Length: 25520, dtype: object

In [41]:
# Write the predictions to yaya.dat, one per line, without index or header.
# The with-block closes the file so the data is flushed to disk.
with open('yaya.dat', 'w+', newline='') as yaya:
    predictions.to_csv(yaya, index=False, header=False)

In [43]:
# Write the predicted labels to yayaya.dat, one label per line.
with open('yayaya.dat', 'w') as f:
    f.writelines(f"{item}\n" for item in test_cls)