In [None]:
import numpy as np
from scipy import sparse
from collections import Counter
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd

Collect stop words

In [None]:
# Build the English stop-word set used by filter_stopwords.
# The corpus reader is reached through `nltk.corpus` rather than through the
# imported name `stopwords`, because this very cell rebinds `stopwords` to a
# plain set: calling `stopwords.words(...)` again (i.e. re-running the cell)
# would then fail with AttributeError. This form can be re-run safely.
stopwords = set(nltk.corpus.stopwords.words("english"))

## Data loader
Define function to load document id, raw text and labels from the input csv file. The input csv file (data/train.csv or data/test.csv) has the following 3 columns:

1. id: document id
2. text: document raw text
3. label: document label (data/train.csv: one of the values in {1,2,3,4,5}; data/test.csv: -1)

In [1]:
def load_data(file_name):
    """
    Read a dataset csv file and split it into its three columns.

    :param file_name: path of the csv file to read, type: str
    return the 'id', 'text' and 'label' columns (in that order),
    each as a pandas Series
    """
    frame = pd.read_csv(file_name)
    ids, texts, labels = (frame[column] for column in ("id", "text", "label"))
    return ids, texts, labels

Define function to load document labels from the input csv file. The input csv file (data/answer.csv) has the following 2 columns:

1. id: document id
2. label: document label (one of the values in {1,2,3,4,5})

In [None]:
def load_labels(file_name):
    """
    Read the labels stored in a csv file.

    :param file_name: path of the csv file to read, type: str
    return the 'label' column as a pandas Series
    """
    answers = pd.read_csv(file_name)
    return answers["label"]

## Feature Extractor
Define tokenization function.

In [None]:
def tokenize(text):
    """
    Split a raw document into tokens by delegating to nltk.word_tokenize.

    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    tokens = nltk.word_tokenize(text)
    return tokens

Define function for filtering stop words.

In [None]:
def filter_stopwords(tokens):
    """
    Drop stop words and purely numeric tokens from a token list.

    The stop-word lookup is case-insensitive: each token is lower-cased
    before it is checked against the module-level `stopwords` collection,
    so capitalised forms such as 'The' or 'Is' are removed too. Tokens that
    are kept are returned unchanged (their original casing is preserved).
    NOTE: this assumes the entries of `stopwords` are lower-case -- confirm
    if a different stop-word list is plugged in.

    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['Text', 'mining', 'Is', 'to', 'identify', '2020', 'information', '.']
    Output: ['Text', 'mining', 'identify', 'information', '.']
    """
    return [
        token
        for token in tokens
        # str.isnumeric() is True only when every character is numeric,
        # so tokens such as '3.5' or 'x2' are kept.
        if token.lower() not in stopwords and not token.isnumeric()
    ]

Define a function for building the Bag-of-Words (BOW) representations of documents.

Documentation of scipy lil matrix: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.lil_matrix.html

Documentation of scipy csr matrix: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

In [2]:
def get_bagofwords(data, vocab_dict):
    '''
    Build the Bag-of-Words count matrix of a collection of tokenized documents.

    :param data: a list of tokenized documents, type: list
    :param vocab_dict: a mapping from words to indices, type: dict
    return a BOW matrix in Compressed Sparse Row matrix format, type: scipy.sparse.csr_matrix
    (one row per document, one column per vocabulary entry; words that are
    not in vocab_dict are ignored)
    '''
    # The matrix is assembled in LIL (row-based list of lists) format, which
    # is convenient for incremental construction because changing the
    # sparsity structure is cheap in that format.
    bow = sparse.lil_matrix((len(data), len(vocab_dict)))

    for row, doc in enumerate(data):
        # Count the column index of every token of this document;
        # out-of-vocabulary tokens all fall into the sentinel bucket -1.
        column_counts = Counter(vocab_dict.get(word, -1) for word in doc)
        column_counts.pop(-1, None)  # discard the out-of-vocabulary bucket

        for column, count in column_counts.items():
            bow[row, column] = count

    # Convert to CSR once construction is finished: CSR is the format suited
    # to fast arithmetic and matrix-vector products.
    return bow.tocsr()

## Data pre-processing
Load document ids, raw texts, and labels from the train and test sets.

In [None]:
# Locations of the input csv files (relative paths).
train_file = "data/train.csv"
test_file = "data/test.csv"
ans_file = "data/answer.csv"


# The label column returned for the test file is discarded (bound to `_`);
# the test labels are read from the answer file instead.
train_ids, train_texts, train_labels = load_data(train_file)
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(ans_file)

In [None]:
# Report how many documents were loaded for each split.
print("Size of train set: {}".format(len(train_ids)))
print("Size of test set: {}".format(len(test_ids)))

Tokenize the raw texts in the train and test sets.

In [None]:
# Tokenize every raw document; each split becomes a list of token lists.
train_tokens = list(map(tokenize, train_texts))
test_tokens = list(map(tokenize, test_texts))

Remove stop words from the tokenized texts.

In [None]:
# Apply filter_stopwords to every tokenized document of both splits
# (the token lists are replaced by their filtered versions).
train_tokens = list(map(filter_stopwords, train_tokens))
test_tokens = list(map(filter_stopwords, test_tokens))

Build a vocabulary (i.e., a mapping from words to indices) on the train set.

In [None]:
# Collect every distinct word appearing in the train set. A set is used so
# that a word seen several times is only stored once.
vocab = {word for doc in train_tokens for word in doc}

# Create a dictionary whose keys are word strings and whose values are the
# numerical (column) indices 0 .. len(vocab)-1.
# The words are sorted before being numbered: the iteration order of a set of
# strings can differ from one interpreter session to the next (string hashing
# is randomized by default), which would make the word -> index assignment,
# and therefore the column layout of the BOW matrices, vary between runs.
vocab_dict = {word: index for index, word in enumerate(sorted(vocab))}

In [None]:
# Number of distinct words kept in the vocabulary.
print('Size of vocab: ', len(vocab_dict))

Build the BOW matrices from the tokenized texts in train and test sets respectively, using the vocabulary and the get_bagofwords function defined above

In [None]:
# Both splits are encoded with the same train-set vocabulary, so their BOW
# matrices share the same column layout.
train_data_matrix, test_data_matrix = (
    get_bagofwords(token_lists, vocab_dict)
    for token_lists in (train_tokens, test_tokens)
)

In [None]:
# Sanity check: show the container type and the shape of both BOW matrices.
print('Type of train_data_matrix: ', type(train_data_matrix))
print('Type of test_data_matrix: ', type(test_data_matrix))
print('Shape of train_data_matrix:', train_data_matrix.shape)
print('Shape of test_data_matrix:', test_data_matrix.shape)

## Naive Bayes
Define the following symbols:

N_train = size of the train set

N_test = size of the test set

V = vocabulary size

K = number of classes

All indices of tensors are 0-based

In [None]:
# sizes of the train and test sets = number of rows of each BOW matrix
N_train, N_test = train_data_matrix.shape[0], test_data_matrix.shape[0]

# vocabulary size = number of entries in the word -> index mapping
V = len(vocab_dict)

# number of classes, taken as the largest label seen in the train set
# (this assumes the labels are the consecutive integers 1..K)
K = max(train_labels)

print('N_train: ', N_train)
print('N_test: ', N_test)
print('V: ', V)
print('K: ', K)

Define a utility function to normalize (with/without laplace smoothing) an input tensor over the first dimension.

In [5]:
def normalize(P, smoothing_prior=0):
    """
    Normalize a tensor of counts over its first dimension, optionally with
    additive (Laplace) smoothing.

    :param P: array of counts; the normalization runs along axis 0
    :param smoothing_prior: constant added to every count (0 = no smoothing)
    return an array with the same shape as P

    e.g.
    Input: [1,2,1,2,4]
    Output: [0.1,0.2,0.1,0.2,0.4] (without laplace smoothing) or
    [0.1333,0.2,0.1333,0.2,0.3333] (with laplace smoothing and the smoothing prior is 1)
    """
    # number of entries along the normalized (first) dimension
    n_outcomes = P.shape[0]

    # totals along axis 0; keepdims=True leaves the reduced axis in place with
    # size one so that the totals broadcast against P in the division below
    totals = np.sum(P, axis=0, keepdims=True)

    # the smoothing prior is added once to every count in the numerator, hence
    # n_outcomes times to the total in the denominator
    numerator = P + smoothing_prior
    denominator = totals + smoothing_prior * n_outcomes
    return numerator / denominator

Define a utility function to compute the accuracy score given the ground truth labels and predictions.

In [7]:
def evaluate(y_true, y_pre):
    """
    Compute the accuracy of the predictions via sklearn's accuracy_score.

    :param y_true: ground truth labels
    :param y_pre: predicted labels
    return the accuracy score
    """
    return accuracy_score(y_true, y_pre)

## Training
Given:

1. the training labels (1-d array of shape (N_train,));

2. the BOW matrix of training documents (scipy.sparse.csr_matrix of shape (N_train,V)),

the training of Naive Bayes classifier is to compute the following two probabilities:

1. prior: P(y) (a 1-d array with shape (K,), where the entry at position [l] is the prior probability of label l+1);

2. likelihood: P(x|y) (a matrix with shape (V,K), where the entry at position [i,l] is the probability of word i in the documents of label l+1)

In [None]:
# One-hot encode the train labels into a matrix of shape (N_train, K):
# entry (i, j) is 1 iff the (i+1)-th document belongs to the (j+1)-th class,
# and 0 otherwise.
data_label_onehot_matrix = np.zeros((N_train, K))

# labels start from 1, so label l is stored in column l-1
label_columns = np.asarray(train_labels) - 1

# set one entry per document: row i, column (label of document i) - 1
data_label_onehot_matrix[np.arange(label_columns.shape[0]), label_columns] = 1

In [None]:
# Sanity check: the one-hot matrix was allocated with shape (N_train, K).
print('data_label_onehot_matrix.shape: ', data_label_onehot_matrix.shape)

Compute the frequencies of all labels in the train set by row-wise summation.

Set axis = 0 so that the summation is across rows of the data_label_onehot_matrix.

Set keepdims = False so that we can get a 1-d array of shape (K,) after the summation.

In [None]:
# Column sums of the one-hot matrix = number of train documents per label;
# the reduced axis is dropped, giving a 1-d array of shape (K,).
label_freq = data_label_onehot_matrix.sum(axis=0)

Compute P(y) by normalizing the label frequencies with laplace smoothing, where the smoothing prior = 1.

In [None]:
# Class prior P(y): label frequencies normalized with Laplace smoothing.
P_y = normalize(label_freq, smoothing_prior=1)

Build a matrix word_freq of shape (V,K), where word_freq[i,j] is the frequency of word i in the documents of label (j+1).

In [9]:
# (V, N_train) x (N_train, K) -> (V, K): per-label word frequencies.
word_freq = train_data_matrix.T.dot(data_label_onehot_matrix)

NameError: name 'train_data_matrix' is not defined

word_freq[i,j] = the dot product of the following 2 vectors:

1. The i-th row of train_data_matrix.transpose():

2. The j-th column of data_label_onehot_matrix

The i-th row of train_data_matrix.transpose() holds the frequencies of word i in all documents in the train set (i.e., train_data_matrix.transpose()[i,k] is the frequency of word i in the (k+1)-th document).

The j-th column of data_label_onehot_matrix is a vector indicating whether each document in the train set has label (j+1) (i.e., data_label_onehot_matrix[k,j] = 1 if the (k+1)-th document has label (j+1), otherwise it is data_label_onehot_matrix[k,j] = 0)

So the dot product of these two vectors is to sum over the frequencies of word i in all the train documents of label (j+1), which is the frequency of word i in the documents of label (j+1).

Normalize the word_freq matrix over the rows (i.e., across all words in the vocabulary for each label) to get P(x|y) (a matrix with shape (V,K), where the entry at position [i,l] is the probability of word i in the documents of label l+1). The normalization is with laplace smoothing, where the smoothing prior = 1.

In [None]:
# Likelihood P(x|y): each column of word_freq (one per label) is normalized
# over the vocabulary with Laplace smoothing.
P_xy = normalize(word_freq, smoothing_prior=1)

In [None]:
# Work in log space so that the many small word probabilities are added
# instead of multiplied. No earlier cell defines `log_P_xy`, so it is derived
# from P_xy here; P_xy was computed with smoothing prior 1 on non-negative
# counts, so every entry is positive and its log is finite.
log_P_xy = np.log(P_xy)

# log P(d|y) for every document d and label y:
# (num docs, V) x (V, K) -> (num docs, K)
train_log_P_dy = train_data_matrix.dot(log_P_xy)
test_log_P_dy = test_data_matrix.dot(log_P_xy)

In [None]:
# Log of the class prior. No earlier cell defines `log_P_y`, so it is derived
# from P_y here (P_y was computed with smoothing prior 1, so it is positive).
log_P_y = np.log(P_y)

# Unnormalized log posterior: log P(y) + log P(d|y). log_P_y has shape (K,)
# and is broadcast over the rows of the (num docs, K) log-likelihood matrices.
train_log_P = log_P_y + train_log_P_dy
test_log_P = log_P_y + test_log_P_dy

In [None]:
# we add 1 because labels strat from 1
train_pred = np.argmax(train_log_P, axis=1) + 1
test_pred = np.argmax(test_log_P, axis=1) + 1

## Evaluation

In [None]:
# Score the predictions of both splits against their ground truth labels.
train_acc = evaluate(train_labels, train_pred)
test_acc = evaluate(test_labels, test_pred)

print("Train Accuracy: {}".format(train_acc))
print("Test Accuracy: {}".format(test_acc))