In [23]:
import json
import numpy

In [24]:
def get_vocabulary(D):
    """
    Given a list of documents, where each document is represented as
    a list of tokens, return the resulting vocabulary. The vocabulary
    should be a set of tokens which appear more than once in the entire
    document collection plus the "<unk>" token.
    """
    lst = []
    token_dict = {}
    for document in D:
        for word in document:           
            token_dict[word] = token_dict.get(word, 0) + 1
    for key, value in token_dict.items():
        if value > 1:
            lst.append(key)
                
    lst.append("<unk>")
    voca_set = set(lst)
    
    return voca_set
        
    

In [25]:
class BBoWFeaturizer(object):
    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Given a document represented as a list of tokens and the vocabulary
        as a set of tokens, compute the binary bag-of-words feature representation.
        This function should return a dictionary which maps from the name of the
        feature to the value of that feature.
        """
        # TODO
        token_dict = {}
        
        for word in doc:
            # if the token is in vocab
            if word in vocab:
                token_dict[word] = 1
            # if there is an unknown token
            else:
                token_dict["<unk>"] = 1
        
        return token_dict
                    

In [26]:
class CBoWFeaturizer(object):
    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Given a document represented as a list of tokens and the vocabulary
        as a set of tokens, compute the count bag-of-words feature representation.
        This function should return a dictionary which maps from the name of the
        feature to the value of that feature.
        """
        # TODO
        token_dict = {}
        
        for word in doc:
            if word in vocab:
                token_dict[word] = token_dict.get(word, 0) + 1
            else:
                token_dict["<unk>"] = token_dict.get("<unk>", 0) + 1
                
        return token_dict
        

In [27]:
def compute_idf(D, vocab):
    """
    Given a list of documents D and the vocabulary as a set of tokens,
    where each document is represented as a list of tokens, return the IDF scores
    for every token in the vocab. The IDFs should be represented as a dictionary that
    maps from the token to the IDF value. If a token is not present in the
    vocab, it should be mapped to "<unk>".
    """
    # TODO
    D_len = len(D)
    idf_dict = {}
    
   
    # loop through all documents
    for d in D:
        token_seen = set()
        for token in d:
            if token in vocab:
                word = token
            else:
                word = '<unk>'
            if word not in token_seen:
                token_seen.add(word)
                idf_dict[word] = idf_dict.get(word, 0) + 1
    
    for token in idf_dict.keys():
        idf_dict[token] = numpy.log(D_len / idf_dict[token])
  
    return idf_dict
            
            
    
class TFIDFFeaturizer(object):
    def __init__(self, idf):
        """The idf scores computed via `compute_idf`."""
        self.idf = idf
    
    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Given a document represented as a list of tokens and
        the vocabulary as a set of tokens, compute
        the TF-IDF feature representation. This function
        should return a dictionary which maps from the name of the
        feature to the value of that feature.
        """
        # TODO
        # compute tf
        tf_dict = {}        
        for word in doc:
            if word in vocab:
                tf_dict[word] = tf_dict.get(word, 0) + 1
            else:
                tf_dict["<unk>"] = tf_dict.get("<unk>", 0) + 1
                
        # tf_idf
        tf_idf = {}
        for token in tf_dict.keys():
            tf_idf[token] = tf_dict[token] * self.idf[token]
        
        
        return tf_idf
        
        

In [28]:
# You should not need to edit this cell
def load_dataset(file_path):
    D = []
    y = []
    with open(file_path, 'r') as f:
        for line in f:
            instance = json.loads(line)
            D.append(instance['document'])
            y.append(instance['label'])
    return D, y

def convert_to_features(D, featurizer, vocab):
    X = []
    for doc in D:
        X.append(featurizer.convert_document_to_feature_dictionary(doc, vocab))
    return X

In [29]:
def train_naive_bayes(X, y, k, vocab):
    """
    Computes the statistics for the Naive Bayes classifier.
    X is a list of feature representations, where each representation
    is a dictionary that maps from the feature name to the value.
    y is a list of integers that represent the labels.
    k is a float which is the smoothing parameters.
    vocab is the set of vocabulary tokens.
    
    Returns two values:
        p_y: A dictionary from the label to the corresponding p(y) score
        p_v_y: A nested dictionary where the outer dictionary's key is
            the label and the innner dictionary maps from a feature
            to the probability p(v|y). For example, `p_v_y[1]["hello"]`
            should be p(v="hello"|y=1).
    """
    # TODO 
    num = len(y) # total number of instances
    p_y = {} # return dic_1
    p_v_y = {} # return dic_2
    num_label = {} # count the number of times each label appears, such as: 0, 1
    docum_label = {} # for one token - over all the documents in the training data has label y
    vocub_label = {} # for all token - over all the words in the vocabulary
    
    # loop through the whole X dataset
    for i in range(num):
        # accumulate the number of each label
        num_label[y[i]] = num_label.get(y[i], 0) + 1
        # assign a new dic(token: value_total) as value in docum_label
        docum_label[y[i]] = docum_label.get(y[i], {})
        # loop through the X, each dic contains features and corresponding value
        for token in X[i]:
            # accumulate by both label and token 
            docum_label[y[i]][token] = docum_label[y[i]].get(token, 0) + X[i][token]
            # accumulate only by label(because this is sum of all words)
            vocub_label[y[i]] = vocub_label.get(y[i], 0) + X[i][token]
    
    # calculate the p_v
    for label, count in num_label.items():
        p_y[label] = count / num
        
    # calculate the p_v_y
    for label, features in docum_label.items():
        p_v_y[label] = p_v_y.get(label, {})
        # loop the word in the vocabulary
        for word in vocab:
            # if current word with this current label exists document
            if word in features:
                p_v_y[label][word] = (k + features[word]) / (k * len(vocab) + vocub_label[label])
            # if not
            else:
                p_v_y[label][word] = k / (k * len(vocab) + vocub_label[label])
                
    return p_y, p_v_y
    
    
    
    
    

In [43]:
def predict_naive_bayes(D, p_y, p_v_y):
    """
    Runs the prediction rule for Naive Bayes. D is a list of documents,
    where each document is a list of tokens.
    p_y and p_v_y are output from `train_naive_bayes`.
    
    Note that any token which is not in p_v_y should be mapped to
    "<unk>". Further, the input dictionaries are probabilities. You
    should convert them to log-probabilities while you compute
    the Naive Bayes prediction rule to prevent underflow errors.
    
    Returns two values:
        predictions: A list of integer labels, one for each document,
            that is the predicted label for each instance.
        confidences: A list of floats, one for each document, that is
            p(y|d) for the corresponding label that is returned.
    """
    # TODO
    predictions = []
    confidences = []
    # loop through the documents
    for d in D:
        # set variables
        p_d = 0
        max_label = None
        max_p = None
        # get the prob of each label
        for label, features in p_v_y.items():
            # p_y
            p_sum = numpy.log(p_y[label])
            # p_y + sum of (p_v_y) in current label
            for token in d:
                # token in p_v_y
                if token in p_v_y[label]:
                    p_sum += numpy.log(p_v_y[label][token])
                # token not in p_v_y
                else:
                    p_sum += numpy.log(p_v_y[label]["<unk>"])
            # p_d = p_d_v for all labels (sum of situations when label =0 or 1)
            p_d += numpy.exp(p_sum)
            # find the label with max p (since p_d is equal, only compare sum)
            if not max_p or p_sum > max_p:
                max_label = label
                max_p = p_sum
        # add to corresponding list
        predictions.append(max_label)
        if p_d == 0:
            confidences.append(1)
        else:
            confidences.append(numpy.exp(max_p)/p_d)
                    
    return predictions, confidences

In [66]:
import random
def train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, k, vocab, mode):
    """
    Trains the Naive Bayes classifier using the semi-supervised algorithm.
    
    X_sup: A list of the featurized supervised documents.
    y_sup: A list of the corresponding supervised labels.
    D_unsup: The unsupervised documents.
    X_unsup: The unsupervised document representations. # after 1,2,3 method converting (BBoW, CCoW, tfodf)
    D_valid: The validation documents.
    y_valid: The validation labels.
    k: The smoothing parameter for Naive Bayes.
    vocab: The vocabulary as a set of tokens.
    mode: either "threshold" or "top-k", depending on which selection
        algorithm should be used.
    
    Returns the final p_y and p_v_y (see `train_naive_bayes`) after the
    algorithm terminates.    
    """
    # TODO
    X_initial = X_sup
    y_initial = y_sup
        
    # train the model
    p_y, p_v_y = train_naive_bayes(X_initial, y_initial, k, vocab)
    predictions, confidences = predict_naive_bayes(D_valid, p_y, p_v_y)
    initial_accuracy = accuracy_score(y_valid, predictions)
    print("initial accuracy for valid : " + str(initial_accuracy))

    # do the fliter operation

    # threshold
    if mode == "threshold":
        len_new = 0
        len_old = 1
        while len_new != len_old:         
            len_old = len(X_initial)
            p_y, p_v_y = train_naive_bayes(X_initial, y_initial, k, vocab)
            predictions, confidences = predict_naive_bayes(D_unsup, p_y, p_v_y)
            idx = 0
            for conf in confidences:              
                if conf >= 0.98:               
                    X_initial.append(X_unsup[idx])
                    y_initial.append(predictions[idx])
                    del X_unsup[idx]
                    del D_unsup[idx]
                idx += 0
            len_new = len(X_initial)
        predictions, confidences = predict_naive_bayes(D_valid, p_y, p_v_y)
        final_accuracy = accuracy_score(y_valid, predictions)
        print("final accuracy for valid: "  + str(final_accuracy))
        
    # top k
    if mode == "top-k":
        p_y, p_v_y = train_naive_bayes(X_initial, y_initial, k, vocab)
        predictions, confidences = predict_naive_bayes(D_unsup, p_y, p_v_y)
        x_y = list(zip(X_unsup, predictions, confidences))
        sort_x_y = sorted(x_y, key=lambda x: x[2], reverse=True)
        top_k = sort_x_y[:10000]
        X_add, y_add, conf = zip(*top_k)
        X_add = list(X_add)
        y_add = list(y_add)
        X_initial.extend(X_add)
        y_initial.extend(y_add)
        p_y, p_v_y = train_naive_bayes(X_initial, y_initial, k, vocab)
        predictions, confidences = predict_naive_bayes(D_valid, p_y, p_v_y)
        final_accuracy = accuracy_score(y_valid, predictions)
        print("final accuracy for valid: "  + str(final_accuracy))

    return p_y, p_v_y

In [45]:
# Variables that are named D_* are lists of documents where each
# document is a list of tokens. y_* is a list of integer class labels.
# X_* is a list of the feature dictionaries for each document.
D_train, y_train = load_dataset('data/train.jsonl')
D_valid, y_valid = load_dataset('data/valid.jsonl')
D_test, y_test = load_dataset('data/test.jsonl')

vocab = get_vocabulary(D_train)

In [46]:
# Compute the features, for example, using the BBowFeaturizer.
# You actually only need to conver the training instances to their
# feature-based representations.
# 
# This is just starter code for the experiment. You need to fill in
# the rest.

# 1.3 Navie Bayes Experiment
from sklearn.metrics import accuracy_score
K = [0.001, 0.01, 0.1, 1.0, 10.0]
featurizer1 = BBoWFeaturizer()
X_train1 = convert_to_features(D_train, featurizer1, vocab)
X_test1 = convert_to_features(D_test, featurizer1, vocab)
featurizer2 = CBoWFeaturizer()
X_train2 = convert_to_features(D_train, featurizer2, vocab)
X_test2 = convert_to_features(D_test, featurizer2, vocab)
idf = compute_idf(D_train, vocab)
featurizer3 = TFIDFFeaturizer(idf)
X_train3 = convert_to_features(D_train, featurizer3, vocab)
X_test3 = convert_to_features(D_test, featurizer3, vocab)
# for BBoW:
print("BBoW")
for k in K:
    p_y, p_v_y = train_naive_bayes(X_train1, y_train, k, vocab)
    predictions, confidences = predict_naive_bayes(X_test1, p_y, p_v_y)
    acc = accuracy_score(y_test, predictions) 
    print("k = " + str(k) + " : " + str(acc))
# for CCoW:
print("CCoW")
for k in K:
    p_y, p_v_y = train_naive_bayes(X_train2, y_train, k, vocab)
    predictions, confidences = predict_naive_bayes(X_test2, p_y, p_v_y)
    acc = accuracy_score(y_test, predictions) 
    print("k = " + str(k) + " : " + str(acc))
# for TFIDF:
print("TFIDF")
for k in K:
    p_y, p_v_y = train_naive_bayes(X_train3, y_train, k, vocab)
    predictions, confidences = predict_naive_bayes(X_test3, p_y, p_v_y)
    acc = accuracy_score(y_test, predictions) 
    print("k = " + str(k) + " : " + str(acc))

BBoW
k = 0.001 : 0.8524
k = 0.01 : 0.8572
k = 0.1 : 0.8616
k = 1.0 : 0.8656
k = 10.0 : 0.864
CCoW
k = 0.001 : 0.85
k = 0.01 : 0.8552
k = 0.1 : 0.8604
k = 1.0 : 0.8616
k = 10.0 : 0.8632
TFIDF
k = 0.001 : 0.8408
k = 0.01 : 0.8452
k = 0.1 : 0.8484
k = 1.0 : 0.8504
k = 10.0 : 0.8556


In [68]:
# 1.4 Navie Bayes Experiment
import random

S = list(zip(D_train, y_train)) # zip for seperating easily
Num = [50, 500, 5000] # num of initial instances

# BBoW
for num in Num:
    print("BBoW") 
    print("threshold:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode
    
    X_sup = convert_to_features(D_sup, featurizer1, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer1, vocab)
    
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "threshold")
    
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
 
    
for num in Num:
    print("top-k:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode

    X_sup = convert_to_features(D_sup, featurizer1, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer1, vocab)
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "top-k")
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
print("---------------------------------------------------")  

# CCoW
# BBoW
for num in Num:
    print("CCoW") 
    print("threshold:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode
    
    X_sup = convert_to_features(D_sup, featurizer2, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer2, vocab)
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "threshold")
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
    
for num in Num:
    print("top-k:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode

    X_sup = convert_to_features(D_sup, featurizer2, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer2, vocab)
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "top-k")
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
print("---------------------------------------------------")  

# TFIDF
# BBoW
for num in Num:
    print("TFIDF") 
    print("threshold:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode
    
    X_sup = convert_to_features(D_sup, featurizer3, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer3, vocab)
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "threshold")
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
    
for num in Num:
    print("top-k:")
    print(str(num) + ":")
    random.shuffle(S)
    S1 = S[:num]
    S2 = S[num:]
    D_sup, y_sup = zip(*S1)
    
    D_sup = list(D_sup)
    y_sup = list(y_sup)
    
    D_unsup, y_unsup = zip(*S2)
    
    D_unsup = list(D_unsup)
    
    # for threshold mode

    X_sup = convert_to_features(D_sup, featurizer3, vocab)
    X_unsup = convert_to_features(D_unsup, featurizer3, vocab)
    p_y, p_v_y = train_semi_supervised(X_sup, y_sup, D_unsup, X_unsup, D_valid, y_valid, 0.1, vocab, "top-k")
    # for test accuracy
    predidctions, confidences = predict_naive_bayes(D_test, p_y, p_v_y)
    final_accuracy = accuracy_score(y_test, predictions)
    print("final accuracy for test: "  + str(final_accuracy))
    
print("---------------------------------------------------")  


BBoW
threshold:
50:
initial accuracy: 0.5284
final accuracy: 0.5168
final accuracy for test: 0.8556
BBoW
threshold:
500:
initial accuracy: 0.7544
final accuracy: 0.4836
final accuracy for test: 0.8556
BBoW
threshold:
5000:
initial accuracy: 0.8268
final accuracy: 0.5444
final accuracy for test: 0.8556
top-k:
50:
initial accuracy: 0.5188
final accuracy: 0.5168
final accuracy for test: 0.8556
top-k:
500:
initial accuracy: 0.7648
final accuracy: 0.7524
final accuracy for test: 0.8556
top-k:
5000:
initial accuracy: 0.824
final accuracy: 0.7796
final accuracy for test: 0.8556
---------------------------------------------------
CCoW
threshold:
50:
initial accuracy: 0.5776
final accuracy: 0.5168
final accuracy for test: 0.8556
CCoW
threshold:
500:
initial accuracy: 0.7848
final accuracy: 0.5164
final accuracy for test: 0.8556
CCoW
threshold:
5000:
initial accuracy: 0.826
final accuracy: 0.5728
final accuracy for test: 0.8556
top-k:
50:
initial accuracy: 0.5944
final accuracy: 0.6388
final acc