In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import coo_matrix
# Build the English stop-word set used by filter_stopwords().
# Note: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain Python set of words.
try:
    stopwords = set(stopwords.words("english"))
except LookupError:
    # The corpus has not been downloaded on this machine yet; fetch it and
    # retry so the cell works under "Restart Kernel -> Run All" on a fresh setup.
    import nltk
    nltk.download("stopwords")
    stopwords = set(stopwords.words("english"))

# Shared Porter stemmer instance used by stem().
ps = PorterStemmer()

In [2]:
import nltk

# Download the NLTK resources this notebook relies on: the Punkt tokenizer
# models used by nltk.word_tokenize and the English stop-word corpus.
# Recent NLTK releases load the tokenizer from "punkt_tab" instead of "punkt",
# so both are requested to keep the notebook runnable across NLTK versions.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data Loader

In [3]:
def load_data(file_name):
    """
    Read a CSV file and return its "id", "text" and "label" columns.

    :param file_name: path of the CSV file to read, type: str
    return a tuple (ids, texts, labels); each element is the pandas Series
    holding the corresponding column of the file
    """
    frame = pd.read_csv(file_name)
    ids, texts, labels = (frame[column] for column in ("id", "text", "label"))
    return ids, texts, labels

def load_labels(file_name):
    """
    Read a CSV file and return only its "label" column.

    :param file_name: path of the CSV file to read, type: str
    return the "label" column as a pandas Series
    """
    answers = pd.read_csv(file_name)
    return answers["label"]

def write_predictions(file_name, pred):
    """
    Write predictions to a CSV file with the columns "id" and "label".

    :param file_name: path of the CSV file to write, type: str
    :param pred: predicted labels, one per example, in example order
    The "id" of each row is the position of the prediction in ``pred``.
    """
    # Passing the column names to the constructor (rather than assigning them
    # afterwards) keeps this working for an empty ``pred``: the result is then
    # a header-only file instead of a ValueError about mismatched column counts.
    df = pd.DataFrame(list(enumerate(pred)), columns=["id", "label"])
    df.to_csv(file_name, index=False)

## Feature Extractor

In [5]:
def tokenize(text):
    """Split a text string into tokens with NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens
def stem(tokens):
    """Return a new list with every token replaced by its Porter stem."""
    return list(map(ps.stem, tokens))
def n_gram(tokens, n=1):
    """
    Build the n-grams of a token list.

    :param tokens: a list of tokens, type: list
    :param n: the n-gram size, type: int
    return the input list itself when n is 1; otherwise a new list in which
    each element is n consecutive tokens joined by a single space
    """
    if n == 1:
        return tokens
    # One n-gram starts at every position that still has n tokens after it.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]
def filter_stopwords(tokens):
    """Return the tokens that are neither in the stop-word set nor purely numeric."""
    kept = []
    for token in tokens:
        if token in stopwords or token.isnumeric():
            continue
        kept.append(token)
    return kept

In [6]:
def get_onehot_vector(feats, feats_dict):
    """
    Encode a list of features as a binary presence vector.

    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a 1-D float64 numpy array of length len(feats_dict) whose element
    at a feature's index is 1 if that feature occurs in ``feats`` and 0
    otherwise; features missing from ``feats_dict`` are ignored
    """
    # initialize the vector as all zeros
    # (builtin float is float64; the np.float alias was removed in NumPy 1.24)
    vector = np.zeros(len(feats_dict), dtype=float)
    for f in feats:
        # get the feature index, -1 marks a feature that is not in the dict
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1 (repeats stay at 1)
            vector[f_idx] = 1
    return vector

## Classifier

We will use the Gaussian Naive Bayes algorithm.
For more information, see the scikit-learn documentation: https://scikit-learn.org/stable/modules/naive_bayes.html

In [7]:
# Baseline Gaussian Naive Bayes classifier, created with scikit-learn's default settings.
clf0 = GaussianNB()

In [None]:
# Locations of the input data and of the prediction output.
train_file, test_file = "data/train.csv", "data/test.csv"
ans_file, pred_file = "data/answer.csv", "data/pred.csv"

# Load the data; the label column of the test file is discarded and the
# test labels are read from the answer file instead.
train_ids, train_texts, train_labels = load_data(train_file)
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(ans_file)

# Feature extraction, step 1: tokenize every text.
train_tokens = list(map(tokenize, train_texts))
test_tokens = list(map(tokenize, test_texts))

# Step 2: stem every token.
train_stemmed = list(map(stem, train_tokens))
test_stemmed = list(map(stem, test_tokens))

# Step 3: build 2-, 3- and 4-grams from the stemmed tokens
# (before stop words are removed from them).
train_2_gram, train_3_gram, train_4_gram = (
    [n_gram(tokens, size) for tokens in train_stemmed] for size in (2, 3, 4)
)
test_2_gram, test_3_gram, test_4_gram = (
    [n_gram(tokens, size) for tokens in test_stemmed] for size in (2, 3, 4)
)

# Step 4: remove stop words and numeric tokens from the stemmed unigrams,
# which yields a cleaner token list.
train_stemmed = list(map(filter_stopwords, train_stemmed))
test_stemmed = list(map(filter_stopwords, test_stemmed))

We only keep features that occur more than 10 times in the training set.

In [None]:
# A feature is kept only if it occurs strictly more than this many times
# in the training set.
MIN_FEAT_COUNT = 10

# Count how often each feature occurs in the training set,
# e.g., {"text": 2, "mine": 1}; one Counter per feature family.
stemmed_feat_cnt = Counter(chain.from_iterable(train_stemmed))
bi_gram_feat_cnt = Counter(chain.from_iterable(train_2_gram))
tri_gram_feat_cnt = Counter(chain.from_iterable(train_3_gram))
four_gram_feat_cnt = Counter(chain.from_iterable(train_4_gram))

# Build the set of unique features (stemmed tokens and 2/3/4-grams) that
# occur more than MIN_FEAT_COUNT times in the training set.
feats_set = set()
for feat_cnt in (stemmed_feat_cnt, bi_gram_feat_cnt,
                 tri_gram_feat_cnt, four_gram_feat_cnt):
    feats_set.update(f for f, cnt in feat_cnt.items() if cnt > MIN_FEAT_COUNT)

print("Size of features:", len(feats_set))

# Build the feature dict mapping each feature to its index.
# The features are sorted first because the iteration order of a set of
# strings can change between interpreter sessions; sorting makes the
# feature-to-column assignment reproducible from run to run.
feats_dict = {feat: idx for idx, feat in enumerate(sorted(feats_set))}

In [None]:
def _build_feats_matrix(feats_list, feats_dict):
    """
    Build a binary example-by-feature matrix in COO sparse format.

    :param feats_list: one list of features per example, type: list of lists
    :param feats_dict: a dict from features to column indices, type: dict
    return a coo_matrix of shape (len(feats_list), len(feats_dict)) with a
    float 1 wherever an example contains a known feature
    """
    row_idx, col_idx = [], []
    for row, feats in enumerate(feats_list):
        # A set collapses repeated features so every cell is stored once
        # (COO would otherwise sum duplicates into values above 1);
        # features that are not in feats_dict are ignored.
        active = sorted({feats_dict[f] for f in feats if f in feats_dict})
        row_idx.extend([row] * len(active))
        col_idx.extend(active)
    rows = np.asarray(row_idx, dtype=np.intp)
    cols = np.asarray(col_idx, dtype=np.intp)
    data = np.ones(len(cols), dtype=float)
    return coo_matrix((data, (rows, cols)),
                      shape=(len(feats_list), len(feats_dict)))


# Build the feature lists: for every example, concatenate the stemmed token
# list and all n-gram lists together.
train_feats = [
    unigrams + bigrams + trigrams + fourgrams
    for unigrams, bigrams, trigrams, fourgrams
    in zip(train_stemmed, train_2_gram, train_3_gram, train_4_gram)
]
test_feats = [
    unigrams + bigrams + trigrams + fourgrams
    for unigrams, bigrams, trigrams, fourgrams
    in zip(test_stemmed, test_2_gram, test_3_gram, test_4_gram)
]

# Build the feature matrices.
# Each example becomes one binary row. The rows are written straight into a
# COO sparse matrix instead of stacking dense one-hot vectors first, so the
# full dense matrix is not materialised while the features are being built.
# See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html and
# https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO) for details.
train_feats_matrix = _build_feats_matrix(train_feats, feats_dict)
test_feats_matrix = _build_feats_matrix(test_feats, feats_dict)

# Fit the feature matrix and labels to train the classifier.
# Since the classifier can only process matrices in the dense format,
# we use the toarray() function to get the dense representation of the sparse
# matrix before passing it to the classifier.
clf0.fit(train_feats_matrix.toarray(), train_labels.values)

Evaluate the classifier on the training and test sets.

In [None]:
# Predict on the training set first, then on the test set; the classifier
# needs dense input, so each sparse matrix is converted with toarray().
train_pred, test_pred = (
    clf0.predict(matrix.toarray())
    for matrix in (train_feats_matrix, test_feats_matrix)
)

# Accuracy of the predictions against the true labels of each split.
train_score = accuracy_score(train_labels.values, train_pred)
test_score = accuracy_score(test_labels.values, test_pred)

for caption, score in (("training accuracy", train_score),
                       ("test accuracy", test_score)):
    print(caption, score)

## Cross Validation and Ensemble

We can combine cross-validation with an ensemble of the per-fold classifiers to reduce overfitting and to make the results less sensitive to the randomness of a single train/validation split.

In [None]:
# n_fold document: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
n_fold = 3
np.random.seed(0)
# Create the n-fold generator. The shuffle seed is passed explicitly so the
# folds do not depend on whatever has consumed the global NumPy random state.
skf = StratifiedKFold(n_fold, shuffle=True, random_state=0)

# Convert the sparse training matrix to dense once and reuse it, instead of
# calling toarray() again for every fit / predict inside the loop.
train_X = train_feats_matrix.toarray()
train_y = train_labels.values

clfs_1 = list()
valid_acc_list = list()
for k, (train_idx, valid_idx) in enumerate(skf.split(train_X, train_labels)):
    # build the classifier and train it on this fold's training part
    clf = GaussianNB()
    clf.fit(train_X[train_idx], train_y[train_idx])

    # get the predictions of the classifier on both parts of the fold
    train_pred = clf.predict(train_X[train_idx])
    valid_pred = clf.predict(train_X[valid_idx])

    # compute accuracy scores
    train_score = accuracy_score(train_y[train_idx], train_pred)
    valid_score = accuracy_score(train_y[valid_idx], valid_pred)

    print("training accuracy", train_score)
    print("validation accuracy", valid_score)

    # keep the fitted classifier and its validation accuracy
    clfs_1.append(clf)
    valid_acc_list.append(valid_score)

print('Average validation score: ', sum(valid_acc_list)/len(valid_acc_list))