In this notebook we compare 2 simple text classifiers. One is SVC and another is a neural network with few layers. We use NLTK and wordnet database for tokenization and lemmatization and sklearn's TfidfVectorizer and gensim's word2vec to vectorize words for models.
First we're going to load some labeled texts. Labels are 0 or 1 for negative or positive sentiment they express respectively. Then we'll brake texts into separate words, lemmatize and exclude stop-words. Next we'll train our models. SVC will use tfid vectorization, while neural natwork will deal with word2vec text representation model.


Data loading logic

In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_dataset(path, sep="\t", header=None, train_ratio = 0.7):
    """Loads labeled dataset and returns (train, test) pd dataframe tuple 
    """
    d = pd.read_csv(path,sep=sep,header=header)
    train_count = round(train_ratio*d.shape[0])
    d_train = d[:train_count]
    d_test = d[train_count:d.shape[0]]
    return (d_train, d_test.reset_index(drop=True))

Data preparation logic

In [9]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
lm = WordNetLemmatizer()

In [118]:
def get_wordnet_pos(pos_tag):
    """Converts POS returned by nltk.pos_tag to wordnet's POS
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        #Noun is a default case for WordNetLemmatizer
        #so if we can't tell the lemma, lemmatizer consider it as a noun
        #thus it's correct to return noun by default
        return wordnet.NOUN

def lemma_by_pos(tagged_word):
    """Get word's lemma by its pos-tag
    For instance: lemma_by_pos(("going","v")) returns "go"
    """
    return lm.lemmatize(tagged_word[0],get_wordnet_pos(tagged_word[1]))

def tokenize_and_lemmatize(texts):
    """Tokenizes and then lemmatizes list of texts into list of lists of word lemmas
    """
    #Get a list of pos-tagged and tokenized text
    tag_tok = [nltk.pos_tag(word_tokenize(text)) for text in texts]
    return [[lemma_by_pos(tok) for tok in tok_list] for tok_list in tag_tok]

#Supposingly, these words are not usefull for classification so they are to be excluded from texts
stop_words = stopwords.words('english')
stop_words.extend(['.', ';', '!',',','-','&','...','?',':',')','(',"'s","n't","''","``","'"]+[i for i in "1234567890"])
def prepare_data(texts):
    """Lematize the list of documents and drop stop words from it.
    Returns list of list of tokens
    """
    tokenized = tokenize_and_lemmatize(texts)
    return [[token for token in text if (not token in stop_words)]for text in tokenized]

#This is for SVC classifier as it uses TfidfVectorizer which needs raw text lists
def prepare_data_raw(texts):
    """Lematize the list of documents and drop stop words from it.
    Returns list of string texts
    """
    return [" ".join(text) for text in prepare_data(texts)]

#This is for NN classifier, which is using (n,100,300) tensors in which each of n sample texts
#is being represented as a seqence of 300-dimension vectors (each word is a vector) and padded with zero vectors
#to have length of 100. This illustrates what was said above:
#
#       |sample_1| 
#       |sample_2|
#data = |...     |, where sample_i = (word_1, word_2, ..., word_100), 
#       |...     |   where word_i = (w1, w2, ..., w300),
#       |sample_n|   where w_i is a float number.
#
def prepare_data_v2w(texts, wvmodel):
    """Lemmatize the list of documents and drop stop words from it.
    Returns (n, 100, 300) tensor for built NN
    """
    vectors = [[wvmodel.wv[word] for word in text] for text in prepare_data(texts)]
    return sequence.pad_sequences(vectors, maxlen=20,value=[np.array(range(100))])

Classification logic

In [131]:
#SVC
from sklearn import datasets, grid_search
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import KFold

def classify_SVC(data, target, t_data, t_target):
    """Returns SVC model and its score on given data
    """
    #Note! We use both test and train data to construct a dictionary
    prepared_dictionary = prepare_data(np.concatenate([data,test_data]))
    prepared_data = prepare_data(data)
    #Find tfid representation among all documents
    raw = " ".join([word for text in prepared_dictionary for word in text])
    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform([raw])
    #Find the best C parametr for SVC classifier using grid search
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    cv = KFold(target.size, n_folds=5, shuffle=True, random_state=241)
    model = SVC(kernel='linear', random_state=241)
    gs = grid_search.GridSearchCV(model, grid, scoring='accuracy', cv=cv)
    gs.fit(vectorizer.transform(prepare_data_raw(data)), target)
    best_param = max(gs.grid_scores_,key=lambda x: x.mean_validation_score)
    #Teach SVC with the best C
    model = SVC(kernel='linear', random_state=241, C=best_param[0]['C'])
    model.fit(vectorizer.transform(prepare_data_raw(data)), target)
    return (model, model.score(vectorizer.transform(prepare_data_raw(t_data)),t_target))
#NN
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU, Dropout, BatchNormalization
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import RMSprop, Adam
from keras.preprocessing import sequence
from gensim.models import word2vec
from keras.utils import to_categorical

def classify_NN(data, target, t_data, t_target):
    #Note! We use both test and train data to construct a dictionary
    prepared_dictionary = prepare_data(np.concatenate([data,test_data]))
    prepared_data = prepare_data(data)
    #Build the word to vec model
    wvmodel = word2vec.Word2Vec(prepared_dictionary,iter=20,min_count = 1,size=100,workers=4)
    #Build NN
    input = Input(shape=(None, 100))
    #out = GRU(100, return_sequences=True)(input)
    out = Dense(64)(input)
    #out = Dense(256)(out)
    out = LSTM(32)(out)
    out = Dense(2, activation='softmax')(out)
    model = Model(input, out)
    model.compile("adam", loss='categorical_crossentropy')
    #Converts a class vector (integers) to binary class matrix (because the loss is categorical_crossentropy)
    target = to_categorical(target)
    #Train model
    model.fit(prepare_data_v2w(data, wvmodel),target,epochs=4)
    #Test and return score and the model
    return (model, model.test_on_batch(prepare_data_v2w(test_data, wvmodel),to_categorical(test_target)))

Now we load the data try out classifiers 

In [135]:
train,test = load_dataset("yelp_labelled.txt")
target = train[1]
data = train[0]
test_target = test[1]
test_data = test[0]

In [136]:
print(f"SVC score: {classify_SVC(data,target,test_data,test_target)[1]}")

SVC score: 0.71


In [137]:
print(f"NN score: {classify_NN(data,target,test_data,test_target)[1]}")

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
NN score: 0.7270189523696899
