# 1_Sentence Classification Model Building


# Parse & clean labeled training data

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
import os, sys
from sklearn.model_selection import train_test_split
import json
import pandas as pd
import os
import pickle

In [2]:
import xml.etree.ElementTree as ET
# Parse the labeled training XML file and keep a handle on its root element,
# which the following cells iterate over.
tree = ET.parse('Restaurants_Train.xml')
root = tree.getroot()

In [3]:
# Display the root element as a quick check that the document parsed.
root

<Element 'sentences' at 0x1a1ef5f4f8>

In [4]:
# Build one record per <sentence> element for multi-label classification.
# The "aspects" lists must later be encoded with scikit-learn's MultiLabelBinarizer.

labeled_reviews = []
for sentence in root.findall("sentence"):
    # find() returns None when the child is absent. Compare against None
    # explicitly: an Element's truth value only reflects whether it has
    # children, and truth-testing Elements is deprecated.
    aterms_node = sentence.find("aspectTerms")
    aspects_node = sentence.find("aspectCategories")

    aterms = []
    if aterms_node is not None:
        aterms = [aterm.get("term") for aterm in aterms_node.findall("aspectTerm")]

    aspects = []
    if aspects_node is not None:
        aspects = [aspect.get("category")
                   for aspect in aspects_node.findall("aspectCategory")]

    # The sentence text is read from the first child of <sentence>.
    labeled_reviews.append(
        {"text": sentence[0].text, "terms": aterms, "aspects": aspects})

labeled_df = pd.DataFrame(labeled_reviews)
print("there are",len(labeled_reviews),"reviews in this training set")

there are 3044 reviews in this training set


In [5]:
# Save annotated reviews
# Pickle the labeled DataFrame so later cells can reload it without re-parsing the XML.
labeled_df.to_pickle("annotated_reviews_df.pkl")
# Preview the first rows of the labeled data.
labeled_df.head()

Unnamed: 0,aspects,terms,text
0,[service],[staff],But the staff was so horrible to us.
1,"[food, anecdotes/miscellaneous]",[food],"To be completely fair, the only redeeming fact..."
2,[food],"[food, kitchen, menu]","The food is uniformly exceptional, with a very..."
3,[service],[],Where Gabriela personaly greets you and recomm...
4,[anecdotes/miscellaneous],[],"For those that go once and don't enjoy it, all..."


# Training the model with Naive Bayes
1. Replace pronouns using neuralcoref (coreference resolution).
2. Train the model.

In [6]:
#from neuralcoref import Coref
import en_core_web_sm
# Load the small English spaCy model. Note that the name `spacy` is bound to the
# loaded pipeline object (it is later called as spacy(sent)), not to the spaCy package.
spacy = en_core_web_sm.load()
# NOTE(review): `coref` is only created by the commented-out line below, yet
# replace_pronouns() reads it; both neuralcoref lines must be re-enabled before
# that function can be called.
#coref = Coref(nlp=spacy)

In [7]:
# Pronoun replacement helper built on neuralcoref
def replace_pronouns(text):
    """Return `text` with its coreferences resolved.

    Runs one-shot coreference resolution through the module-level `coref`
    object and returns the first resolved utterance it reports.

    NOTE(review): `coref` is not created anywhere in the active code of this
    notebook (its construction is commented out), so it must be defined
    before this function is called.
    """
    coref.one_shot_coref(text)
    resolved_utterances = coref.get_resolved_utterances()
    return resolved_utterances[0]

In [9]:
# Reload the labeled training dataset that was pickled earlier in this notebook.
# The path is relative to the current working directory.
annotated_reviews_df = pd.read_pickle("annotated_reviews_df.pkl")
# Preview the first three rows.
annotated_reviews_df.head(3)

Unnamed: 0,aspects,terms,text
0,[service],[staff],But the staff was so horrible to us.
1,"[food, anecdotes/miscellaneous]",[food],"To be completely fair, the only redeeming fact..."
2,[food],"[food, kitchen, menu]","The food is uniformly exceptional, with a very..."


In [21]:
# Add a "text_pro" column holding each review after pronoun replacement
annotated_reviews_df["text_pro"] = annotated_reviews_df["text"].map(replace_pronouns)

In [22]:
# Uncomment the line below to pickle the frame that now includes "text_pro".
# annotated_reviews_df.to_pickle("annotated_reviews_df2.pkl")

# Load the previously pickled frame with replaced pronouns.
# NOTE: this read is unconditional - it raises if the file is missing, and it
# replaces whatever `annotated_reviews_df` held from the cells above.
annotated_reviews_df = pd.read_pickle("annotated_reviews_df2.pkl")

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each review's list of aspect labels as a binary indicator row.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df.aspects)
# The raw review text is the feature; the pronoun-replaced "text_pro" column is not used.
X = annotated_reviews_df.text

# Hold out 25% of the reviews for testing (fixed seed keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Save the fitted binarizer.
# This is important: it records how the multi-labels were binarized, so it must be
# loaded wherever predictions need to be mapped back to label names.
filename = 'mlb.pkl'
# The context manager closes the file handle once the dump is written.
with open(filename, 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)

In [118]:
## Baseline model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import f1_score
import numpy as np

# Unigram bag-of-words (English stop words removed) feeding a multinomial
# naive Bayes; LabelPowerset wraps it for multi-label classification.
baseline_steps = [
    ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
    ('clf', LabelPowerset(MultinomialNB())),
]
text_clf = Pipeline(baseline_steps)
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# "mean accuracy" is the mean of the element-wise comparison between the
# predicted and true label indicators; F1 is weighted across labels.
baseline_accuracy = np.mean(predicted == y_test)
baseline_f1 = f1_score(y_test, predicted, average='weighted')
print('mean accuracy: {}'.format(baseline_accuracy))
print('f1 score: {}'.format(baseline_f1))

mean accuracy: 0.8625492772667542
f1 score: 0.6767426687378034


In [12]:
## with tfidf transformer:
# Same naive Bayes setup, with a TfidfTransformer step added (IDF weighting
# switched off) and the smoothing parameter alpha set to 1e-2.
tfidf_nb_steps = [
    ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LabelPowerset(MultinomialNB(alpha=1e-2))),
]
text_clf = Pipeline(tfidf_nb_steps)
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
tfidf_nb_accuracy = np.mean(predicted == y_test)
tfidf_nb_f1 = f1_score(y_test, predicted, average='weighted')
print('mean accuracy: {}'.format(tfidf_nb_accuracy))
print('f1 score: {}'.format(tfidf_nb_f1))

mean accuracy: 0.8688567674113009
f1 score: 0.7141696449199172


In [13]:
# Test if SVM performs better
from sklearn.linear_model import SGDClassifier

# SGD with hinge loss and an L2 penalty, seeded for repeatable results.
svm_estimator = SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, max_iter=10, random_state=42)
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-svm', LabelPowerset(svm_estimator)),
])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
svm_accuracy = np.mean(predicted_svm == y_test)
svm_f1 = f1_score(y_test, predicted_svm, average='weighted')
print('mean accuracy: {}'.format(svm_accuracy))
print('f1 score: {}'.format(svm_f1))

mean accuracy: 0.8696452036793693
f1 score: 0.6920267443287555


In [119]:
## random forest
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (and therefore the reported scores)
# reproducible from run to run; 42 matches the seed used for the SGD model.
text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-rf', LabelPowerset(
                             RandomForestClassifier(n_estimators = 100,
                                                    random_state=42)))])
_ = text_clf_rf.fit(X_train, y_train)
predicted_rf = text_clf_rf.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
print('mean accuracy: {}'.format(np.mean(predicted_rf == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_rf, average='weighted')  ))

mean accuracy: 0.8638633377135349
f1 score: 0.684833424807028


In [120]:
# GBDT
from sklearn.ensemble import GradientBoostingClassifier

# A fixed random_state keeps the boosted trees (and the reported scores)
# reproducible from run to run; 42 matches the seed used for the SGD model.
text_clf_gbdt = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-gbdt', LabelPowerset(
                             GradientBoostingClassifier(n_estimators=200,
                                                        random_state=42)))])
_ = text_clf_gbdt.fit(X_train, y_train)

predicted_gbdt = text_clf_gbdt.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
print('mean accuracy: {}'.format(np.mean(predicted_gbdt == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_gbdt, average='weighted')  ))

mean accuracy: 0.859395532194481
f1 score: 0.6954693598579997


In [40]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Bag-of-words -> term-frequency normalisation -> logistic regression (C=10),
# wrapped in LabelPowerset for multi-label prediction.
text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-lr', LabelPowerset(
                             LogisticRegression(C=10)))])
_ = text_clf_lr.fit(X_train, y_train)
predicted_lr = text_clf_lr.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split.
# (A bare mid-cell accuracy expression whose value was discarded has been removed.)
print('mean accuracy: {}'.format(np.mean(predicted_lr == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_lr, average='weighted')  ))

mean accuracy: 0.8877792378449408
f1 score: 0.7511756388967253


In [26]:
# Neural network
from sklearn.neural_network import MLPClassifier

# Two hidden layers (200, 100). A fixed random_state makes weight initialisation
# and shuffling repeatable, so the reported scores are reproducible.
text_clf_mlp = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-mlp', LabelPowerset(
                             MLPClassifier(hidden_layer_sizes=(200,100),
                                           random_state=42)))])
_ = text_clf_mlp.fit(X_train, y_train)
predicted_mlp = text_clf_mlp.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
print('mean accuracy: {}'.format(np.mean(predicted_mlp == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_mlp, average='weighted')  ))

mean accuracy: 0.8825229960578187
f1 score: 0.748642273567034


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Small MLP (20, 10) whose activation function is tuned by cross-validated
# grid search. The estimator is seeded so repeated runs are comparable.
text_clf_mlp = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-mlp', LabelPowerset(
                             MLPClassifier(hidden_layer_sizes=(20,10),max_iter=500,
                                           random_state=42)))])

# Parameter path: pipeline step "clf-mlp" -> LabelPowerset's `classifier` -> `activation`.
param_dict = {
    "clf-mlp__classifier__activation": ['identity', 'logistic', 'tanh','relu']
}
# Shuffled folds are seeded so every run evaluates candidates on the same splits.
kf = kfolds = KFold(n_splits = 4,shuffle=True, random_state=42)
gs = GridSearchCV(text_clf_mlp, param_grid = param_dict, cv = kf)
_ = gs.fit(X_train, y_train)

predicted_mlp = gs.predict(X_test)

# Element-wise label agreement on the held-out split (displayed as the cell output)
np.mean(predicted_mlp == y_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Logistic-regression pipeline tuned over class weighting and solver choice.
text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-lr', LabelPowerset(
                             LogisticRegression(C=10)))])

# Parameter path: pipeline step "clf-lr" -> LabelPowerset's `classifier` -> parameter.
param_dict = {
    # Optional extra grid over the regularisation strength:
    #"clf-lr__classifier__C": [1e-1,1e-2,1e-3,1e-4,1,10,100],
    'clf-lr__classifier__class_weight':['balanced',None],
    'clf-lr__classifier__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
# Shuffled folds are seeded so every run evaluates candidates on the same splits.
kf = kfolds = KFold(n_splits = 5,shuffle=True, random_state=42)
gs = GridSearchCV(text_clf_lr, param_grid = param_dict, cv = kf, verbose=1)
_ = gs.fit(X_train, y_train)

predicted_lr = gs.predict(X_test)

# Element-wise label agreement on the held-out split (displayed as the cell output).
# The original referenced the misspelled name `predicted_Lr`, which raises NameError.
np.mean(predicted_lr == y_test)

In [None]:
# Show the estimator selected by the most recently fitted grid search.
gs.best_estimator_

In [65]:
import pickle
# Refit the `text_clf` pipeline on the full dataset and save the model
from sklearn.linear_model import LogisticRegression

text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-lr', LabelPowerset(
                             LogisticRegression(C=10)))])
# NOTE(review): `text_clf_lr` above is constructed but never fitted or saved here.
# The model that is refit and persisted is `text_clf`, which comes from an earlier
# cell. Confirm which pipeline is meant to be shipped.
text_clf.fit(X, y)


# Save the model to disk; the context manager closes the file handle
# once the dump is written.
filename = 'naive_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [66]:
# Pair each held-out review with its predicted aspect labels; the fitted
# MultiLabelBinarizer maps the binary predictions back to label names.
# NOTE(review): the column is called 'text_pro', but X (and hence X_test) was
# taken from the raw `text` column - confirm the intended name.
pred_df = pd.DataFrame(
    {'text_pro': X_test,
     'pred_category': mlb.inverse_transform(predicted)
    })

In [67]:
# Show full review sentences instead of truncating them. None means "no column
# width limit" (pandas >= 1.0); the older -1 sentinel is deprecated.
pd.set_option('display.max_colwidth', None)
pred_df.head()

Unnamed: 0,text_pro,pred_category
453,It's better than being on the roof of Sutton Place with 19 year old interns jabbing you in the ribs all night.,"(anecdotes/miscellaneous,)"
1611,"Don't expect to sit down inside though, there are only a few tables and they are always full.","(ambience,)"
2078,"Again, if you are in this neighborhood - by all means, come here.","(anecdotes/miscellaneous,)"
2715,Go there to relax and feel like your somewhere else.,"(ambience,)"
2602,"As far as the service goes, the waitresses were not particularly friendly, but they got the job done.","(service,)"


In [68]:
ft_home = './'
words_to_load = 100000

import numpy as np

# Load the first `words_to_load` word vectors from the fastText text file.
# Row 0 is the all-zero <pad> vector and row 1 a small random <unk> vector,
# so real words start at row 2.
# The encoding is given explicitly so the result does not depend on the platform default.
with open(ft_home + 'wiki-news-300d-1M.vec', encoding='utf-8') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    loaded_embeddings_ft[1,:]=np.random.randn(300)*0.01
    words_ft = {'<pad>':0,'<unk>':1}
    idx2words_ft = {0:'<pad>',1:'<unk>'}
    ordered_words_ft = ['<pad>','<unk>']
    next_row = 2
    for line_no, line in enumerate(f):
        if next_row >= words_to_load + 2:
            break
        s = line.split()
        # fastText .vec files start with a "<vocab_size> <dim>" header line.
        # Without this check it would be stored as a word whose vector is the
        # dimension value broadcast across the whole row.
        if line_no == 0 and len(s) == 2:
            continue
        loaded_embeddings_ft[next_row, :] = np.asarray(s[1:], dtype=float)
        words_ft[s[0]] = next_row
        idx2words_ft[next_row] = s[0]
        ordered_words_ft.append(s[0])
        next_row += 1

In [69]:
import string
punctuations = string.punctuation

def tokenize(sent):
    """Tokenize a sentence and lowercase its tokens.

    Removes literal '<br />' markers, runs the module-level `spacy` pipeline
    on the result, skips every token whose text occurs in the
    `string.punctuation` string, and returns the remaining token texts in
    lowercase.
    """
    cleaned = sent.replace('<br />','')
    lowered_tokens = []
    for token in spacy(cleaned):
        if token.text in punctuations:
            continue
        lowered_tokens.append(token.text.lower())
    return lowered_tokens

def tokenize_dataset(dataset):
    """Apply `tokenize` to every sentence in `dataset`.

    Returns a list with one token list per input sentence, in input order.
    """
    return [tokenize(sent) for sent in dataset]

def token2id(data,max_length):
    """Convert tokenized sentences to vocabulary ids, truncated to `max_length`.

    Each word is looked up in the module-level `words_ft` mapping; words that
    are not in it are given the id of '<unk>'. Every sentence's id list is cut
    to at most `max_length` entries. Returns a list of id lists.
    """
    data_id = []
    for sentence in data:
        sentence_ids = [
            words_ft[word] if word in words_ft else words_ft['<unk>']
            for word in sentence
        ]
        data_id.append(sentence_ids[:max_length])
    return data_id

def padding_embedding(data):
    """Pad id sequences and expand them into flattened embedding vectors.

    Each sentence (a list of vocabulary ids) is right-padded with 0 up to the
    module-level MAX_SENTENCE_LENGTH, every id is replaced by its row of
    `loaded_embeddings_ft`, and the rows are concatenated into one flat list.

    Returns a list with one flat list of embedding values per sentence.
    Sentences longer than MAX_SENTENCE_LENGTH are not supported (np.pad
    rejects a negative pad width), so truncate beforehand.
    """
    data_paded_embeded = []
    for sent in data:
        # Force an integer dtype: np.array([]) defaults to float64, which
        # cannot be used to index the embedding matrix, so an empty sentence
        # would otherwise crash.
        sent_ids = np.array(sent, dtype=int)
        sent_pad = np.pad(sent_ids,
                          pad_width=(0, MAX_SENTENCE_LENGTH - len(sent_ids)),
                          mode="constant", constant_values=0)
        # Index all rows at once and flatten them in row order.
        data_paded_embeded.append(list(loaded_embeddings_ft[sent_pad].ravel()))
    return data_paded_embeded

In [311]:
# Token-id view of the training text, kept under its own name.
# This cell previously overwrote X_train and read MAX_SENTENCE_LENGTH before it
# was defined, so a top-to-bottom run failed here and the following cell could
# no longer tokenize X_train. Defining the constant here and leaving X_train
# untouched makes the cell safe to run in order and to re-run.
MAX_SENTENCE_LENGTH = 20
X_train_ids = token2id(tokenize_dataset(X_train), MAX_SENTENCE_LENGTH)

In [74]:
# Maximum number of tokens kept per sentence
MAX_SENTENCE_LENGTH = 20

# Test split: text -> tokens -> vocabulary ids -> padded, flattened embeddings
X_test = padding_embedding(
    token2id(tokenize_dataset(X_test), MAX_SENTENCE_LENGTH))

# Training split goes through the same three steps
X_train = padding_embedding(
    token2id(tokenize_dataset(X_train), MAX_SENTENCE_LENGTH))

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier

# Logistic regression (wrapped in LabelPowerset) trained on the embedding
# features prepared in the previous cell; no vectorizer step is used.
embedding_lr = LabelPowerset(LogisticRegression(C=10))
text_clf = Pipeline([('clf', embedding_lr)])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Element-wise label agreement and weighted F1 on the held-out split
embedding_accuracy = np.mean(predicted == y_test)
embedding_f1 = f1_score(y_test, predicted, average='weighted')
print('mean accuracy: {}'.format(embedding_accuracy))
print('f1 score: {}'.format(embedding_f1))

mean accuracy: 0.8654402102496714
f1 score: 0.695831232447296
