# Aspect category classification 

In [1]:
import pandas as pd
import numpy as np
# Show full cell contents (review text) instead of truncating long strings.
# None is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Read the pre-built stratified train/test CSV splits.
train_grp_df, test_grp_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/resturant_train_stratified_grouped.csv',
                     '../data/resturant_test_stratified_grouped.csv')
)
# Cell output: (rows, columns) of each split.
(train_grp_df.shape, test_grp_df.shape)

((2206, 5), (649, 5))

In [3]:
# Peek at the first two raw rows as loaded from the CSV.
train_grp_df.head(n=2)

Unnamed: 0,aspects,polarities,text,length,ind
0,['service'],['positive'],Service is fast and friendly.,1,0
1,['anecdotes/miscellaneous'],['negative'],I HATE HATE HATE this place.,1,0


The first step of Aspect Based Sentiment Analysis is to extract the aspect categories hidden in a given text/review. We use a supervised method for this: a linear classifier trained with SGD on tf-idf vectors (called the "SVM" model in the code below, although it is configured with a logistic loss) for this multi-label classification task.

In [4]:
import ast


def _parse_list_literal(value):
    """Turn a stringified Python list (as read from the CSV) into a real list.

    Values that are not strings are returned unchanged, so re-running this
    cell on frames that were already parsed does not raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The CSV stores the label lists as their string repr, e.g. "['service']".
for frame in (train_grp_df, test_grp_df):
    for column in ('aspects', 'polarities'):
        frame[column] = frame[column].apply(_parse_list_literal)

In [5]:
# Same two rows again, now with the label columns parsed into lists.
train_grp_df.head(n=2)

Unnamed: 0,aspects,polarities,text,length,ind
0,[service],[positive],Service is fast and friendly.,1,0
1,[anecdotes/miscellaneous],[negative],I HATE HATE HATE this place.,1,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import codecs

# Tokenizer, stop-word set and lemmatizer are built once and shared by all calls.
_PARSE_RESOURCES = {}


def _get_parse_resources():
    """Build (on first use) and return the shared tokenizer, stop words and lemmatizer."""
    if not _PARSE_RESOURCES:
        built = {
            'tokenize': CountVectorizer().build_tokenizer(),
            # A set gives constant-time membership tests for the stop-word filter.
            'stop': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        # Publish only after everything was built, so a failure half-way
        # (e.g. a missing NLTK corpus) does not leave a partial cache behind.
        _PARSE_RESOURCES.update(built)
    return _PARSE_RESOURCES


def parse_sentence(line):
    """Lower-case, tokenize, drop English stop words and lemmatize a sentence.

    Parameters
    ----------
    line : str
        Raw review text. Any non-string value (e.g. NaN for a missing text)
        yields an empty list.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, stop words removed.

    Notes
    -----
    Errors raised by the tokenizer/NLTK resources themselves are no longer
    swallowed: a missing corpus now fails loudly instead of silently
    producing an empty token list for every row.
    """
    if not isinstance(line, str):
        return []

    resources = _get_parse_resources()
    stop_words = resources['stop']
    lemmatize = resources['lemmatizer'].lemmatize
    tokens = resources['tokenize'](line.lower())
    return [lemmatize(token) for token in tokens if token not in stop_words]


In [7]:
# For each split: 'pro_text' holds the processed token list, 'clean_text' the
# same tokens joined by single spaces (the string form fed to the vectorizer).
for split_df in (train_grp_df, test_grp_df):
    split_df['pro_text'] = split_df['text'].apply(parse_sentence)
    split_df['clean_text'] = split_df['pro_text'].apply(' '.join)

We preprocess the text by tokenizing, removing stop words and lemmatizing, and then create tf-idf vectors from the processed text. On top of these vectors we train a multi-label linear classifier (the `clf-svm` pipeline step) to predict the aspects.

In [8]:
# Check the new 'pro_text' / 'clean_text' columns on the first two rows.
train_grp_df.head(n=2)

Unnamed: 0,aspects,polarities,text,length,ind,pro_text,clean_text
0,[service],[positive],Service is fast and friendly.,1,0,"[service, fast, friendly]",service fast friendly
1,[anecdotes/miscellaneous],[negative],I HATE HATE HATE this place.,1,0,"[hate, hate, hate, place]",hate hate hate place


In [9]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer ,TfidfTransformer
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import SGDClassifier

In [10]:
# Convert the multi-labels into binary indicator arrays
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
mlb = MultiLabelBinarizer()
# Fit the label vocabulary on the training labels only ...
y_train = mlb.fit_transform(train_grp_df.aspects)
# ... and only *transform* the test labels, so both matrices share the same
# column order and the binarizer is never refit on test data.
y_test = mlb.transform(test_grp_df.aspects)


In [11]:
# First five rows of the binary label matrix (one column per aspect class).
y_train[:5]

array([[0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1]])

In [12]:
# Bag-of-words counts -> tf-idf weights -> label-powerset classifier.
# Despite the 'clf-svm' step name, loss='log' makes the SGD model a logistic
# regression, which is what makes predict_proba available later on.
sgd_base = SGDClassifier(loss='log', penalty='l2', l1_ratio=0.4,
                         alpha=1e-4, max_iter=40, random_state=42)
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', LabelPowerset(sgd_base)),
]
text_clf_svm = Pipeline(pipeline_steps)
_ = text_clf_svm.fit(train_grp_df['clean_text'], y_train)

predicted_svm = text_clf_svm.predict(test_grp_df['clean_text'])
# Mean element-wise agreement between predicted and true label matrices.
test_label_accuracy = np.mean(predicted_svm == y_test)
print('Test accuracy', test_label_accuracy)



Test accuracy 0.8665639445300463


In [13]:
def make_pred(row, thresh, cols=None):
    """Select the aspect labels whose score in ``row`` exceeds ``thresh``.

    Parameters
    ----------
    row : pandas.Series
        One row holding a score for every label in ``cols`` (extra entries
        are ignored).
    thresh : float
        A label is selected when its score is strictly greater than this.
    cols : sequence of str, optional
        Labels to consider, in output order. Defaults to the classes of the
        notebook-level ``mlb`` binarizer, which keeps existing
        ``make_pred(row, thresh)`` calls working unchanged.

    Returns
    -------
    list of str
        All labels above the threshold, in ``cols`` order. If none qualifies,
        a one-element list with the highest-scoring label, so every row gets
        at least one aspect.
    """
    cols = mlb.classes_.tolist() if cols is None else list(cols)

    res_asp = [label for label in cols if row[label] > thresh]
    if not res_asp:
        # Nothing cleared the threshold: fall back to the single best label.
        res_asp = [cols[int(np.argmax(row[cols].tolist()))]]
    return res_asp
        

In [14]:
# Per-aspect scores from the fitted pipeline; predict_proba returns a sparse
# matrix, hence the .toarray(). Columns follow the order of mlb.classes_.
test_prob_df = pd.DataFrame(
    text_clf_svm.predict_proba(test_grp_df['clean_text']).toarray(),
    columns=mlb.classes_,
    index=test_grp_df.index,
)
# Drop score columns left over from a previous run of this cell first;
# otherwise the merge would create '<aspect>_x' / '<aspect>_y' duplicates and
# the plain '<aspect>' columns read by make_pred would disappear.
test_grp_df = pd.merge(
    test_grp_df.drop(columns=list(mlb.classes_), errors='ignore'),
    test_prob_df,
    left_index=True,
    right_index=True,
)

In [15]:
# Row-wise aspect selection with a 0.5 score threshold.
test_grp_df['aspects_pred'] = test_grp_df.apply(make_pred, axis=1, args=(0.5,))

In [16]:
# Compare true and predicted aspects side by side for the first few reviews.
test_grp_df.loc[:, ['text', 'aspects', 'aspects_pred']].head()

Unnamed: 0,text,aspects,aspects_pred
0,"Our teenage kids love it, too.",[anecdotes/miscellaneous],[anecdotes/miscellaneous]
1,I recommend to anyone who wants to dress up and impress the lady.,[anecdotes/miscellaneous],[anecdotes/miscellaneous]
2,He has visited Thailand and is quite expert on the cuisine.,[food],[food]
3,We were seated outside and the waiter spilled red wine and hot tea on myself and my date.,[service],[food]
4,"The crust is thin, the ingredients are fresh and the staff is friendly.","[food, service]",[food]


In [34]:
from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score
# Scores for AUC come straight from predict_proba; F1 uses the thresholded
# 'aspects_pred' labels; accuracy uses the pipeline's own predict() output.
test_scores = text_clf_svm.predict_proba(test_grp_df['clean_text']).toarray()
test_pred_bin = mlb.transform(test_grp_df.aspects_pred)

print("AUC ROC", roc_auc_score(y_test, test_scores))
print("F1 score", f1_score(y_test, test_pred_bin, average='macro'))
print('Overall accuracy', np.mean(predicted_svm == y_test))

AUC ROC 0.9265175034852803
F1 score 0.6629534590547215
Overall accuracy 0.8665639445300463


In [19]:
print('Accuracy for different aspect categories ')
# Binarize the predictions once; column i corresponds to mlb.classes_[i], so
# names and columns cannot get out of sync as they could with hard-coded pairs.
aspects_pred_bin = mlb.transform(test_grp_df.aspects_pred)
for col_idx, aspect in enumerate(mlb.classes_):
    print(aspect, accuracy_score(y_test[:, col_idx], aspects_pred_bin[:, col_idx]))

Accuracy for different aspect categories 
ambience 0.8936825885978429
anecdotes/miscellaneous 0.8012326656394453
food 0.847457627118644
price 0.9198767334360555
service 0.8828967642526965


### Accuracy on conflicting statements (reviews whose sentiment changes across aspects)


In [36]:
# Evaluate only the rows flagged ind == 1 (presumably the reviews with
# conflicting sentiment across aspects, per this section's heading).
test_grp_df2 = test_grp_df[test_grp_df['ind']==1]
# Reuse the already-fitted binarizer: refitting it on this subset would
# overwrite the shared `mlb` and could shift columns if a class is absent here.
y_test_conf = mlb.transform(test_grp_df2.aspects)
conf_pred_bin = mlb.transform(test_grp_df2.aspects_pred)

# The recorded output of this cell includes an "AUC ROC" line, so it is
# computed here the same way as for the full test set.
print("AUC ROC", roc_auc_score(y_test_conf, text_clf_svm.predict_proba(test_grp_df2['clean_text']).toarray()))
print("F1 score", f1_score(y_test_conf, conf_pred_bin, average='macro'))
print('Overall accuracy', np.mean(text_clf_svm.predict(test_grp_df2['clean_text']) == y_test_conf))

AUC ROC 0.9007480631574447
F1 score 0.5800866845743291
Overall accuracy 0.7533980582524272


In [37]:
print('Accuracy for different aspect categories ')
# Binarize the subset predictions once; column i corresponds to
# mlb.classes_[i], which replaces five hard-coded name/index pairs.
conf_aspects_pred_bin = mlb.transform(test_grp_df2.aspects_pred)
for col_idx, aspect in enumerate(mlb.classes_):
    print(aspect, accuracy_score(y_test_conf[:, col_idx], conf_aspects_pred_bin[:, col_idx]))

Accuracy for different aspect categories 
ambience 0.6796116504854369
anecdotes/miscellaneous 0.7572815533980582
food 0.8446601941747572
price 0.8058252427184466
service 0.7572815533980582


The overall accuracy on the test data is around 87% and the macro F1 score is around 66%. We will try some LSTM + word-embedding based techniques to improve the performance on this multi-label classification problem.