# Aspect category classification using Deep Learning

In [1]:
import torch 
from torchtext.data import Field
from torchtext.data import Iterator , BucketIterator

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth' , -1)
import torch 
import torch.nn as nn

from torchtext import data
from torchtext import datasets

from torchtext.data import Field 
from torchtext.data import Iterator , BucketIterator
import torch.optim as optim
import os

In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
train_grp_df = pd.read_csv('../data/resturant_train_stratified_grouped.csv')
test_grp_df  = pd.read_csv('../data/resturant_test_stratified_grouped.csv')
print('Train data', train_grp_df.shape) 
print('Test data',test_grp_df.shape)

import ast 
train_grp_df['aspects'] = train_grp_df['aspects'].apply(lambda x: ast.literal_eval(x))
train_grp_df['polarities'] = train_grp_df['polarities'].apply(lambda x: ast.literal_eval(x))

test_grp_df['aspects'] = test_grp_df['aspects'].apply(lambda x: ast.literal_eval(x))
test_grp_df['polarities'] = test_grp_df['polarities'].apply(lambda x: ast.literal_eval(x))

Train data (2206, 5)
Test data (649, 5)


Changing column format to input it to TorchText data loader

In [5]:
aspects = ['ambience' , 'anecdotes/miscellaneous' , 'food' , 'service' , 'price']
for col in aspects:
    train_grp_df[col] = train_grp_df['aspects'].apply(lambda x: 1 if col in x else 0)
    test_grp_df[col] = test_grp_df['aspects'].apply(lambda x: 1 if col in x else 0)

In [6]:
train_grp_df.drop(columns=['aspects' , 'polarities' , 'length'] , inplace=True)
test_grp_df.drop(columns=['aspects' , 'polarities' , 'length'] , inplace=True)


In [7]:
train_grp_df[['text'] + aspects].to_csv('../data/resturant_train_stratified_grouped2.csv' , index = False)
test_grp_df[['text'] + aspects].to_csv('../data/resturant_test_stratified_grouped2.csv' , index = False)

In [8]:
tokenize = 'spacy'
TEXT = Field(sequential=True , tokenize=tokenize , lower=True , include_lengths=True)
LABEL = Field(sequential=False , use_vocab=False, dtype=torch.float) 

In [9]:
from torchtext.data import TabularDataset
 
train_datafields = [ ('text' , TEXT) , ('ambience' , LABEL) , ('anecdotes/miscellaneous' , LABEL) , 
                   ('food' , LABEL) , ('price' , LABEL) , ('service' , LABEL) ]
train_data, valid_data  = TabularDataset.splits(
            path='../data',  
            train='resturant_train_stratified_grouped2.csv',   
            validation='resturant_test_stratified_grouped2.csv',      
            format='csv',
            skip_header=True,  fields=train_datafields )


test_datafields = [('text' , TEXT) , ('ambience' , LABEL) , ('anecdotes/miscellaneous' , LABEL) , 
                   ('food' , LABEL) , ('price' , LABEL) , ('service' , LABEL) ]

test_data = TabularDataset(
            path = '../data/resturant_test_stratified_grouped2.csv' ,
            format='csv' , skip_header=True , 
            fields=test_datafields)

In [10]:
test_data[0].__dict__.keys()

dict_keys(['text', 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'])

In [11]:
np.array(test_data[1].__dict__['text'])

array(['i', 'recommend', 'to', 'anyone', 'who', 'wants', 'to', 'dress',
       'up', 'and', 'impress', 'the', 'lady', '.'], dtype='<U9')

#### We will use Glove word embeddings and keep max vocab size or around 4000

In [12]:
MAX_VOCAB_SIZE = 4000
BATCH_SIZE = 64
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors =  "glove.42B.300d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [13]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of test examples: {len(test_data)}')

Number of training examples: 2206
Number of validation examples: 649
Number of test examples: 649


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
train_iter, val_iter = BucketIterator.splits(
    (train_data, valid_data),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE),
    device=device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    repeat=False
)

test_iter = Iterator(test_data, batch_size=1, device=device, sort=False, sort_within_batch=False, repeat=False ,
                 train=False , shuffle=False )

In [16]:
class BatchWrapper:
    def __init__(self, iterator, x_var, y_vars):
        self.iterator, self.x_var, self.y_vars = iterator, x_var, y_vars
  
    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)
            if self.y_vars is not None:
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))
            yield (x, y)
    def __len__(self):
        return len(self.iterator)

train_loader = BatchWrapper(train_iter, "text", [ 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'])
valid_loader = BatchWrapper(val_iter, "text", [ 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'])
test_loader = BatchWrapper(test_iter, "text"   ,[ 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'])

In [17]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)        
        self.rnn = nn.LSTM(embedding_dim,  hidden_dim, num_layers=n_layers, 
                           bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2 , output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence( embedded, text_lengths)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        #output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)       
        hidden =(torch.cat((hidden[-2,:,:],  hidden[-1,:,:]), dim = 1))  
   
        return self.fc(hidden.squeeze(0))

In [18]:

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 100
OUTPUT_DIM = 5
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

  "num_layers={}".format(dropout, num_layers))


In [19]:
optimizer = optim.Adam(model.parameters() , lr = 1e-2)
loss_func = nn.BCEWithLogitsLoss()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,441,605 trainable parameters


In [20]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.0072,  0.0380,  0.0675,  ...,  0.0396,  0.0018,  0.0760],
        [-0.1117, -0.4966,  0.1631,  ..., -1.4447,  0.8402, -0.8668],
        [ 0.1088,  0.0022,  0.2221,  ..., -0.2970,  0.1594, -0.1490],
        ...,
        [-0.2680, -0.9008,  0.1308,  ...,  0.2303, -0.5960,  0.3290],
        [-0.5858, -0.3707, -0.1245,  ..., -0.0055, -0.8436,  0.0873],
        [ 0.3440,  0.8542, -0.5785,  ...,  0.1280,  0.0526,  0.3247]])

In [21]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [22]:
model = model.to(device)
loss_func = loss_func.to(device)

In [23]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    acc = torch.sum(rounded_preds == y).float() 
    return acc

In [24]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0    
    model.train()
    ct = 0
    for x, y in iterator:
        optimizer.zero_grad()
        text, text_lengths =x
        predictions = model(text, text_lengths)
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        ct = ct + text.shape[1]   
    return epoch_loss / len(iterator), epoch_acc / (ct*5)

In [25]:
def validate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0 
    ct = 0
    model.eval()
    with torch.no_grad():
        for x ,y  in iterator:
            text, text_lengths = x
            predictions = model(text, text_lengths)#.squeeze(1)
            loss = criterion(predictions,y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            ct = ct + text.shape[1]
        
    return epoch_loss / len(iterator), epoch_acc / (ct*5)

In [26]:
epochs = 5

best_valid_loss = float('inf')
best_epoch = 0
        
for epoch in range(1, epochs + 1):
    train_loss , train_acc = train_model(model, train_loader, optimizer, loss_func)
    valid_loss , valid_acc = validate_model(model, valid_loader, loss_func)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch
        print(f'Best validation loss!! {best_valid_loss}')
        torch.save(model.state_dict(), 'aspect_category.pt')
    #print(f'Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Best validation loss!! 0.3941935653036291
Epoch: 02
	Train Loss: 0.418 | Train Acc: 81.50%
	 Val. Loss: 0.394 |  Val. Acc: 84.68%
Best validation loss!! 0.27863858301531186
Epoch: 03
	Train Loss: 0.211 | Train Acc: 91.77%
	 Val. Loss: 0.279 |  Val. Acc: 90.02%
Best validation loss!! 0.2771649191325361
Epoch: 04
	Train Loss: 0.116 | Train Acc: 95.61%
	 Val. Loss: 0.277 |  Val. Acc: 90.42%
Epoch: 05
	Train Loss: 0.061 | Train Acc: 97.77%
	 Val. Loss: 0.344 |  Val. Acc: 90.32%
Epoch: 06
	Train Loss: 0.036 | Train Acc: 98.86%
	 Val. Loss: 0.364 |  Val. Acc: 90.26%


In [27]:
test_preds = []
text_data = []
true_label = []

epoch_loss = 0
epoch_acc = 0 
ct = 0
model.eval()
with torch.no_grad():
    for x ,y  in test_loader:
        text, text_lengths = x
        text_data.append(text.data.cpu().numpy())
        predictions = model(text, text_lengths)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)
        true_label.append(y.data.cpu().numpy())


In [28]:
"""
Merging prediction value with original test data and observe the metrics on overall level
"""
test_pred = pd.DataFrame(np.vstack(test_preds) ,index=test_grp_df.index , columns= [a+'_pred' for a in aspects])
test_aspect_pred = pd.merge(test_grp_df, test_pred , left_index=True ,right_index = True)

from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score

In [29]:
print("F1 score",f1_score(test_aspect_pred[aspects].as_matrix() , test_aspect_pred[test_aspect_pred.columns[-5:]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(test_aspect_pred[aspects].as_matrix() == test_aspect_pred[test_aspect_pred.columns[-5:]].as_matrix()))

F1 score 0.805298415421371
Accuracy score 0.9026194144838212


  """Entry point for launching an IPython kernel.
  


In [30]:
print('Accuracy for different aspect category ')
for col in aspects:
    print(col , ': ', test_aspect_pred[test_aspect_pred[col]==test_aspect_pred[col+'_pred']].shape[0]/test_aspect_pred.shape[0])

Accuracy for different aspect category 
ambience :  0.9090909090909091
anecdotes/miscellaneous :  0.8228043143297381
food :  0.8859784283513097
service :  0.9322033898305084
price :  0.963020030816641


### Look for the accuracy of conflicting statements (sentiment changing within a sentence

In [31]:
test2_conflict = test_aspect_pred[test_aspect_pred['ind']==1]
print("F1 score",f1_score(test2_conflict[aspects].as_matrix() , test2_conflict[test2_conflict.columns[-5:]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(test2_conflict[aspects].as_matrix() == test2_conflict[test2_conflict.columns[-5:]].as_matrix()))

F1 score 0.7492912295251918
Accuracy score 0.8368932038834952


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
print('Accuracy for different aspect category ')
for col in aspects:
    print(col , ': ', test2_conflict[test2_conflict[col]==test2_conflict[col+'_pred']].shape[0]/test2_conflict.shape[0])

Accuracy for different aspect category 
ambience :  0.8349514563106796
anecdotes/miscellaneous :  0.7378640776699029
food :  0.8737864077669902
service :  0.8349514563106796
price :  0.9029126213592233


We can observe that accuracy and F1 score has improved significantly from TfIdf vector based classifier. We will use prediction of this model to find aspect categories in statement and then feed that to sentiment classifier

In [33]:
def aspect_pred(x):
    res = []
    for col in aspects:
        if x[col+'_pred']==1:
            res.append(col)
    return res

test_aspect_pred['aspects_pred'] = test_aspect_pred.apply(lambda x: aspect_pred(x) ,axis =1)
test_aspect_pred.drop(columns=['anecdotes/miscellaneous_pred', 'food_pred', 'service_pred', 'price_pred', 'ambience_pred'] , inplace=True)

In [34]:
def aspect_pred(x):
    res = []
    for col in aspects:
        if x[col]==1:
            res.append(col)
    return res

test_aspect_pred['aspects'] = test_aspect_pred.apply(lambda x: aspect_pred(x) ,axis =1)
test_aspect_pred.drop(columns=aspects , inplace=True)
test_aspect_pred.to_csv('../output/BiLSTM_aspect_prediction_test_data.csv' , index = False)

#### Using aspect prediction on test data , we have segmented some sentence using constituency parsing. Please go through: 2- Sentence segmentation using constituency parsing.ipynb

In [36]:
import ast
test_segments = pd.read_csv('../output/test_data_sentence_segmentation.csv')
for col in ['aspects_pred' , 'aspects' , 'splits']:
    test_segments[col]  = test_segments[col].apply(lambda x: ast.literal_eval(x))
    
test_segments['length_splits'] = test_segments['splits'].apply(lambda x: len(x))
test_segments.length_splits.value_counts()

1    598
2    45 
3    5  
4    1  
Name: length_splits, dtype: int64

In [38]:
single_aspect_df = test_segments[(test_segments['length_splits']==1)] # & (test_aspect_pred['ct_aspects']<=1)
multi_aspect_df = test_segments[test_segments['length_splits']>1]
multi_aspect_df.drop(columns=['aspects' , 'ct_aspects', 'length_splits' ] , inplace=True)
print('Reviews with only one aspect or no segment resulting from constituency parsing' , single_aspect_df.shape[0] )
print('Reviews with more than one sentence segment resulting from constituency parsing ' , multi_aspect_df.shape[0])

Reviews with only one aspect or no segment resulting from constituency parsing 598
Reviews with more than one sentence segment resulting from constituency parsing  51


For reviews with different segments mapping to multiple aspect categories, we will use pretrained Bi-LSTM model to predict most prominent aspect category in sentence split. 

In [39]:
multi_aspects_splits= []
for index, row in multi_aspect_df.iterrows():                   
    total_aspects = row['aspects_pred'].copy()
    total_sent = row['splits'].copy()    
    for sent in total_sent:
        multi_aspects_splits.append({  'id': index , 'sp_text': ' '.join(sent) , 'aspects_pred': total_aspects })

df_multi_aspects_splits = pd.DataFrame(multi_aspects_splits)
df_multi_aspects_splits[['sp_text','id']].to_csv('../output/test_data_split_ungrped.csv' , index=False)

In [40]:
test_datafields = [('sp_text' , TEXT) ]
test_data = TabularDataset( path = '../output/test_data_split_ungrped.csv' ,  format='csv' , skip_header=True ,  fields=test_datafields)
test_iter2 = Iterator(test_data, batch_size=1, device=device, sort=False, sort_within_batch=False, repeat=False , train=False , shuffle=False )
test_loader = BatchWrapper(test_iter2, "sp_text" , None )

In [41]:
test_preds = []
model.eval()
with torch.no_grad():
    for x ,y  in test_loader:
        text, text_lengths = x
        text_data.append(text.data.cpu().numpy())
        predictions = model(text, text_lengths)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)


In [42]:
testpreddf = pd.DataFrame(np.vstack(test_preds) ,index=df_multi_aspects_splits.index , columns= [a+'_pred' for a in aspects])
df_multi_aspects_splits = pd.merge(df_multi_aspects_splits, testpreddf , left_index=True ,right_index = True)

In [44]:
def aspect_pred(x):
    res = []
    for col in aspects: 
        if x[col+'_pred']==1:  res.append(col)
    return res

df_multi_aspects_splits['phrase_aspect'] = df_multi_aspects_splits.apply(lambda x: aspect_pred(x) ,axis =1)
df_multi_aspects_splits.drop(columns=['anecdotes/miscellaneous_pred', 'food_pred', 'service_pred', 'price_pred', 'ambience_pred'] , inplace=True)
df_multi_aspects_splits['remaining_aspect'] = df_multi_aspects_splits.apply(lambda x: list(set(x['aspects_pred']) - set(x['phrase_aspect']))  , axis =1)
df_multi_aspects_splits['remaining_aspect'] = df_multi_aspects_splits['remaining_aspect'].shift(1)

In [45]:
df_multi_aspects_splits['shift_id'] = df_multi_aspects_splits['id'].shift(1)
df_multi_aspects_splits.shift_id.fillna(0 , inplace = True)
df_multi_aspects_splits['shift_id'] =  df_multi_aspects_splits.apply(lambda x: 1 if x['id']!= x['shift_id']  else 0 , axis = 1 )
df_multi_aspects_splits['remaining_aspect']  = df_multi_aspects_splits.apply(lambda x: [] if x['shift_id']==1 else x['remaining_aspect'], axis = 1)

In [47]:
df_multi_aspects_splits2= []
for index, row in df_multi_aspects_splits.iterrows():                   
    total_aspects = row['phrase_aspect'].copy() 
    if len(total_aspects) ==1:
        df_multi_aspects_splits2.append({'id': row['id']  , 'sp_text':row['sp_text'] ,'fn_aspect': total_aspects[0] })
    else :
        if len(row['remaining_aspect']) ==1:
            df_multi_aspects_splits2.append({'id': row['id']  , 'sp_text':row['sp_text'] ,'fn_aspect': row['remaining_aspect'][0] })
        elif len(row['remaining_aspect']) ==0:
            for asp in row['phrase_aspect']:
                df_multi_aspects_splits2.append({'id': row['id'], 'sp_text':row['sp_text'],'fn_aspect':asp})
        else:
            for asp in row['remaining_aspect']:
                df_multi_aspects_splits2.append({'id': row['id'], 'sp_text':row['sp_text'],'fn_aspect':asp})
                

df1_aspect_lvl= pd.DataFrame(df_multi_aspects_splits2)
df1_aspect_lvl.shape

In [50]:
multi_aspect_df.head(2)

Unnamed: 0,text,ind,aspects_pred,splits
3,We were seated outside and the waiter spilled red wine and hot tea on myself and my date.,0,"[food, service]","[[We, were, seated, outside], [the, waiter, spilled, red, wine, and, hot, tea, on, myself, and, my, date]]"
4,"The crust is thin, the ingredients are fresh and the staff is friendly.",0,"[food, service]","[[The, crust, is, thin], [the, ingredients, are, fresh], [the, staff, is, friendly]]"


In [56]:
single_aspect= []
for index, row in single_aspect_df.iterrows():
    try:
        aspect = row['aspects_pred'][0]
    except:
        aspect =''
    single_aspect.append({'id': index, 'sp_text':row['text'],'fn_aspect':aspect})               

df2_aspect_lvl= pd.DataFrame(single_aspect)
df2_aspect_lvl.shape

(598, 3)

In [60]:
aspect_lvl_dataframe = pd.concat([df1_aspect_lvl , df2_aspect_lvl] , axis = 0)
aspect_lvl_dataframe.id.nunique()

In [66]:
aspect_lvl_dataframe.sort_values(by = ['id']  , inplace=True)
aspect_lvl_dataframe.reset_index(inplace= True , drop = True)

In [67]:
aspect_lvl_dataframe.to_csv('../output/aspect_level_test_data.csv' , index = False)