In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
import pickle
from collections import Counter, defaultdict
import ast
pd.set_option('display.max_colwidth' , -1)

#### English - Restaurant domain training data

In [2]:
eng_multi_aspects = pd.read_csv('../data/English_restaurants.csv')
eng_multi_aspects['aspects'] = eng_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
eng_multi_aspects['polarities'] = eng_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
eng_multi_aspects.head(2)

Unnamed: 0,text,aspects,polarities
0,"Judging from previous posts this used to be a good place, but not any longer.",[RESTAURANT],[negative]
1,"We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.",[SERVICE],[negative]


In [3]:
eng_multi_aspects.aspects.apply(pd.Series).merge(eng_multi_aspects , right_index = True , left_index = True)\
.drop(['aspects' , 'polarities'] ,axis = 1).melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna().value.value_counts(normalize = True)

FOOD          0.365171
RESTAURANT    0.272069
SERVICE       0.202123
AMBIENCE      0.109021
DRINKS        0.038109
LOCATION      0.013507
Name: value, dtype: float64

#### Dutch  - Restaurant domain training data

In [4]:
du_multi_aspects = pd.read_csv('../data/Dutch_restaurants.csv')
du_multi_aspects['aspects'] = du_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
du_multi_aspects['polarities'] = du_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
du_multi_aspects.head(2)

Unnamed: 0,text,aspects,polarities
0,Lange wachttijd.,[SERVICE],[negative]
1,"Zelfde dessert, 2 dagen na mekaar.",[FOOD],[negative]


#### Spanish Restaurant domain training data

In [5]:
spanish_multi_aspects = pd.read_csv('../data/Spanish_restaurants.csv')
spanish_multi_aspects['aspects'] = spanish_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
spanish_multi_aspects['polarities'] = spanish_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
spanish_multi_aspects.head(2)

Unnamed: 0,text,aspects,polarities
0,Nos sentimos muy a gusto.,[RESTAURANT],[positive]
1,"Buen servicio, ambiente Acogedor y tranquilo, comida bien.","[FOOD, SERVICE, AMBIENCE]","[positive, positive, positive]"


In [6]:
eng_multi_aspects[['text']].to_csv('../data/processed/en_resturant.txt' , header = None , index = None , mode = 'w')
du_multi_aspects[['text']].to_csv('../data/processed/nl_resturant.txt' , header = None , index = None , mode = 'w')
spanish_multi_aspects[['text']].to_csv('../data/processed/es_resturant.txt' , header = None , index = None , mode = 'w')


### Extract sentence embeddings from text column of restaurant reviews

In [7]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

DATA_PATH = Path("../data/processed/")
CACHE_PATH = Path("cache/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 
SPACE_NORMALIZER = re.compile("\s+")
Batch = namedtuple('Batch', 'srcs tokens lengths')

from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [8]:
encoder = SentenceEncoder(
    str(MODEL_PATH/"bilstm.93langs.2018-12-26.pt"),
    max_sentences=None,
    max_tokens=10000,
    cpu=False)
bpe_codes = str(MODEL_PATH/"93langs.fcodes")

#### Following steps from https://medium.com/the-artificial-impostor/multilingual-similarity-search-using-pretrained-bidirectional-lstm-encoder-e34fac5958b0 for tokenization , BPE Fast and Embedding extractions 

In [9]:

for lang in ( "en","nl", 'es'):  ##"zh" for chinese , nl  for dutch and es for spanish
    Token(
        str(DATA_PATH / f"{lang}_resturant.txt"), ##english_resturant.txt
        str(CACHE_PATH / f"{lang}_resturant.txt"),
        lang=lang,
        romanize=False,
        lower_case=True, gzip=False,
        verbose=True)
    BPEfastApply(
        str(CACHE_PATH / f"{lang}_resturant.txt"),
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        bpe_codes,
        verbose=True, over_write=True)
    EncodeFile(
        encoder,
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        str(CACHE_PATH / f"{lang}_resturant.enc"),
        verbose=True, over_write=True)  

data_en, index_en = IndexCreate(
    str(CACHE_PATH / "en_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_du, index_du = IndexCreate(
    str(CACHE_PATH / "nl_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_spanish, index_spanish = IndexCreate(
    str(CACHE_PATH / "es_resturant.enc"), 'FlatL2', verbose=True, save_index=False)

 - Tokenizer: en_resturant.txt in language en  
 - fast BPE: processing en_resturant.txt
 - Encoder: en_resturant.bpe to en_resturant.enc
 - Encoder: 1708 sentences in 0s
 - Tokenizer: nl_resturant.txt in language nl  
 - fast BPE: processing nl_resturant.txt
 - Encoder: nl_resturant.bpe to nl_resturant.enc
 - Encoder: 1317 sentences in 0s
 - Tokenizer: es_resturant.txt in language es  
 - fast BPE: processing es_resturant.txt
 - Encoder: es_resturant.bpe to es_resturant.enc
 - Encoder: 1626 sentences in 0s
 - embedding: cache/en_resturant.enc 1708 examples of dim 1024
 - creating FAISS index
 - embedding: cache/nl_resturant.enc 1317 examples of dim 1024
 - creating FAISS index
 - embedding: cache/es_resturant.enc 1626 examples of dim 1024
 - creating FAISS index


### Creating multi label classification task using sentence embeddings-  zero shot cross lingual training 
Training and hyper parameter optimisation in english language. Prediction in Dutch and Spanish language.
Creating train valdiation split on English reviews. Creating target using MultiLabelBinarizer on Training Validation , Dutch and Spanish reviews. Normalizing the embeddings using StandardScaler (resulted in improvement in results)

In [10]:
from sklearn.model_selection import train_test_split
train_aspects , val_aspects, train_df , val_df = train_test_split(eng_multi_aspects, data_en , test_size = 0.2 , random_state = 42)

from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(train_aspects.aspects)
val_eng = mlb.transform(val_aspects.aspects)
y_du  = mlb.transform(du_multi_aspects.aspects)
y_spainish  = mlb.transform(spanish_multi_aspects.aspects)

train_aspects.reset_index(inplace=True , drop= True)
val_aspects.reset_index(inplace=True , drop= True)

from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler().fit(train_df)
train_std = std_scale.transform(train_df) 
val_std = std_scale.transform(val_df)
dutch_std = std_scale.transform(data_du)
spanish_std = std_scale.transform(data_spanish)

#### Creating datasets for pytorch based multi label classification model

In [11]:
import torch 
import torch
import torch.nn as nn

x_train,y_train,x_valid,y_valid , x_test , y_test  , x_test_sp , y_test_sp = map(torch.FloatTensor, (train_std,tr_eng,  val_std ,\
                                                                            val_eng, dutch_std,y_du, \
                                                                           spanish_std ,y_spainish ))
n,c = x_train.shape
y_train = y_train.type(torch.FloatTensor)
y_valid = y_valid.type(torch.FloatTensor)
y_test = y_test.type(torch.FloatTensor)
y_test_sp = y_test_sp.type(torch.FloatTensor)

print(y_train.shape , y_valid.shape , y_test.shape , y_test_sp.shape)
print(x_train.shape , x_valid.shape , x_test.shape , x_test_sp.shape)
batch_size = 64

torch.Size([1366, 6]) torch.Size([342, 6]) torch.Size([1317, 6]) torch.Size([1626, 6])
torch.Size([1366, 1024]) torch.Size([342, 1024]) torch.Size([1317, 1024]) torch.Size([1626, 1024])


In [12]:
class Model(nn.Module):
    def __init__(self , p):
        super().__init__()
        self.hidden = nn.Linear(1024, 512)
        self.hidden2 = nn.Linear(512 , 256)
        self.hidden3 =  nn.Linear(256 , 128)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(128, 6)

    def forward(self, x):
        x = self.activation(self.dropout(self.hidden(x)))
        x = self.activation(self.dropout(self.hidden2(x)))
        x = self.activation(self.dropout(self.hidden3(x)))
        x = self.fc(x)
        return x

In [13]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size , shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid )
valid_dl = DataLoader(valid_ds , batch_size= batch_size)

test_ds = TensorDataset(x_test , y_test)
test_dl = DataLoader(test_ds , batch_size=batch_size)

test_ds2 = TensorDataset(x_test_sp , y_test_sp)
test_dl2 = DataLoader(test_ds2 , batch_size=batch_size)

In [14]:
class WrappedDataLoader():
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func
        
    def __len__(self): return len(self.dl)
    
    def __iter__(self):
        batches = iter(self.dl)
        for b in batches: yield(self.func(*b))

In [15]:
dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def preprocess(x,y): return x.to(dev),y.to(dev)

train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)
test_dl = WrappedDataLoader(test_dl , preprocess)
test_dl2 = WrappedDataLoader(test_dl2 , preprocess)

In [16]:

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.mean() #/ (len(correct))
    return acc


def fbeta_score(y_true, y_pred, beta, threshold, eps=1e-9):
    beta2 = beta**2

    y_pred = torch.ge(torch.sigmoid(y_pred).float(), threshold).float()
    y_true = y_true.float()

    true_positive = (y_pred * y_true).sum(dim=0)
    precision = true_positive.div(y_pred.sum(dim=0).add(eps))
    recall = true_positive.div(y_true.sum(dim=0).add(eps))
    
    return torch.mean(
        (precision*recall).
        div(precision.mul(beta2) + recall + eps).
        mul(1 + beta2)) , torch.mean(precision) , torch.mean(recall)


def f1_score(y_pred,y_true, threshold=0.5):
    f1 , precision , recall = fbeta_score(y_true, y_pred, 1, threshold) 
    return f1 , precision , recall

In [17]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0  
    epoch_f1 = 0 ; epoch_precision = 0 ; epoch_recall = 0
    model.train()
    ct = 0
    for x, y in iterator:
        optimizer.zero_grad()
        predictions = model(x)
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        f1 , precision , recall = f1_score(predictions , y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1.item() 
        epoch_precision += precision.item()  
        epoch_recall += recall.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_precision/len(iterator), epoch_recall/len(iterator)

In [18]:
def validate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0 
    epoch_f1 = 0; epoch_precision = 0 ; epoch_recall = 0
    model.eval()
    with torch.no_grad():
        for x ,y  in iterator:

            predictions = model(x)#.squeeze(1)
            loss = criterion(predictions,y)
            acc = binary_accuracy(predictions, y) ; f1 , precision , recall = f1_score(predictions , y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1.item()   ; epoch_precision += precision.item()  ; epoch_recall += recall.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_precision/len(iterator), epoch_recall/len(iterator)

#### We will use weight to scale loss value . It will result in giving equal weight to each category irrespective of data imbalance.

In [19]:
df_data_ratio = train_aspects.aspects.apply(pd.Series).merge(train_aspects , right_index = True , left_index = True)\
.drop([ 'aspects' , 'polarities'] ,axis = 1).melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna().value.value_counts(normalize = True).reset_index()

In [20]:
df_data_ratio

Unnamed: 0,index,value
0,FOOD,0.361111
1,RESTAURANT,0.270531
2,SERVICE,0.198671
3,AMBIENCE,0.117754
4,DRINKS,0.038043
5,LOCATION,0.013889


In [21]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

        
weight_list =  [1/df_data_ratio[df_data_ratio['index']==c]['value'].values[0]  for c in mlb.classes_]
weights = torch.tensor( weight_list)
weights =weights.to(dev)

from torch import optim


#### Apply grid search on Learning Rate, Weight Decay , Dropout parameters save the model with best validation loss and validation f1 score.

In [22]:
import random 
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [23]:
best_valid_f1 = -float('inf') ; best_valid_loss = float('inf')
loss_func = nn.BCEWithLogitsLoss(weight=weights) 
loss_func = loss_func.to(dev)
for drp in [0.2, 0.3,0.4,0.5,0.6]:
    for wd in [0.1 , 0.05 , 0.01 , 0.005 , 0.001]:
        for learning_rate in [1e-2 , 5e-3 , 1e-3]:
            model = Model(drp); model.apply(init_weights)
            model = model.to(dev)
            optimizer = optim.Adam(model.parameters() , lr = learning_rate, weight_decay=wd) #[a+'_pred' for a in aspects]
            model = model.to(dev)
            epochs = 10
            for epoch in range(1, epochs + 1):
                train_loss , train_acc , train_f1 , train_precision , train_recall = train_model(model, train_dl, optimizer, loss_func)
                valid_loss , valid_acc , valid_f1 , valid_precision , valid_recall  = validate_model(model, valid_dl, loss_func)
                if (valid_loss < best_valid_loss) & (valid_f1 > best_valid_f1)  & (abs(train_f1- valid_f1) <= 0.05):
                    best_valid_f1 = valid_f1 ; best_valid_loss = valid_loss
                    print('train data' , train_acc , train_f1 , train_precision , train_recall)
                    print('valid data' , valid_acc ,  valid_f1 , valid_precision , valid_recall)


                    print("Parameters: " ,'Dropout: ' ,  drp , 'weight decay: ' ,wd ,' learning rate : ' ,learning_rate )
                    if os.path.isfile('results/multi_label_problem.pt'):
                        os.remove('results/multi_label_problem.pt') 
                                           
                    torch.save(model.state_dict(), 'results/multi_label_problem.pt')

  This is separate from the ipykernel package so we can avoid doing imports until


train data 0.7798833684487776 0.21473451297391544 0.2819275476715781 0.23644619773734699
valid data 0.8620581030845642 0.22622336695591608 0.25614674886067706 0.21116616825262705
Parameters:  Dropout:  0.2 weight decay:  0.1  learning rate :  0.01
train data 0.8516593683849681 0.33419464934955945 0.4514863660389727 0.3033361340110952
valid data 0.848248134056727 0.28686009099086124 0.41156866649786633 0.2589774827162425
Parameters:  Dropout:  0.2 weight decay:  0.1  learning rate :  0.01
train data 0.8655733655799519 0.3972922902215611 0.4685949601910331 0.36998756568540225
valid data 0.8607165614763895 0.3591578851143519 0.49420441687107086 0.32077595591545105
Parameters:  Dropout:  0.2 weight decay:  0.1  learning rate :  0.01
train data 0.8683174279603091 0.41430715132843365 0.489790984175422 0.37984616512602026
valid data 0.8783538738886515 0.36711075405279797 0.49450937906901044 0.3139963895082474
Parameters:  Dropout:  0.2 weight decay:  0.1  learning rate :  0.01
train data 0.85

In [24]:
loss_func = nn.BCEWithLogitsLoss(weight=weights) 
loss_func = loss_func.to(dev)
model = Model(0.5)
model.load_state_dict(torch.load('results/multi_label_problem.pt'))
model = model.to(dev)
validate_model(model, valid_dl, loss_func)

(1.7852152188618977,
 0.908775269985199,
 0.5151392469803492,
 0.6221395631631216,
 0.47141974171002704)

##### Calculate aspect prediction for validation dataset 

In [25]:
val_preds = []
val_label = []
with torch.no_grad():
    for x ,y  in valid_dl:
        predictions = model(x)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        val_preds.append(preds)
        val_label.append(y.data.cpu().numpy())

val_aspects['aspects_pred'] = pd.Series(mlb.inverse_transform(np.vstack(val_preds)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


##### Calculate aspect prediction for dutch test dataset 

In [26]:
test_preds = []
true_label = []
with torch.no_grad():
    for x ,y  in test_dl:
        predictions = model(x)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)
        true_label.append(y.data.cpu().numpy())
        
du_multi_aspects['aspects_pred']  = pd.Series(mlb.inverse_transform(np.vstack(test_preds)))

aspects = mlb.classes_.tolist()
"""
Merging prediction value with original test data and observe the metrics on overall level
"""
dutch_pred = pd.DataFrame(np.vstack(test_preds) ,index=du_multi_aspects.index , columns= [a+'_pred' for a in aspects])
dutch_pred2 = pd.merge(du_multi_aspects, dutch_pred , left_index=True ,right_index = True)

from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score

print("F1 score",f1_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(y_du == dutch_pred2[[a+'_pred' for a in aspects]].as_matrix()))
print("Precision score",precision_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Recall score",recall_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))


F1 score 0.5371728544695706
Accuracy score 0.8720577069096431
Precision score 0.7801170234947093
Recall score 0.4685074726379324




##### Calculate aspect prediction for spanish test dataset 

In [27]:
test_preds = []
true_label = []
with torch.no_grad():
    for x ,y  in test_dl2:
        predictions = model(x)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)
        true_label.append(y.data.cpu().numpy())
        
spanish_multi_aspects['aspects_pred']  = pd.Series(mlb.inverse_transform(np.vstack(test_preds)))

        
aspects = mlb.classes_.tolist()
"""
Merging prediction value with original test data and observe the metrics on overall level
"""
spanish_pred = pd.DataFrame(np.vstack(test_preds) ,index=spanish_multi_aspects.index , columns= [a+'_pred' for a in aspects])
spanish_pred2 = pd.merge(spanish_multi_aspects, spanish_pred , left_index=True ,right_index = True)

from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score

print("F1 score",f1_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(y_spainish == spanish_pred2[[a+'_pred' for a in aspects]].as_matrix()))
print("Precision score",precision_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Recall score",recall_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))

F1 score 0.5451493396566448
Accuracy score 0.8865313653136532
Precision score 0.7285837584595787
Recall score 0.4719908182638046




#### Next step is sentiment classification. We will work with only correct Text X Aspect results to predict sentiment.  Hence will will create an extra column aspects_pred for validation and test datasets.  Then we will filter out the correct  Text X Aspect combinations.

In [28]:
val_aspects['aspects_pred'] = val_aspects['aspects_pred'].apply(lambda x: list(x))
du_multi_aspects['aspects_pred'] = du_multi_aspects['aspects_pred'].apply(lambda x: list(x))
spanish_multi_aspects['aspects_pred'] = spanish_multi_aspects['aspects_pred'].apply(lambda x: list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
def ungrp(train_df):
    asp_df = train_df.aspects.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects' , 'polarities' , 'aspects_pred'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    polarity_df = train_df.polarities.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects' , 'polarities' , 'aspects_pred'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    train_df_ungrp = pd.merge(asp_df , polarity_df['value'] , left_index = True , right_index = True ,suffixes=('_aspects' , '_polarities'))
    train_df_ungrp.rename(columns={'value_aspects' : 'aspects' , 'value_polarities':'polarities'} , inplace=True)

    train_df_ungrp2 = pd.merge(train_df_ungrp , train_df[[ 'text' ,'aspects_pred']] , on ='text')
    return train_df_ungrp2


In [30]:
val_aspects_ungrp  = ungrp(val_aspects)
du_aspects_ungrp = ungrp(du_multi_aspects)
sp_aspects_ungrp= ungrp(spanish_multi_aspects)

In [31]:
du_aspects_ungrp.shape , sp_aspects_ungrp.shape , val_aspects_ungrp.shape

((1629, 4), (2321, 4), (417, 4))

In [32]:
def check_ind(x):
    asp = x['aspects_pred']
    if x['aspects'] in asp:
        return 1
    else:
        return 0

In [33]:
val_aspects_ungrp['ind'] = val_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)
du_aspects_ungrp['ind'] = du_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)
sp_aspects_ungrp['ind'] = sp_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)

In [34]:
val_aspects_ungrp.ind.mean() , du_aspects_ungrp.ind.mean() , sp_aspects_ungrp.ind.mean()

(0.6930455635491607, 0.5960712093308779, 0.6604911676001723)

In [35]:
def ungrp2(train_df):
    asp_df = train_df.aspects.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects' , 'polarities'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    polarity_df = train_df.polarities.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects' , 'polarities'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    train_df_ungrp = pd.merge(asp_df , polarity_df['value'] , left_index = True , right_index = True ,suffixes=('_aspects' , '_polarities'))
    train_df_ungrp.rename(columns={'value_aspects' : 'aspects' , 'value_polarities':'polarities'} , inplace=True)

    return train_df_ungrp
train_aspects_ungrp = ungrp2(train_aspects)

In [36]:
train_aspects_ungrp.head(2)

Unnamed: 0,text,aspects,polarities
0,"The food was very good, a great deal, and the place its self was great.",AMBIENCE,positive
1,Terrible would be a compliment!,RESTAURANT,negative


We can see for the given text , we have been able to predict only 2 out 5 aspects correctly. We will filter out the text X aspects combination which our Multi Label Aspect is not able to predict correctly.

In [37]:
val_aspects_ungrp[val_aspects_ungrp['text']=='Everything was wonderful; food, drinks, staff, mileau.']

Unnamed: 0,text,aspects,polarities,aspects_pred,ind
20,"Everything was wonderful; food, drinks, staff, mileau.",AMBIENCE,positive,"[FOOD, SERVICE]",0
21,"Everything was wonderful; food, drinks, staff, mileau.",SERVICE,positive,"[FOOD, SERVICE]",1
22,"Everything was wonderful; food, drinks, staff, mileau.",FOOD,positive,"[FOOD, SERVICE]",1
23,"Everything was wonderful; food, drinks, staff, mileau.",DRINKS,positive,"[FOOD, SERVICE]",0
24,"Everything was wonderful; food, drinks, staff, mileau.",RESTAURANT,positive,"[FOOD, SERVICE]",0


In [38]:
val_aspects_ungrp2 = val_aspects_ungrp[val_aspects_ungrp['ind']==1]
du_aspects_ungrp2 = du_aspects_ungrp[du_aspects_ungrp['ind']==1]
sp_aspects_ungrp2 = sp_aspects_ungrp[sp_aspects_ungrp['ind']==1]

In [39]:
train_aspects_ungrp.to_csv('../data/Train_english_restaurants_ungrp.csv' , index = False)
val_aspects_ungrp2[['text', 'aspects', 'polarities']].to_csv('../data/Valid_english_restaurants_ungrp.csv' , index = False)
du_aspects_ungrp2[['text', 'aspects', 'polarities']].to_csv('../data/Dutch_restaurants_ungrp.csv' , index = False)
sp_aspects_ungrp2[['text', 'aspects', 'polarities']].to_csv('../data/Spanish_english_restaurants_ungrp.csv' , index = False)