In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
import pickle
from collections import Counter, defaultdict
import ast
pd.set_option('display.max_colwidth' , -1)

#### Working with dataset  with correct Aspect preditions
In order to predict sentiment we will input Text X aspect combination in our 3 layer neural network
We will create sentence embedding using LASER encoder for Text input, for aspect category  we will use word2vec embedding

In [2]:
train_aspects_ungrp = pd.read_csv('../data/Train_english_restaurants_ungrp.csv')
val_aspects_ungrp2 = pd.read_csv('../data/Valid_english_restaurants_ungrp.csv')
du_aspects_ungrp2  = pd.read_csv('../data/Dutch_restaurants_ungrp.csv')
sp_aspects_ungrp2  = pd.read_csv('../data/Spanish_english_restaurants_ungrp.csv')

In [3]:
train_aspects_ungrp[['text']].to_csv('../data/processed/en_resturant.csv' , header = None , index = None , mode = 'w')
val_aspects_ungrp2[['text']].to_csv('../data/processed/en_val.csv' , header = None , index = None , mode = 'w')
du_aspects_ungrp2[['text']].to_csv('../data/processed/nl_resturant.csv' , header = None , index = None , mode = 'w')
sp_aspects_ungrp2[['text']].to_csv('../data/processed/es_resturant.csv' , header = None , index = None , mode = 'w')

#### Create sentence embeddings

In [4]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

DATA_PATH = Path("../data/processed/")
CACHE_PATH = Path("cache2/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 
SPACE_NORMALIZER = re.compile("\s+")
Batch = namedtuple('Batch', 'srcs tokens lengths')

In [5]:
from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [7]:
encoder = SentenceEncoder(
    str(MODEL_PATH / "bilstm.93langs.2018-12-26.pt"),
    max_sentences=None,
    max_tokens=10000,
    cpu=False)
bpe_codes = str(MODEL_PATH / "93langs.fcodes")

for lang in ("en" ,"nl", 'es'): 
    Token(
        str(DATA_PATH / f"{lang}_resturant.csv"), ##english_resturant.txt
        str(CACHE_PATH / f"{lang}_resturant.csv"),
        lang=lang,
        romanize=False,
        lower_case=True, gzip=False,
        verbose=True)
    BPEfastApply(
        str(CACHE_PATH / f"{lang}_resturant.csv"),
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        bpe_codes,
        verbose=True, over_write=True)
    EncodeFile(
        encoder,
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        str(CACHE_PATH / f"{lang}_resturant.enc"),
        verbose=True, over_write=True)    
    
    
Token(
    str(DATA_PATH / f"en_val.csv"), ##english_resturant.txt
    str(CACHE_PATH / f"en_val.csv"),
    lang=lang,
    romanize=False,
    lower_case=True, gzip=False,
    verbose=True)
BPEfastApply(
    str(CACHE_PATH / f"en_val.csv"),
    str(CACHE_PATH / f"en_val.bpe"),
    bpe_codes,
    verbose=True, over_write=True)
EncodeFile(
    encoder,
    str(CACHE_PATH / f"en_val.bpe"),
    str(CACHE_PATH / f"en_val.enc"),
    verbose=True, over_write=True)    

 - Tokenizer: en_resturant.csv in language en  
 - fast BPE: processing en_resturant.csv
 - Encoder: en_resturant.bpe to en_resturant.enc
 - Encoder: 1656 sentences in 0s
 - Tokenizer: nl_resturant.csv in language nl  
 - fast BPE: processing nl_resturant.csv
 - Encoder: nl_resturant.bpe to nl_resturant.enc
 - Encoder: 971 sentences in 0s
 - Tokenizer: es_resturant.csv in language es  
 - fast BPE: processing es_resturant.csv
 - Encoder: es_resturant.bpe to es_resturant.enc
 - Encoder: 1533 sentences in 0s
 - Tokenizer: en_val.csv in language es  
 - fast BPE: processing en_val.csv
 - Encoder: en_val.bpe to en_val.enc
 - Encoder: 289 sentences in 0s


In [8]:
train_en, index_tr_en = IndexCreate(
    str(CACHE_PATH / "en_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
val_en, index_val_en = IndexCreate(
    str(CACHE_PATH / "en_val.enc"), 'FlatL2', verbose=True, save_index=False)

data_du, index_du = IndexCreate(
    str(CACHE_PATH / "nl_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_spanish, index_spanish = IndexCreate(
    str(CACHE_PATH / "es_resturant.enc"), 'FlatL2', verbose=True, save_index=False)

 - embedding: cache2/en_resturant.enc 1656 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/en_val.enc 289 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/nl_resturant.enc 971 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/es_resturant.enc 1533 examples of dim 1024
 - creating FAISS index


#### create word embeddings for aspect words

In [12]:
import  pickle
word2vec = pickle.load(open("/data/swati.tiwari/Kaggle/yelp/capability_absa/src/utils/word2vec_google.pkl", 'rb'))

word_embeddings ={}

word_embeddings['FOOD']  = word2vec.get_vector('FOOD')
word_embeddings['RESTAURANT']  = word2vec.get_vector('RESTAURANT')
word_embeddings['SERVICE']  = word2vec.get_vector('SERVICE')
word_embeddings['AMBIENCE']  = word2vec.get_vector('AMBIENCE')
word_embeddings['DRINKS']  = word2vec.get_vector('DRINKS')
word_embeddings['LOCATION']  = word2vec.get_vector('LOCATION')

#### Concatenate the sentence embedding and word embeddings 

In [13]:
val_aspects_ungrp2.reset_index(inplace=True , drop= True)
train_aspects_ungrp.reset_index(inplace=True , drop= True)
sp_aspects_ungrp2.reset_index(inplace=True , drop = True)
du_aspects_ungrp2.reset_index(inplace=True , drop = True)

In [14]:
ct_val_en= np.empty((0 , 1324))
for index , row in val_aspects_ungrp2.iterrows():

    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((val_en[index] , w2v ) , axis =0 ))
    ct_val_en = np.append(ct_val_en ,[res] , axis = 0 )


In [15]:
ct_tr_en= np.empty((0 , 1324))
for index , row in train_aspects_ungrp.iterrows():
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((train_en[index] , w2v ) , axis =0 ))
    ct_tr_en = np.append(ct_tr_en ,[res] , axis = 0 )

print(ct_tr_en.shape)

(1656, 1324)


In [16]:
ct_du= np.empty((0 , 1324))
for index , row in du_aspects_ungrp2.iterrows():
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((data_du[index] , w2v ) , axis =0 ))
    ct_du = np.append(ct_du ,[res] , axis = 0 )

print(ct_du.shape) ; print(du_aspects_ungrp2.shape)

(971, 1324)
(971, 3)


In [17]:
ct_spanish= np.empty((0 , 1324))
for index , row in sp_aspects_ungrp2.iterrows():
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((data_spanish[index] , w2v ) , axis =0 ))
    ct_spanish = np.append(ct_spanish ,[res] , axis = 0 )

print(ct_spanish.shape) ; print(sp_aspects_ungrp2.shape)

(1533, 1324)
(1533, 3)


In [18]:
def change_target(x):
    if x=='positive':
        return 2
    elif x =='negative':
        return 1
    else:
        return 0 

train_aspects_ungrp['polarities'] = train_aspects_ungrp['polarities'].apply(lambda x: change_target(x))
val_aspects_ungrp2['polarities'] = val_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))
sp_aspects_ungrp2['polarities'] = sp_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))
du_aspects_ungrp2['polarities'] = du_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))


##### Create polarities column as target. Standarize concatenated embedding dataset 

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
tr_eng =  train_aspects_ungrp['polarities'].values
val_eng =val_aspects_ungrp2['polarities'].values  
y_du  = du_aspects_ungrp2['polarities'].values
y_spainish  = sp_aspects_ungrp2['polarities'].values
tr_eng.shape , val_eng.shape , y_du.shape , y_spainish.shape

((1656,), (289,), (971,), (1533,))

In [20]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler().fit(ct_tr_en)
train_std = std_scale.transform(ct_tr_en) 
val_std = std_scale.transform(ct_val_en)
dutch_std = std_scale.transform(ct_du)
spanish_std = std_scale.transform(ct_spanish)

##### Create dataset for pytorch based multi class classification model

In [21]:
import torch 
import torch
import torch.nn as nn

x_train,y_train,x_valid,y_valid , x_test , y_test  , x_test_sp , y_test_sp = map(torch.FloatTensor, (train_std,tr_eng,  val_std ,\
                                                                            val_eng, dutch_std,y_du, \
                                                                           spanish_std ,y_spainish ))
n,c = x_train.shape
y_train = y_train.type(torch.LongTensor)
y_valid = y_valid.type(torch.LongTensor)
y_test = y_test.type(torch.LongTensor)
y_test_sp = y_test_sp.type(torch.LongTensor)

print(y_train.shape , y_valid.shape , y_test.shape)
print(x_train.shape , x_valid.shape , x_test.shape)
batch_size = 64

torch.Size([1656]) torch.Size([289]) torch.Size([971])
torch.Size([1656, 1324]) torch.Size([289, 1324]) torch.Size([971, 1324])


In [22]:
class Model(nn.Module):
    def __init__(self , p):
        super().__init__()
        self.hidden = nn.Linear(1324, 512)
        self.hidden2 = nn.Linear(512 , 256)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(256, 3)

    def forward(self, x):
        x = self.activation(self.dropout(self.hidden(x)))
        x = self.fc(x)
        return x

In [23]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size , shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid )
valid_dl = DataLoader(valid_ds , batch_size= batch_size)

test_ds = TensorDataset(x_test , y_test)
test_dl = DataLoader(test_ds , batch_size=batch_size)

test_ds2 = TensorDataset(x_test_sp , y_test_sp)
test_dl2 = DataLoader(test_ds2 , batch_size=batch_size)

In [24]:
class WrappedDataLoader():
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func
        
    def __len__(self): return len(self.dl)
    
    def __iter__(self):
        batches = iter(self.dl)
        for b in batches: yield(self.func(*b))

In [25]:

dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def preprocess(x,y): return x.to(dev),y.to(dev)

train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)
test_dl = WrappedDataLoader(test_dl , preprocess)
test_dl2 = WrappedDataLoader(test_dl2 , preprocess)

In [26]:


def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) 
    correct = max_preds.squeeze(1).eq(y)   
    return correct.sum() / torch.FloatTensor([y.shape[0]])

def f1_scorepy(preds , y):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    res = f1_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro')  
    prec = precision_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro')  
    rec = recall_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro') 
    return  res , prec , rec


In [27]:
from sklearn.metrics import f1_score , recall_score , precision_score

In [28]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0  
    epoch_f1 = 0    ; epoch_pr = 0 ; epoch_rec = 0
    model.train()
    ct = 0
    for x, y in iterator:
        optimizer.zero_grad()
        predictions = model(x)
        loss = criterion(predictions, y)
        acc = categorical_accuracy(predictions, y)
        f1 , pr , recall  = f1_scorepy(predictions , y) 
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1  ; epoch_pr += pr  ; epoch_rec += recall  
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_pr/len(iterator), epoch_rec/len(iterator)

In [29]:
def validate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0 
    epoch_f1 = 0 ; epoch_pr = 0 ; epoch_rec = 0
    model.eval()
    with torch.no_grad():
        for x ,y  in iterator:

            predictions = model(x)#.squeeze(1)
            loss = criterion(predictions,y)
            acc = categorical_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            f1 , pr , recall  = f1_scorepy(predictions , y)
            epoch_f1 += f1  ; epoch_pr += pr  ; epoch_rec += recall  
        
    return epoch_loss / len(iterator), epoch_acc /len(iterator) , epoch_f1/len(iterator), epoch_pr/len(iterator), epoch_rec/len(iterator)

In [30]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

from torch import optim

In [31]:
best_valid_f1 = -float('inf')
loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(dev)
drp = 0.5
model = Model(drp);
model.apply(init_weights)
model = model.to(dev)
optimizer = optim.Adam(model.parameters() , lr = 0.005, weight_decay=0.001) 
model = model.to(dev)

  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
N_EPOCHS = 7
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss , train_acc , train_f1 , train_precision , train_recall = train_model(model, train_dl, optimizer, loss_func)
    valid_loss , valid_acc , valid_f1 , valid_precision , valid_recall  = validate_model(model, valid_dl, loss_func)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print('train data' , train_acc , train_f1 , train_precision , train_recall)
        print('valid data' , valid_acc ,  valid_f1 , valid_precision , valid_recall)


        if os.path.isfile('results/sentiment_classification_problem.pt'):
            os.remove('results/sentiment_classification_problem.pt') ; print('chk')

        torch.save(model.state_dict(), 'results/sentiment_classification_problem.pt')

RuntimeError: size mismatch, m1: [64 x 512], m2: [64 x 3] at /opt/conda/conda-bld/pytorch_1544174967633/work/aten/src/THC/generic/THCTensorMathBlas.cu:266

##### Calculate sentiment prediction for Dutch dataset

In [39]:
test_preds = np.array([])
true_label = np.array([])
with torch.no_grad():
    for x ,y  in test_dl2:
        predictions = model(x)#.squeeze(1)
        max_preds = predictions.argmax(dim = 1, keepdim = True) 
        preds = max_preds.data.cpu().numpy()
        test_preds =np.append(test_preds , preds)
       
        true_label = np.append( true_label ,y.data.cpu().numpy())

sp_aspects_ungrp2['polarities_pred']  = test_preds
sp_aspects_ungrp2.polarities_pred = sp_aspects_ungrp2.polarities_pred.astype(int)

du1  = sp_aspects_ungrp2.groupby('text').polarities.apply(lambda x: ' '.join(map (str , x))).reset_index()
du2  = sp_aspects_ungrp2.groupby('text').polarities_pred.apply(lambda x: ' '.join(map (str , x))).reset_index()

sp_sentiment = pd.merge(du1 , du2 , on = ['text'])
sp_sentiment['polarities'] =sp_sentiment['polarities'].apply(lambda x: x.split(' '))
sp_sentiment['polarities_pred'] =sp_sentiment['polarities_pred'].apply(lambda x: x.split(' '))

from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(sp_sentiment.polarities)
val_eng = mlb.transform(sp_sentiment.polarities_pred)

print("F1 score",f1_score( tr_eng , val_eng  , average='macro' ))
print("Precision score",precision_score(tr_eng , val_eng  , average='macro' ))
print("Recall score",recall_score(tr_eng , val_eng  , average='macro' ))

##### Calculate sentiment prediction for Spanishdataset

In [41]:
test_preds = np.array([])
true_label = np.array([])
with torch.no_grad():
    for x ,y  in valid_dl:
        predictions = model(x)#.squeeze(1)
        max_preds = predictions.argmax(dim = 1, keepdim = True) 
        preds = max_preds.data.cpu().numpy()
        test_preds =np.append(test_preds , preds)
       
        true_label = np.append( true_label ,y.data.cpu().numpy())

val_aspects_ungrp2['polarities_pred']  = test_preds
val_aspects_ungrp2.polarities_pred = val_aspects_ungrp2.polarities_pred.astype(int)

du1  = val_aspects_ungrp2.groupby('text').polarities.apply(lambda x: ' '.join(map (str , x))).reset_index()
du2  = val_aspects_ungrp2.groupby('text').polarities_pred.apply(lambda x: ' '.join(map (str , x))).reset_index()

sp_sentiment = pd.merge(du1 , du2 , on = ['text'])
sp_sentiment['polarities'] =sp_sentiment['polarities'].apply(lambda x: x.split(' '))
sp_sentiment['polarities_pred'] =sp_sentiment['polarities_pred'].apply(lambda x: x.split(' '))

from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(sp_sentiment.polarities)
val_eng = mlb.transform(sp_sentiment.polarities_pred)

print("F1 score",f1_score( tr_eng , val_eng  , average='macro' ))
print("Precision score",precision_score(tr_eng , val_eng  , average='macro' ))
print("Recall score",recall_score(tr_eng , val_eng  , average='macro' ))