# Tripadvisor Baseline

#### Andreas Pirchner

In [0]:
import numpy as np
import pandas as pd
from itertools import product

from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix, classification_report

## Preprocessing

Load training and test data:

In [0]:
# Folder holding the annotated TripAdvisor corpus; both splits are read from it.
TRIPAD_DIR = '/home/andreas/Documents/msc_info/sem_2/nlp/data/tripadvisor/'

# NOTE(review): preprocess_tripad_annotated is neither imported nor defined in the
# visible cells -- confirm its source so the notebook survives Restart & Run All.
train = preprocess_tripad_annotated(TRIPAD_DIR)
# The positional False presumably selects the test split -- verify against the helper.
test = preprocess_tripad_annotated(TRIPAD_DIR, False)

In [0]:
# Preview the first rows of the training frame (aspects, polarities, segment text).
train.head()

Unnamed: 0,aspects,polarities,segment
0,[OTHER],[p],[CLS] LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!...
1,[NOTRELATED],[x],[CLS] Just returned from a 3 night stay. [SEP]
2,[OTHER],[p],[CLS] This is a FABULOUS hotel. [SEP]
3,[SERVICE],[p],"[CLS] The front desk staff, the doormen, the b..."
4,[ROOMS],[p],[CLS] The room was fabulous too. [SEP]


The shapes of the two sets:

In [0]:
# (rows, columns) of the training frame.
train.shape

(2773, 3)

In [0]:
# (rows, columns) of the test frame.
test.shape

(1272, 3)

Compute BERT embeddings for the training and test sets:

In [0]:
# Embedding helper used below to turn segments into BERT tensors.
# NOTE(review): Embeddings is neither imported nor defined in the visible cells --
# confirm where it comes from.
embed = Embeddings()

In [0]:
# Embed the segment column of each split, training split first, with the same settings.
# NOTE(review): the meaning of all=False is defined by Embeddings.get_embeddings,
# which is not visible here.
train_embeddings, test_embeddings = (
    embed.get_embeddings(split.segment, all=False) for split in (train, test)
)

In [0]:
# Shape of the embedding tensor produced for the first training segment.
train_embeddings[0].shape

torch.Size([26, 768])

In [0]:
# Reduce every per-segment tensor to its first row (index 0), converted to NumPy,
# and store one vector per row in a new 'embeddings' column.
# Presumably row 0 is the [CLS] token vector, since segments start with [CLS] -- confirm.
train['embeddings'] = [tensor[0].numpy() for tensor in train_embeddings]
test['embeddings'] = [tensor[0].numpy() for tensor in test_embeddings]

## XGBoost Baseline

### Model for aspect

In [0]:
# Learn the aspect label vocabulary from the training split only.
le_aspect = MultiLabelBinarizer().fit(train['aspects'])
# Show the label order; it defines the column order of the binarized targets.
le_aspect.classes_

array(['BUILDING', 'BUSINESS', 'CHECKIN', 'CLEANLINESS', 'FOOD',
       'LOCATION', 'NOTRELATED', 'OTHER', 'ROOMS', 'SERVICE', 'VALUE'],
      dtype=object)

In [0]:
# Binary indicator matrices (one column per aspect class) for both splits,
# encoded with the binarizer fitted on the training labels.
y_train_aspect, y_test_aspect = (
    le_aspect.transform(split.aspects) for split in (train, test)
)
y_train_aspect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [0]:
# Base learner: one binary-logistic XGBoost model is trained per aspect label.
aspect_base = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    max_depth=3,
    learning_rate=0.05,
    n_jobs=24,
    verbosity=1,
    min_child_weight=20,
    scale_pos_weight=7,
)
xgb_aspect = OneVsRestClassifier(aspect_base)

# Turn the column of per-row vectors into a single 2-D feature matrix.
X_train_aspect = np.array(train['embeddings'].tolist())
xgb_aspect.fit(X_train_aspect, y_train_aspect)

OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bytree=1, gamma=0,
                                            learning_rate=0.05,
                                            max_delta_step=0, max_depth=3,
                                            min_child_weight=20, missing=None,
                                            n_estimators=500, n_jobs=24,
                                            nthread=None,
                                            objective='binary:logistic',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=7,
                                            seed=None, silent=True, subsample=1,
                                            verbosity=1),
                    n_jobs=None)

In [0]:
# Score the aspect model on the test split and print per-class precision/recall/F1.
X_test_aspect = np.array(test['embeddings'].tolist())
y_pred_aspect = xgb_aspect.predict(X_test_aspect)
aspect_report = classification_report(
    y_test_aspect,
    y_pred_aspect,
    target_names=le_aspect.classes_,
)
print(aspect_report)

              precision    recall  f1-score   support

    BUILDING       0.44      0.22      0.29        64
    BUSINESS       0.00      0.00      0.00         7
     CHECKIN       0.75      0.26      0.39        46
 CLEANLINESS       0.74      0.28      0.41        71
        FOOD       0.79      0.60      0.68       117
    LOCATION       0.69      0.53      0.60       149
  NOTRELATED       0.54      0.20      0.29       148
       OTHER       0.59      0.73      0.65       386
       ROOMS       0.68      0.73      0.70       220
     SERVICE       0.60      0.59      0.59       210
       VALUE       0.70      0.40      0.51        75

   micro avg       0.63      0.55      0.59      1493
   macro avg       0.59      0.41      0.46      1493
weighted avg       0.63      0.55      0.57      1493
 samples avg       0.55      0.57      0.54      1493



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Model for polarity

In [0]:
# Learn the polarity label vocabulary from the training split only.
le_polarity = MultiLabelBinarizer().fit(train['polarities'])
# Show the label order; it defines the column order of the binarized targets.
le_polarity.classes_

array(['n', 'p', 'x'], dtype=object)

In [0]:
# Binary indicator matrices (one column per polarity class) for both splits,
# encoded with the binarizer fitted on the training labels.
y_train_polarity, y_test_polarity = (
    le_polarity.transform(split.polarities) for split in (train, test)
)

In [0]:
# Base learner: one binary-logistic XGBoost model is trained per polarity label.
# NOTE(review): the recorded output of this cell shows max_depth=4 while the code
# says max_depth=3 -- the cell was presumably edited after its last run. Re-run the
# notebook (or restore the intended depth) so the reported scores match the code.
polarity_base = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    max_depth=3,
    learning_rate=0.05,
    n_jobs=24,
    verbosity=1,
    min_child_weight=20,
    scale_pos_weight=6,
)
xgb_polarity = OneVsRestClassifier(polarity_base)

# Turn the column of per-row vectors into a single 2-D feature matrix.
X_train_polarity = np.array(train['embeddings'].tolist())
xgb_polarity.fit(X_train_polarity, y_train_polarity)

OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bytree=1, gamma=0,
                                            learning_rate=0.05,
                                            max_delta_step=0, max_depth=4,
                                            min_child_weight=20, missing=None,
                                            n_estimators=500, n_jobs=24,
                                            nthread=None,
                                            objective='binary:logistic',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=6,
                                            seed=None, silent=True, subsample=1,
                                            verbosity=1),
                    n_jobs=None)

In [0]:
# Score the polarity model on the test split and print per-class precision/recall/F1.
X_test_polarity = np.array(test['embeddings'].tolist())
y_pred_polarity = xgb_polarity.predict(X_test_polarity)
polarity_report = classification_report(
    y_test_polarity,
    y_pred_polarity,
    target_names=le_polarity.classes_,
)
print(polarity_report)

              precision    recall  f1-score   support

           n       0.58      0.78      0.67       272
           p       0.77      0.91      0.83       797
           x       0.47      0.30      0.36       251

   micro avg       0.69      0.77      0.73      1320
   macro avg       0.61      0.66      0.62      1320
weighted avg       0.68      0.77      0.71      1320
 samples avg       0.72      0.77      0.73      1320



  'precision', 'predicted', average, warn_for)
