In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# save the trained model to, paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tensorflow-text

import tensorflow_text as text
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 3.5 MB/s 
Collecting tensorflow<2.12,>=2.11.0
  Downloading tensorflow-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 19 kB/s 
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 55.2 MB/s 
Collecting flatbuffers>=2.0
  Downloading flatbuffers-22.12.6-py2.py3-none-any.whl (26 kB)
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 70.5 MB/s 
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)


In [None]:
# Load the reviews, discard the neutral 3-star rows, and append a binary
# `is_positive` flag: 1 for ratings of 4 and above, 0 otherwise.
df = (
    pd.read_csv('/content/drive/MyDrive/data_for_colab/tripadvisor_hotel_reviews.csv')
    .loc[lambda reviews: reviews['Rating'] != 3]
    .assign(is_positive=lambda reviews: (reviews['Rating'] >= 4).astype(int))
)

In [None]:
# Rebind instead of dropping in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once 'Rating' is gone.
df = df.drop(columns=['Rating'], errors='ignore')

In [None]:
# Relabel the two remaining columns by position (review text first, label
# second), then display the frame.
df = df.set_axis(['DATA_COLUMN', 'LABEL_COLUMN'], axis='columns')
df

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,nice hotel expensive parking got good deal sta...,1
1,ok nothing special charge diamond member hilto...,0
3,"unique, great stay, wonderful time hotel monac...",1
4,"great stay great stay, went seahawk game aweso...",1
5,love monaco staff husband stayed hotel crazy w...,1
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",1
20487,great location price view hotel great quick pl...,1
20488,"ok just looks nice modern outside, desk staff ...",0
20489,hotel theft ruined vacation hotel opened sept ...,0


In [None]:
# Class balance of the full dataset: number of rows per label value.
df['LABEL_COLUMN'].value_counts()

1    15093
0     3214
Name: LABEL_COLUMN, dtype: int64

In [None]:
# Rows whose label is 1 (positive reviews).
df_positive = df.loc[df['LABEL_COLUMN'].eq(1)]

In [None]:
# Rows whose label is 0 (negative reviews).
df_negative = df.loc[df['LABEL_COLUMN'].eq(0)]

In [None]:
# Test set

In [None]:
# Test rows are taken from the END of each class subset: three quarters of the
# negative reviews and three twentieths of the positive ones.
n_test = 3 * (len(df_negative) // 4)  # original setting: (df_negative.shape[0] // 4) * 3
df_negative_test = df_negative.tail(n_test)

n_test = 3 * (len(df_positive) // 20)  # original setting: (df_positive.shape[0] // 20) * 3
df_positive_test = df_positive.tail(n_test)

In [None]:
# Inspect the positive rows selected for the test set.
df_positive_test

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
17565,wonderful hotel wonderful town unbeknownst man...,1
17566,wonderful wonderful wonderful just returned ye...,1
17567,"affinia 50 you'all, stayed affinia 50 week, li...",1
17568,just right good location walking distance bloo...,1
17569,loved place want thank outstanding service goi...,1
...,...,...
20480,great play stay stay loyal inn package deal ha...,1
20482,great choice wife chose best western quite bit...,1
20483,good bed clean convenient just night happy sta...,1
20486,"best kept secret 3rd time staying charm, not 5...",1


In [None]:
# Stack the negative test rows on top of the positive ones; original index
# labels are kept.
df_balanced_test = pd.concat(objs=[df_negative_test, df_positive_test], axis=0)

In [None]:
# Peek at ten random test rows; the fixed seed keeps the displayed sample
# identical across kernel restarts.
df_balanced_test.sample(10, random_state=42)

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
17827,gaylord october 2007 overall stay gaylord suit...,1
9419,crap breakfast looked reviews couple hotels am...,0
6595,terrible place stay family miami fl plus coupl...,0
18811,"greaat time time time dominican republic, trav...",1
7814,great location air travelers planned reserved ...,0
14210,not quite par paradisus puerto rico wife child...,0
7999,horrible layne hotel lured hotel phone incredi...,0
5585,not best choice staff courteous location conve...,0
18618,okay issues husband just got 10 day stay majes...,0
10434,great location bad hotel does not live website...,0


In [None]:
# Class balance of the test set: number of rows per label value.
df_balanced_test['LABEL_COLUMN'].value_counts()

0    2409
1    2262
Name: LABEL_COLUMN, dtype: int64

In [None]:
# Training set

In [None]:
# Training rows are taken from the START of each class subset: one quarter of
# the negative reviews and one twentieth of the positive ones.
n_train = len(df_negative) // 4  # original setting: df_negative.shape[0] // 4
df_negative_train = df_negative.head(n_train)

n_train = len(df_positive) // 20  # original setting: df_positive.shape[0] // 20
df_positive_train = df_positive.head(n_train)

In [None]:
# Stack the negative training rows on top of the positive ones; original index
# labels are kept.
df_balanced_train = pd.concat(objs=[df_negative_train, df_positive_train], axis=0)

In [None]:
# Class balance of the training set: number of rows per label value.
df_balanced_train['LABEL_COLUMN'].value_counts()

0    803
1    754
Name: LABEL_COLUMN, dtype: int64

In [None]:
# Peek at ten random training rows; the fixed seed keeps the displayed sample
# identical across kernel restarts.
df_balanced_train.sample(10, random_state=42)

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
74,interesting comfortable base seattle really en...,1
843,good location adequate hotel embassy suite-bil...,1
994,just returned clift wife just returned wonderf...,1
851,nice hotel not quite ritz carlton standards rc...,1
2540,camp conquistador going caribbean presidents w...,0
2887,disgusting rooms disgusting dirty infested mos...,0
2861,kicked parents decided great sister resort fam...,0
4090,potential csme short husband went mexico city ...,0
2493,"beautiful hotel terrible houskeeping, just ret...",0
453,"old rude blah reason stayed jacuzzi suite, wis...",0


In [None]:
# Selecting a single column already yields a Series. The former .squeeze() was
# redundant and would have reduced a one-row selection to a bare scalar.
X_train = df_balanced_train['DATA_COLUMN']
y_train = df_balanced_train['LABEL_COLUMN']

In [None]:
# Selecting a single column already yields a Series. The former .squeeze() was
# redundant and would have reduced a one-row selection to a bare scalar.
X_test = df_balanced_test['DATA_COLUMN']
y_test = df_balanced_test['LABEL_COLUMN']

In [None]:
# Text preprocessing layer loaded from TF Hub. The uncased English BERT
# preprocessor is used; the original author notes it is the same one as BERT's.
mobilebert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
)



In [None]:
# MobileBERT encoder loaded from TF Hub; trainable=True means its weights are
# updated during training rather than frozen.
mobilebert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/mobilebert_en_uncased_L-24_H-128_B-512_A-4_F-4_OPT/1",
    trainable=True,
)

In [None]:
# Symbolic input: one scalar string per example.
text_input = tf.keras.Input(shape=(), dtype=tf.string, name='text')
# Raw text -> preprocessing layer -> encoder; `outputs` is the encoder's result
# (indexed by 'pooled_output' further down).
preprocessed_text = mobilebert_preprocess(text_input)
outputs = mobilebert_encoder(preprocessed_text)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# Classification head on the encoder's pooled output: 10% dropout feeding a
# single sigmoid unit.
l = tf.keras.layers.Dense(units=1, activation='sigmoid', name="output")(
    tf.keras.layers.Dropout(rate=0.1, name="dropout")(outputs['pooled_output'])
)


In [None]:
# Assemble the functional model: the string input tensor in, the sigmoid
# output tensor out.
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
# Print the layer-by-layer overview of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
# Metrics reported during training: binary accuracy, precision and recall.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# The encoder is loaded with trainable=True, so every pretrained weight is
# updated during fit. The string shortcut 'adam' uses Keras' default step size
# of 1e-3, which is far larger than the small rates normally used when
# fine-tuning BERT-style encoders and can wipe out the pretrained weights.
# An explicit small learning rate is used instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics=METRICS,
)

In [None]:
# Train for 10 epochs on the training split; every other fit option is left at
# its Keras default.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f91006d26a0>

In [None]:
# Score every test review with the trained model and show the raw outputs.
y_predicted = model.predict(x=X_test)
y_predicted



array([[0.05955774],
       [0.01809953],
       [0.00279263],
       ...,
       [0.03209257],
       [0.12130452],
       [0.99999607]], dtype=float32)

In [None]:
# Turn the scores into hard labels: 1 when strictly above 0.5, otherwise 0.
y_predicted = (y_predicted > 0.5).astype(int)
y_predicted

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [None]:
# Show the ground-truth test labels for comparison with the predictions.
y_test

4133     0
4134     0
4135     0
4136     0
4140     0
        ..
20480    1
20482    1
20483    1
20486    1
20487    1
Name: LABEL_COLUMN, Length: 4671, dtype: int64

In [None]:
# Share of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_predicted)

0.7679297794904731

In [None]:
# Precision on the test set for the positive class (label 1).
precision_score(y_true=y_test, y_pred=y_predicted)


0.8880105401844532

In [None]:
# Recall on the test set for the positive class (label 1).
recall_score(y_true=y_test, y_pred=y_predicted)

0.5959328028293546

In [None]:
# F1 score on the test set for the positive class (label 1).
f1_score(y_true=y_test, y_pred=y_predicted)

0.7132275132275132

In [None]:
# Empty results table with one column per evaluation metric; rows are added
# later, one per evaluated dataset.
df_results_for_reviews_on_hotels = pd.DataFrame(
    columns=['accuracy', 'precision', 'recall', 'f1_score'],
)

In [None]:
# Record the four test-set scores in the row labelled 'reviews on hotels',
# filling the columns in the order accuracy, precision, recall, f1_score.
for metric_name, metric_fn in (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score),
):
    df_results_for_reviews_on_hotels.loc['reviews on hotels', metric_name] = metric_fn(y_test, y_predicted)


In [None]:
# Destination on the mounted Drive where the trained model is written.
saved_model_path = '/content/drive/MyDrive/data_for_colab/mobilebert_trained_on_hotel_reviews_26_december'

In [None]:
# Show the path the model is about to be saved to.
print(saved_model_path)

/content/drive/MyDrive/data_for_colab/mobilebert_trained_on_hotel_reviews_26_december


In [None]:
# Persist the trained model, together with its optimizer state, at the chosen
# path.
model.save(filepath=saved_model_path, include_optimizer=True)



In [None]:
# Display the collected evaluation metrics.
df_results_for_reviews_on_hotels


Unnamed: 0,accuracy,precision,recall,f1_score
reviews on hotels,0.76793,0.888011,0.595933,0.713228
