In [1]:
# nostri import 
import random
import pickle

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text #necessaria per hub.load

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the base review table and the two derived tables in one statement.
# NOTE(review): judging only by the file names, the "sinonimi" and "embedding"
# CSVs presumably contain synonym- and embedding-based rewrites of the
# reviews - confirm against whatever produced them.
df, data_sinonimi, data_embedding = (
    pd.read_csv('../Reviews.csv'),
    pd.read_csv('../amazon_sinonimi.csv'),
    pd.read_csv('../amazon_embedding.csv'),
)

In [3]:
def combine_summary_and_text(frame):
    """Return the model input text for every row of ``frame``.

    The ``Summary`` and ``Text`` columns are concatenated as-is (no separator).
    A row where either column is missing produces NaN, which is replaced by an
    empty string.

    The result is returned instead of being patched in place: calling
    ``frame.review.fillna("", inplace=True)`` is chained assignment, which
    warns on recent pandas and silently does nothing under copy-on-write,
    leaving NaN values in the column.

    NOTE(review): the missing separator and the "any NaN -> empty review" rule
    are kept on purpose so the evaluated text stays what the notebook has
    always fed to the model - confirm this matches the training preprocessing.
    """
    return (frame['Summary'] + frame['Text']).fillna("")


df['review'] = combine_summary_and_text(df)
data_sinonimi['review'] = combine_summary_and_text(data_sinonimi)
data_embedding['review'] = combine_summary_and_text(data_embedding)

In [4]:
# Binary sentiment label: a score below 4 is "negative", anything else "positive".
df["review_type"] = np.where(df["Score"] < 4, "negative", "positive")

In [5]:
# Sanity check: (rows, columns) of each loaded table, in load order.
for frame in (df, data_sinonimi, data_embedding):
    print(frame.shape)

(568454, 12)
(152598, 5)
(149967, 5)


In [6]:
# Single seed shared by every random number generator the notebook imports,
# and reused as `random_state` for the train/test split.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)         # Python's built-in RNG (imported above, previously left unseeded)
np.random.seed(RANDOM_SEED)      # NumPy's legacy global RNG
tf.random.set_seed(RANDOM_SEED)  # TensorFlow's global seed

In [7]:
# One-hot encode the sentiment labels of the original reviews into a dense array
# with one row per review and one column per distinct label.
# The encoder's default sparse result is densified with .toarray() instead of
# passing `sparse=False`: that keyword was renamed to `sparse_output` in newer
# scikit-learn releases and the old spelling now raises TypeError, whereas this
# form runs unchanged on both old and new versions.
type_one_hot = OneHotEncoder().fit_transform(
  df.review_type.to_numpy().reshape(-1, 1)
).toarray()

In [8]:
# Split the original reviews 70/30 with the fixed seed and keep only the
# held-out 30% (texts and their one-hot labels); the training part is discarded.
# NOTE(review): presumably this reproduces the split used when the weights were
# trained - confirm test_size and random_state against the training notebook.
split_options = {"test_size": .3, "random_state": RANDOM_SEED}
_, test_reviews, _, y_test = train_test_split(df.review, type_one_hot, **split_options)

In [9]:
# TF-Hub handle of the sentence encoder used as the first trainable-free stage
# of the classifier built in the next cell.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
embed = hub.load(USE_MODEL_URL)

In [10]:
# Sentiment classifier: raw string -> sentence embedding -> dense head -> 2-way softmax.
model = keras.Sequential()

# Every sample is one raw string, so a batch reaches the model as (batch, 1).
model.add(keras.layers.Input(shape=(1,), dtype=tf.string))
# Drop only the trailing size-1 axis so the encoder receives a (batch,) vector.
# A bare tf.squeeze() removes *every* size-1 axis, so a batch that happens to
# contain a single sample would collapse to a scalar.
model.add(keras.layers.Lambda(lambda x: tf.squeeze(tf.cast(x, tf.string), axis=-1)))
# Pre-trained sentence encoder loaded from TF-Hub above, producing 512-d vectors.
# NOTE(review): the previous comment called this a convolutional network; the
# model card of the "-large" multilingual encoder describes a Transformer - confirm.
model.add(hub.KerasLayer(handle=embed, output_shape=[512]))
# Classification head: two ReLU layers, each followed by 20% dropout.
model.add(keras.layers.Dense(units=256, activation='relu'))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(rate=0.2))
# Two output units, matching the two columns of the one-hot labels.
model.add(keras.layers.Dense(2, activation='softmax'))
# Loss/metric order matters downstream: evaluate() returns [loss, accuracy].
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda (Lambda)              None                      0         
_________________________________________________________________
keras_layer (KerasLayer)     (None, 512)               85213184  
_________________________________________________________________
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 2

In [11]:
# Restore previously saved weights into the architecture built in the cell above;
# the file is resolved relative to the notebook's working directory.
model.load_weights(filepath='model.h5')

In [12]:
# Load a third derived dataset and build its model input text.
# NOTE(review): the file name and the section header further down suggest this
# is the synonym variant generated with a 0.05 swap rate - confirm.
data_sinonimi_005 = pd.read_csv('../amazon_sinonimi005.csv')
# Summary and Text are concatenated as-is; rows where either is missing become "".
# The fill is assigned rather than done with `.review.fillna(..., inplace=True)`,
# which is chained assignment and silently does nothing under pandas copy-on-write.
data_sinonimi_005['review'] = (
    data_sinonimi_005['Summary'] + data_sinonimi_005['Text']
).fillna("")
# The source columns are no longer needed once `review` exists.
data_sinonimi_005 = data_sinonimi_005.drop(columns=['Summary', 'Text'])

In [13]:
def one_hot_labels(labels):
    """Return a dense one-hot matrix for a pandas Series of class labels.

    A fresh ``OneHotEncoder`` is fitted on ``labels`` and its default sparse
    result is densified with ``.toarray()``. This replaces ``sparse=False``,
    which newer scikit-learn releases renamed to ``sparse_output`` (the old
    spelling now raises ``TypeError``); the form used here works on both.

    NOTE(review): because every call fits its own encoder, the column layout
    is derived from the labels present in that particular dataset. The
    evaluations below are only comparable if every dataset contains the same
    two label strings - confirm for the augmented CSVs.
    """
    return OneHotEncoder().fit_transform(
        labels.to_numpy().reshape(-1, 1)
    ).toarray()


# One label matrix per dataset that is evaluated further down.
df_one_hot = one_hot_labels(df.review_type)
sinonimi_one_hot = one_hot_labels(data_sinonimi.review_type)
sinonimi_005_one_hot = one_hot_labels(data_sinonimi_005.review_type)
embedding_one_hot = one_hot_labels(data_embedding.review_type)

## Eval test relativo al training set usato

In [None]:
# Score the model on the held-out 30% of the original reviews.
# `result` holds [loss, accuracy], following the compile() call above.
result = model.evaluate(x=test_reviews, y=y_test)

  25/5330 [..............................] - ETA: 2:42:20 - loss: 0.2530 - accuracy: 0.9158

In [14]:
# Index 1 of the evaluate() result is the accuracy (index 0 is the loss).
# NOTE(review): `result` is shared by every evaluation cell, so this prints
# whichever evaluation was executed last - run the cell above first.
print('Accuracy: {}'.format(result[1]))

Accuracy: 0.9212077260017395


## Eval test relativo a tutti i sinonimi

In [None]:
# Score the model on the full "sinonimi" dataset; overwrites the shared `result`.
result = model.evaluate(x=data_sinonimi.review, y=sinonimi_one_hot)



In [16]:
# Accuracy is the second entry of the evaluate() result (the first is the loss).
# NOTE(review): prints whatever the most recently executed evaluation left in `result`.
print('Accuracy: {}'.format(result[1]))

Accuracy: 0.8062229156494141


## Eval test relativo ai sinonimi con swap 0.05

In [None]:
# Score the model on the "sinonimi005" dataset; overwrites the shared `result`.
result = model.evaluate(x=data_sinonimi_005.review, y=sinonimi_005_one_hot)

  72/5427 [..............................] - ETA: 3:29:12 - loss: 0.2788 - accuracy: 0.9129

In [15]:
# Accuracy is the second entry of the evaluate() result (the first is the loss).
# NOTE(review): prints whatever the most recently executed evaluation left in `result`.
print('Accuracy: {}'.format(result[1]))

Accuracy: 0.9038503766059875


## Eval test relativo a tutti gli embedding

In [None]:
# Score the model on the full "embedding" dataset; overwrites the shared `result`.
result = model.evaluate(x=data_embedding.review, y=embedding_one_hot)

   6/4687 [..............................] - ETA: 3:48:59 - loss: 0.3306 - accuracy: 0.9167

In [18]:
# Accuracy is the second entry of the evaluate() result (the first is the loss).
# NOTE(review): prints whatever the most recently executed evaluation left in `result`.
print('Accuracy: {}'.format(result[1]))

Accuracy: 0.9071662425994873
