In [1]:
# nostri import 
import random
import pickle

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text #necessaria per hub.load
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Single seed shared by every source of randomness in the notebook so that
# sampling, splitting and any TensorFlow ops are repeatable across runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)  # stdlib `random` is imported above but was left unseeded
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [39]:
# Load each CSV exactly once. The synonym-augmented file feeds both the
# combined frame and the standalone `data_sinonimi` evaluation set; pd.concat
# copies its inputs, so later edits to one frame do not leak into the other.
data_sinonimi = pd.read_csv('../amazon_sinonimi.csv')
data_embedding = pd.read_csv('../amazon_embedding.csv')

# Combined frame: synonym-augmented reviews followed by the reduced original
# reviews, then shuffled deterministically.
df = pd.concat([data_sinonimi, pd.read_csv('../reducedReviews.csv')], ignore_index=True)
df = df.sample(frac=1, random_state=RANDOM_SEED)

In [4]:
# Merge the summary and body into a single `review` text column and drop the
# two source columns.
# The fill is applied to the concatenated Series *before* assignment: calling
# `df.review.fillna(..., inplace=True)` operates on an intermediate object and
# stops updating `df` once pandas Copy-on-Write is active.
# NOTE(review): a row with a missing Summary or Text still ends up with an
# empty review (NaN + str is NaN), and the two parts are joined without a
# separator -- kept as-is to match the original preprocessing.
df['review'] = (df['Summary'] + df['Text']).fillna("")
df = df.drop(columns=['Summary', 'Text'])

In [5]:
def _label_for_score(score):
    """Map a review score to its sentiment label: below 4 is negative, otherwise positive."""
    if score < 4:
        return "negative"
    return "positive"


# Binary sentiment label derived from the numeric score.
df["review_type"] = df["Score"].map(_label_for_score)

In [6]:
# Report (rows, columns) for the combined frame and the two augmented datasets.
for frame in (df, data_sinonimi, data_embedding):
    print(frame.shape)

(326244, 3)
(152598, 4)
(149967, 4)


In [7]:
# Split the reviews by sentiment label.
positive_reviews = df[df.review_type == "positive"]
negative_reviews = df[df.review_type == "negative"]

# Balance the classes by down-sampling both to the size of the smaller one.
n_per_class = min(len(negative_reviews), len(positive_reviews))
positive_df = positive_reviews.sample(n=n_per_class, random_state=RANDOM_SEED)
negative_df = negative_reviews.sample(n=n_per_class, random_state=RANDOM_SEED)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent (positives first, then negatives, with a fresh 0..n-1 index).
review_df = pd.concat([positive_df, negative_df]).reset_index(drop=True)

In [8]:
# One-hot encode the sentiment labels as a dense 2-D array.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so this form works on every version.
# OneHotEncoder sorts the categories, so with the "negative"/"positive" labels
# column 0 is negative and column 1 is positive.
type_one_hot = (
    OneHotEncoder()
    .fit_transform(review_df.review_type.to_numpy().reshape(-1, 1))
    .toarray()
)

In [9]:
# Hold out 30% of the balanced reviews (and their one-hot labels) for testing;
# the fixed seed keeps the partition reproducible.
(train_reviews, test_reviews,
 y_train, y_test) = train_test_split(
    review_df.review, type_one_hot,
    test_size=0.3, random_state=RANDOM_SEED,
)

In [20]:
# Sentence encoder fetched from TF Hub. `tensorflow_text` is imported at the
# top of the notebook because hub.load needs it for this module.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
embed = hub.load(USE_MODEL_URL)

In [21]:
# Sentiment classifier: frozen sentence encoder followed by a small dense head
# with a 2-way softmax (one unit per one-hot label column).
model = keras.Sequential()

# Each sample is a single string, fed with shape (batch, 1).
model.add(keras.layers.Input(shape=(1,), dtype=tf.string))
# Drop only the trailing length-1 axis so the encoder receives a 1-D batch of
# strings. Squeezing without `axis` would also remove the batch axis whenever a
# batch holds a single sample (e.g. a final partial batch or a one-item
# predict), handing the encoder a scalar. The input is already tf.string, so no
# cast is needed.
model.add(keras.layers.Lambda(lambda x: tf.squeeze(x, axis=1)))
# Pre-trained multilingual Universal Sentence Encoder (large variant; a
# Transformer-based model per its TF Hub model card), producing 512-d embeddings.
model.add(hub.KerasLayer(handle=embed, output_shape=512))
model.add(keras.layers.Dense(units=256, activation='relu'))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(2, activation='softmax'))
# Accuracy is the only extra metric, so model.evaluate reports loss and accuracy.
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda (Lambda)              None                      0         
_________________________________________________________________
keras_layer (KerasLayer)     (None, 512)               85213184  
_________________________________________________________________
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 2

In [22]:
# Restore previously saved weights into the freshly built architecture; the
# file is resolved relative to the notebook's working directory.
WEIGHTS_PATH = 'model.h5'
model.load_weights(WEIGHTS_PATH)

In [40]:
def _with_review_column(frame):
    """Return `frame` with `Summary` and `Text` merged into a single `review` column.

    The two text columns are concatenated (no separator), missing results are
    replaced by an empty string, and the source columns are dropped. A new
    frame is returned; the input is not modified.

    A frame that has already been converted (has `review`, lacks both source
    columns) is returned unchanged, so re-running the cell does not fail.

    NOTE(review): if either `Summary` or `Text` is missing, the concatenation
    is NaN and the whole review becomes "" -- kept to match the original
    preprocessing.
    """
    if 'review' in frame.columns and not {'Summary', 'Text'} & set(frame.columns):
        return frame
    # Fill on the concatenated Series before assigning it: an in-place fillna
    # on `frame.review` acts on an intermediate object and stops updating the
    # frame once pandas Copy-on-Write is active.
    review = (frame['Summary'] + frame['Text']).fillna("")
    return frame.assign(review=review).drop(columns=['Summary', 'Text'])


# `df` is rebound here to the original (non-augmented) reviews only.
df = _with_review_column(pd.read_csv('../reducedReviews.csv'))

# Same preprocessing for the two augmented datasets loaded earlier.
data_sinonimi = _with_review_column(data_sinonimi)
data_embedding = _with_review_column(data_embedding)

In [44]:
def _one_hot_labels(labels):
    """One-hot encode a Series of class labels into a dense 2-D float array.

    The encoder is fitted on `labels` itself and its categories are sorted, so
    columns follow the sorted label values. The default sparse output is
    densified with .toarray() instead of passing `sparse=False`, a keyword that
    newer scikit-learn releases no longer accept.
    """
    return OneHotEncoder().fit_transform(labels.to_numpy().reshape(-1, 1)).toarray()


# NOTE(review): `review_type` is read straight from the frames here; this cell
# assumes the CSV files already ship that column -- confirm.
df_one_hot = _one_hot_labels(df.review_type)
# Keep only the 30% test portion of the original reviews; the training portion
# is discarded.
_, df_test_reviews, _, df_y_test = train_test_split(df.review,
                                                    df_one_hot,
                                                    test_size=.3,
                                                    random_state=RANDOM_SEED)

# The augmented datasets are evaluated in full, so they only need labels.
sinonimi_one_hot = _one_hot_labels(data_sinonimi.review_type)

embedding_one_hot = _one_hot_labels(data_embedding.review_type)

## Eval test relativo al training set usato 

In [None]:
# Score the model on the held-out 30% of the balanced, combined review set.
result = model.evaluate(x=test_reviews, y=y_test)

   7/3059 [..............................] - ETA: 2:41:17 - loss: 0.3255 - accuracy: 0.8728

In [32]:
# The model is compiled with a loss plus metrics=['accuracy'], so index 1 of
# the evaluate result is the accuracy (index 0 is the loss).
# NOTE(review): every evaluation cell stores its output in the same `result`
# name, so this prints whichever evaluation ran most recently -- confirm the
# cells were executed in order.
print(f'Accuracy: {result[1]}')

Accuracy: 0.8639228940010071


## Eval test relativo al training set originale

In [54]:
# Score the model on the 30% test split drawn from the original (non-augmented) reviews.
result = model.evaluate(x=df_test_reviews, y=df_y_test)



In [55]:
# The model is compiled with a loss plus metrics=['accuracy'], so index 1 of
# the evaluate result is the accuracy (index 0 is the loss).
# NOTE(review): `result` is shared by all evaluation cells; this prints the
# most recently executed evaluation -- confirm the cells were run in order.
print(f'Accuracy: {result[1]}')

Accuracy: 0.9226974248886108


## Eval test relativo a tutti i sinonimi

In [47]:
# Score the model on every row of the synonym-augmented dataset.
result = model.evaluate(x=data_sinonimi.review, y=sinonimi_one_hot)



In [49]:
# The model is compiled with a loss plus metrics=['accuracy'], so index 1 of
# the evaluate result is the accuracy (index 0 is the loss).
# NOTE(review): `result` is shared by all evaluation cells; this prints the
# most recently executed evaluation -- confirm the cells were run in order.
print(f'Accuracy: {result[1]}')

Accuracy: 0.9007391929626465


## Eval test relativo a tutti gli embedding

In [None]:
# Score the model on every row of the embedding-augmented dataset.
result = model.evaluate(x=data_embedding.review, y=embedding_one_hot)

In [53]:
# The model is compiled with a loss plus metrics=['accuracy'], so index 1 of
# the evaluate result is the accuracy (index 0 is the loss).
# NOTE(review): `result` is shared by all evaluation cells; this prints the
# most recently executed evaluation -- confirm the cells were run in order.
print(f'Accuracy: {result[1]}')

Accuracy: 0.9147012233734131
