In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.porter import PorterStemmer

In [2]:
train = pd.read_csv("http://www.i3s.unice.fr/~riveill/dataset/Amazon_Unlocked_Mobile/train.csv.gz")
val = pd.read_csv("http://www.i3s.unice.fr/~riveill/dataset/Amazon_Unlocked_Mobile/val.csv.gz")
test = pd.read_csv("http://www.i3s.unice.fr/~riveill/dataset/Amazon_Unlocked_Mobile/test.csv.gz")

In [3]:
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)
print("Val shape : ",val.shape)

Train shape :  (5000, 6)
Test shape :  (1000, 6)
Val shape :  (1000, 6)


In [4]:
# Construct X_train and y_train
X_train = train['Reviews'].fillna("")
X_train = np.array(train['Reviews'].fillna("")).reshape(-1,1)
y_train = train['Rating']
y_train = np.array(train['Rating']).reshape(-1,1)
X_train.shape, y_train.shape

# Construct X_test and y_test
X_test = test['Reviews'].fillna("")
X_test = np.array(test['Reviews']).reshape(-1,1)
y_test = test['Rating']
y_test = np.array(test['Rating']).reshape(-1,1)
X_test.shape, y_test.shape

# Construct X_val and y_val
X_val = val['Reviews'].fillna("")
X_val = np.array(val['Reviews']).reshape(-1,1)
y_val = val['Rating']
y_val = np.array(val['Rating']).reshape(-1,1)
X_val.shape, y_val.shape

((1000, 1), (1000, 1))

In [5]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
y_train_encoded = ohe.fit_transform(y_train)
y_val_encoded = ohe.transform(y_val)
y_test_encoded = ohe.transform(y_test)



In [6]:
#nltk.download('stopwords')
stops = set(stopwords.words('english'))

In [7]:
def del_stops(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stops:
            filtered_sentence.append(w)
    result=' '.join(filtered_sentence)
    return result

In [8]:
# Process the data
import unicodedata

def step1(sent):
    # sent = on sentence in a language
    def unicode_to_ascii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    sent = unicode_to_ascii(sent.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    sent = re.sub(r'[" "]+', " ", sent)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sent = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sent)
    
    #removing emojis and non latin symbols (chinese, russian etc...)
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    sent= re.sub(emoj, '', sent)
    

    return '<start> ' + sent.strip() + ' <end>' # and start and stop tag

In [9]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer = WordNetLemmatizer()
import re
#Those are the cleaning functions we already used in previous machine learning labs

def clean_text(text):
    text=text.lower() #lowercase the text
    #text=re.sub(r'[^\w\s]', '', text) #remove punctuation  Voir si besoin de ça
    text=del_stops(text) #delete stop words
    text=lemmatizer.lemmatize(text)
    text=step1(text)
    return text

In [10]:
X_train_p = np.array([clean_text(r) for r in X_train.flatten()])
X_test_p = np.array([clean_text(r) for r in X_test.flatten()])
X_val_p = np.array([clean_text(r) for r in X_val.flatten().astype(str)]) #Add of astype(str) because one of the element was a float and it the clean_text(r)

In [11]:
# number of vocab to keep
max_vocab = 20000
# length of sequence that will generate
max_len = 15

tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train_p)

In [12]:
sequences = tokenizer.texts_to_sequences(X_train_p)
word_index = tokenizer.word_index
len(word_index) #number of words tokenized

8444

In [13]:
def babysit(history):
    keys = [key for key in history.keys() if key[:4] != "val_"]
    fig, ax = plt.subplots(nrows=1, ncols=len(keys), figsize=(18, 5))
    for i, key in enumerate(keys):
        ax[i].plot(history[key], label=key)
        if "val_"+key in history.keys():
            ax[i].plot(history["val_"+key], label="val_"+key)
        ax[i].legend()
        ax[i].set_title(key)
    plt.show()

In [14]:
BATCH_SIZE = 32
embedding_dim = 50
latent_dim = 64
dropout=0.3
epochs=1000
patience=5
attention_func = 'MLP' # others : 'dot', 'bilinear', 'MLP'
max_len=200

In [15]:
def voc(lang):
    # a list of sentences in the same language
    lengths = [len(txt.split()) for txt in lang]
    vocab = set([w for txt in lang for w in txt.split()])  

    return max(lengths), list(vocab), len(vocab)+2 # for padding and OOV
max_length_reviews, vocab_reviews, vocab_size_reviews = voc(X_train_p)

In [16]:
X_train_p[0:2]

array(['<start> love ! ! ! absolutely love ! ! <end>',
       '<start> love blu phones ! second one year anything wrong blu studio . hd . opted upgrade phone slightly smaller screen better battery life . price phones go wrong . . setup breeze . customization easy . colors vibrant . phone light even rugged case . battery life better phone ever had . screen responsive touch unlike phones . i issue volume music playback reviewers noted . great phone little price . everyone give blu try . <end>'],
      dtype='<U3643')

In [17]:
def build_rnn_model():
    sequences = layers.Input(shape=(100,))
    embedded = layers.Embedding(vocab_size_reviews, 64)(sequences)
    x = layers.GRU(128, return_sequences=True)(embedded)
    x = layers.GRU(128)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
rnn_model = build_rnn_model()

In [18]:
rnn_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 64)           540928    
                                                                 
 gru (GRU)                   (None, 100, 128)          74496     
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 100)               3300      
                                                                 
 dense_2 (Dense)             (None, 1)                 101   

In [39]:
# earlyStopping=EarlyStopping(patience=3,monitor='binary_accuracy',min_delta=0.0001,verbose=1)
# callbackslist=[earlyStopping]
# history = rnn_model.fit(X_train_p,y_train,batch_size=128,epochs=2,verbose=1,callbacks=callbackslist, validation_data=(X_val_p,y_val))

In [43]:
nb_classes = y_train.shape[1]
vocab_size = 10 ** 4  # Maximum vocab size -- adjust with the size of the vocabulary
embedding_size = 20  # Embedding size (usually <= 300)
recurrent_size = 64  # Recurrent size
hidden_size = recurrent_size // 4  # Hidden layer
dropout_rate = 0.2  # Dropout rate for regularization (usually between 0.1 and 0.25)
max_len = 150  # Sequence length to pad the outputs to (deduced from the length distribution study)
learning_rate = 0.0075

In [44]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, TimeDistributed, Activation

# Input layer
inputs = Input(shape=(None,))

# Embedding layer
embedded = Embedding(input_dim=vocab_size_reviews, output_dim=embedding_dim, input_length=max_len)(inputs)

# GRU layer with attention
gru, state_h = GRU(units=hidden_size, return_state=True)(embedded)

# Attention mechanism
attention = Dense(1, activation='tanh')(state_h)
attention = Activation('softmax')(attention)
context = keras.layers.dot([attention, gru], axes=-1)

# Concatenation of context vector and GRU output
merged = Concatenate()([context, state_h])

# Output layer
output = Dense(1, activation='sigmoid')(merged)

# Model definition
model = keras.Model(inputs=inputs, outputs=output)

# Teacher forcing implementation
def generate_sequence(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat = model.predict(encoded, verbose=0)
        yhat = np.argmax(yhat)
        word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                break
        in_text += ' ' + word
    return in_text


ValueError: Incompatible input shapes: axis values 1 (at axis 1) != 16 (at axis 1). Full input shapes: (None, 1), (None, 16)

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
