<a href="https://colab.research.google.com/github/PyBeginner1/MLProjects/blob/main/DisnleylandReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf

In [11]:
# Load the raw review export; the file is decoded as Latin-1.
data = pd.read_csv(
    filepath_or_buffer='/content/DisneylandReviews.csv',
    encoding='latin-1',
)

In [12]:
# Preview the loaded reviews (the notebook display elides the middle rows).
data

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [13]:
# Count missing values per column.
# NOTE(review): Year_Month holds the literal string 'missing' in some rows
# (visible in the preview), which isna() does not count as missing.
data.isna().sum()

Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64

In [14]:
# Review_ID is not used by any later step, so drop it.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns='Review_ID', errors='ignore')

In [15]:
# Show the remaining columns, their non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Rating             42656 non-null  int64 
 1   Year_Month         42656 non-null  object
 2   Reviewer_Location  42656 non-null  object
 3   Review_Text        42656 non-null  object
 4   Branch             42656 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.6+ MB


In [16]:
def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
    """Encode texts as integer sequences and pad them to a common width.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        train: When truthy, the padded width is taken from the longest
            sequence in ``texts`` and any ``max_seq_length`` passed in is
            ignored.
        max_seq_length: Width handed to ``pad_sequences`` as ``maxlen`` when
            ``train`` is falsy.

    Returns:
        The value returned by ``pad_sequences`` with ``padding='post'``.
    """
    sequences = tokenizer.texts_to_sequences(texts)

    if train:
        # Built-in max with default=0 avoids the ValueError that np.max raises
        # when `texts` is empty.
        max_seq_length = max(map(len, sequences), default=0)

    return pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [17]:
def preprocess_inputs(df):
  """Split the reviews into train/test sets and encode the text as padded sequences.

  Args:
    df: DataFrame with a 'Review_Text' column (model input) and a 'Rating'
      column (target). The frame is copied, not modified.

  Returns:
    Tuple ``(X_train, X_test, Y_train, Y_test, tokenizer)`` where the X values
    are padded sequence arrays from ``get_sequences``, the Y values are the
    matching ratings, and ``tokenizer`` is the Tokenizer fitted on the
    training texts.
  """
  frame = df.copy()

  reviews, ratings = frame['Review_Text'], frame['Rating']

  # 70/30 shuffled split with a fixed seed so the partition is repeatable
  X_train, X_test, Y_train, Y_test = train_test_split(
      reviews,
      ratings,
      train_size=0.7,
      shuffle=True,
      random_state=1,
  )

  # Build the vocabulary from the training texts only
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(X_train)
  vocab_length = len(tokenizer.word_index) + 1
  print('Vocab Length:', vocab_length)

  # The training set determines the padded width; the test set reuses it
  X_train = get_sequences(X_train, tokenizer, train=True)
  padded_width = X_train.shape[1]
  X_test = get_sequences(X_test, tokenizer, train=False, max_seq_length=padded_width)

  return X_train, X_test, Y_train, Y_test, tokenizer

In [18]:
# Run the preprocessing pipeline; `t` is the tokenizer fitted on the training texts.
X_train, X_test, Y_train, Y_test, t = preprocess_inputs(data)

Vocab Length: 37846


In [19]:
# Padded training sequences: token ids followed by trailing zeros (padding='post').
X_train

array([[ 12, 154, 159, ...,   0,   0,   0],
       [330,   3,  38, ...,   0,   0,   0],
       [  6, 168, 193, ...,   0,   0,   0],
       ...,
       [ 26,   7, 251, ...,   0,   0,   0],
       [ 12,  28, 989, ...,   0,   0,   0],
       [ 68,  23,  68, ...,   0,   0,   0]], dtype=int32)

In [20]:
# Number of distinct tokens in the fitted tokenizer; the "Vocab Length"
# printed during preprocessing is this value + 1.
len(t.word_index)

37845

In [21]:
# (number of training reviews, padded sequence length)
X_train.shape

(29859, 3958)

In [22]:
# Derive the model dimensions from the preprocessed data instead of hard-coding
# the numbers from one particular run: a different dataset or split would
# otherwise break the input shape or send out-of-range indices to the embedding.
seq_length = X_train.shape[1]
vocab_size = len(t.word_index) + 1  # +1 because token ids start at 1 and 0 is the padding value

# Embedding -> Flatten -> two hidden Dense layers -> single linear output,
# i.e. the rating is predicted as a continuous value (regression).
inputs = tf.keras.Input(shape=(seq_length,))
x = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=64
)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='mse'
)

# Hold out 20% of the training data for validation; stop once val_loss has not
# improved for 3 epochs and roll back to the best weights seen.
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [23]:
# Inspect the symbolic input tensor of the model.
inputs

<KerasTensor: shape=(None, 3958) dtype=float32 (created by layer 'input_3')>

In [24]:
# `x` holds the last value assigned while building the model: the output of
# the second Dense(128) layer.
x

<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'dense_6')>

In [25]:
# Repeated inspection of `x`: same expression and same output as the preceding cell.
x

<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'dense_6')>

In [31]:
# Predict ratings for the held-out reviews and remove the length-1 axes
# from the model output.
raw_pred = model.predict(X_test)
pred = np.squeeze(raw_pred)

# Root-mean-squared error between true and predicted ratings
squared_error = (Y_test - pred) ** 2
rmse = np.sqrt(np.mean(squared_error))
print('RMSE: {:.2f}'.format(rmse))

RMSE: 0.71


In [32]:
# Coefficient of determination: R^2 = 1 - SS_res / SS_tot
ss_res = np.sum((Y_test - pred) ** 2)           # residual sum of squares
ss_tot = np.sum((Y_test - Y_test.mean()) ** 2)  # total sum of squares around the mean
r2_score = 1 - (ss_res / ss_tot)

print('R2 Score: {:.2f}'.format(r2_score))

R2 Score: 0.55
