In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf

In [2]:
# Load the raw reviews CSV into a DataFrame.
# NOTE(review): encoding='latin-1' is presumably required because the file is
# not valid UTF-8 — confirm against the data source.
data = pd.read_csv('DisneylandReviews.csv', encoding='latin-1')

In [3]:
# Display the loaded frame (the notebook elides the middle rows in its output).
data

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


Preprocessing

In [7]:
def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
    """Encode texts as token-id sequences and pad them to a common width.

    Args:
        texts: Collection of strings to encode.
        tokenizer: Object exposing ``texts_to_sequences(texts)``, which must
            return one sequence of token ids per input text.
        train: When truthy, any ``max_seq_length`` passed in is ignored and the
            width is recomputed as the length of the longest encoded sequence
            in ``texts``.
        max_seq_length: Target width used when ``train`` is falsy. It is
            forwarded unchanged to ``pad_sequences`` as ``maxlen`` (including
            when it is ``None``).

    Returns:
        Whatever ``pad_sequences(sequences, maxlen=..., padding='post')``
        returns for the encoded sequences; padding/truncation to the target
        width is delegated entirely to ``pad_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(texts)

    if train:
        # Built-in max with default=0 keeps an empty batch from raising
        # (np.max on an empty list raises ValueError) and yields a plain int.
        max_seq_length = max(map(len, sequences), default=0)

    return pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [8]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Split the reviews into train/test sets and encode the text as padded token ids.

    Args:
        df: DataFrame containing at least the ``'Review_Text'`` and ``'Rating'``
            columns. It is only read, never modified.
        train_size: Passed to ``train_test_split`` as ``train_size``
            (default 0.7, the value previously hard-coded).
        random_state: Seed passed to ``train_test_split`` (default 1, the value
            previously hard-coded).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, tokenizer)`` where the ``X``
        entries are the padded outputs of ``get_sequences``, the ``y`` entries
        are the corresponding slices of ``df['Rating']``, and ``tokenizer`` is
        the ``Tokenizer`` fitted on the training texts.

    Side effects:
        Prints the vocabulary length.
    """
    # Only the review text (feature) and the rating (target) are used.
    y = df['Rating']
    X = df['Review_Text']

    # Shuffled train-test split; reproducible for a fixed random_state.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fit on the training texts only, so the vocabulary is not derived from test data.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    # NOTE(review): the +1 presumably accounts for index 0, which the padded
    # sequences use as filler — confirm against the Tokenizer indexing scheme.
    print("Vocab length:", len(tokenizer.word_index) + 1)

    # Training width is derived from the data; the test set reuses that width
    # so both arrays have the same number of columns.
    X_train = get_sequences(X_train, tokenizer, train=True)
    X_test = get_sequences(X_test, tokenizer, train=False, max_seq_length=X_train.shape[1])

    return X_train, X_test, y_train, y_test, tokenizer

In [9]:
# Run the preprocessing pipeline on the full dataset.
# `t` is the tokenizer fitted on the training texts (fifth value returned by preprocess_inputs).
X_train, X_test, y_train, y_test, t = preprocess_inputs(data)

Vocab length: 37846


In [10]:
# Inspect the padded training sequences returned by preprocess_inputs.
X_train

array([[ 12, 154, 159, ...,   0,   0,   0],
       [330,   3,  38, ...,   0,   0,   0],
       [  6, 168, 193, ...,   0,   0,   0],
       ...,
       [ 26,   7, 251, ...,   0,   0,   0],
       [ 12,  28, 989, ...,   0,   0,   0],
       [ 68,  23,  68, ...,   0,   0,   0]])

In [11]:
# Inspect the training labels (the 'Rating' values selected by the split).
y_train

20780    5
791      3
19394    5
32755    5
38577    3
        ..
7813     3
32511    5
5192     5
12172    5
33003    5
Name: Rating, Length: 29859, dtype: int64

Training

In [12]:
# X_train.shape

(29859, 3958)

In [13]:
# inputs = tf.keras.Input(shape=(3958,))
# x = tf.keras.layers.Embedding(
#     input_dim=37846,
#     output_dim=64
# )(inputs)