# Imports


In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.preprocessing import sequence
from tensorflow.python.keras import Sequential
from keras.models import load_model
from tensorflow.python.keras.layers import Dense, Embedding, GlobalAveragePooling1D

import pickle

##from Helpers_NN import add_sum_suffix, text_cleanup, reverse_encode, add_one_argmax_score, conf_matrix

In [63]:
# Load the review dataset into a DataFrame.
# NOTE(review): absolute path (looks like a Colab runtime location) — adjust when running elsewhere.
df = pd.read_csv('/content/neural_network_data.csv')

In [65]:
# Impute missing ratings with the column mean.
# NOTE(review): the mean is generally fractional and the later cast to int64 drops the
# fractional part — confirm that is the intended label for unrated reviews.
df['rating'] = df['rating'].fillna((df['rating'].mean()))

In [66]:
# Cast ratings to 64-bit integers so they can serve as discrete class labels.
df['rating'] = df['rating'].astype(np.int64)

In [68]:
# Force every review to a Python string so the text helpers can call str methods on it.
# NOTE(review): missing reviews presumably become the literal text 'nan' instead of
# staying null — confirm that is acceptable.
df['review'] = df['review'].astype(str)

In [69]:
# Sanity check: column dtypes and non-null counts after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35563 entries, 0 to 35562
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  35563 non-null  int64 
 1   review  35563 non-null  object
dtypes: int64(1), object(1)
memory usage: 555.8+ KB


# One-Hot Encoding the Rating Column

In [70]:
# One-hot encode the integer rating into one indicator column per distinct rating value.
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder output is densified with .toarray(); the resulting frame gets default
# integer column labels (0, 1, ...), which a later cell renames.
enc_df = pd.DataFrame(enc.fit_transform(df[['rating']]).toarray())

# join aligns on the index.
# NOTE(review): this cell reassigns df, so it is not safe to re-run without reloading the data.
df = df.join(enc_df)
df.head()

Unnamed: 0,rating,review,0,1,2,3,4
0,1,Its hard to find words that can describe this ...,1.0,0.0,0.0,0.0,0.0
1,1,"Absolutely terrible. Cracked ceiling, tiny roo...",1.0,0.0,0.0,0.0,0.0
2,1,Very disappointed firstly i checked in online ...,1.0,0.0,0.0,0.0,0.0
3,1,First of all we arrived wanting to park at the...,1.0,0.0,0.0,0.0,0.0
4,1,"Hello, I currently staying @ Hilton Metropole...",1.0,0.0,0.0,0.0,0.0


In [71]:
# Rename the positional one-hot columns 0..4 to rating_1..rating_5.
df = df.rename(columns={position: f"rating_{position + 1}" for position in range(5)})

In [72]:
# Preview the frame to confirm the one-hot columns carry their new names.
df.head()

Unnamed: 0,rating,review,rating_1,rating_2,rating_3,rating_4,rating_5
0,1,Its hard to find words that can describe this ...,1.0,0.0,0.0,0.0,0.0
1,1,"Absolutely terrible. Cracked ceiling, tiny roo...",1.0,0.0,0.0,0.0,0.0
2,1,Very disappointed firstly i checked in online ...,1.0,0.0,0.0,0.0,0.0
3,1,First of all we arrived wanting to park at the...,1.0,0.0,0.0,0.0,0.0
4,1,"Hello, I currently staying @ Hilton Metropole...",1.0,0.0,0.0,0.0,0.0


# Train Test Split

In [73]:
# Hold out 20% of the rows for testing; features are the raw review text and targets are
# the five one-hot rating columns. random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(df[['review']], 
                                                    df[['rating_1','rating_2','rating_3','rating_4','rating_5']], 
                                                    test_size=.2, 
                                                    random_state=42)

In [34]:
# Persist the test labels.
# NOTE(review): written with index=False while the other three split exports keep the
# index — confirm the mismatch is intentional.
y_test.to_csv('/content/y_test_neural.csv',index=False)

In [37]:
# Persist the test features (the original row index is written as the first column).
x_test.to_csv('/content/x_test_neural.csv')

In [38]:
# Persist the training labels and features (the original row index is written as the first column).
y_train.to_csv('/content/y_train_neural.csv')
x_train.to_csv('/content/x_train_neural.csv')

# Add a `_sum` Suffix to Each Review Token to Distinguish Suffixed Tokens From Plain Ones

In [74]:
# NOTE(review): this import sits mid-notebook rather than in the imports cell at the top.
from nltk.tokenize import RegexpTokenizer
# The pattern matches runs of ASCII letters only, so digits and punctuation are dropped
# and contractions are split (e.g. "don't" -> "don", "t").
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')


def add_sum_suffix(text):
    """Lower-case ``text``, tokenize it, and tag every token with ``_sum``.

    Tokens come from the module-level ``tokenizer``. Each token is emitted as
    ``<token>_sum`` followed by one space, so a non-empty result ends with a
    trailing space and text without any token yields ``''``.
    """
    words = tokenizer.tokenize(text.lower())
    return ''.join(f'{w}_sum ' for w in words)

In [76]:
# Add a `review_suffixed` column to both splits: the review text with every token tagged `_sum`.
x_train['review_suffixed'] = x_train['review'].apply(add_sum_suffix)
x_test['review_suffixed'] = x_test['review'].apply(add_sum_suffix)

In [77]:
# Preview the training split with the new `review_suffixed` column.
x_train.head()

Unnamed: 0,review,review_suffixed
4851,Hotel is what it is. Clean and functional and ...,hotel_sum is_sum what_sum it_sum is_sum clean_...
1087,Just checked in for a 4 day trip to London (Bu...,just_sum checked_sum in_sum for_sum a_sum day_...
5865,We don’t usually report on major chain hotels ...,we_sum don_sum t_sum usually_sum report_sum on...
29083,money grabbing charge credit card checkin chec...,money_sum grabbing_sum charge_sum credit_sum c...
3954,"OK hotel, but with the Hilton brand you would ...",ok_sum hotel_sum but_sum with_sum the_sum hilt...


# Removing Punctuation and Tokenizing the Review Column

In [78]:
def text_cleanup(text):
    """Return ``text`` lower-cased and reduced to its tokens, space-separated.

    Tokens come from the module-level ``tokenizer``. Every token is followed
    by one space, so a non-empty result ends with a trailing space and text
    without any token yields ``''``.
    """
    pieces = [token + ' ' for token in tokenizer.tokenize(text.lower())]
    return ''.join(pieces)

In [79]:
# Add a `review_cleaned` column to both splits: lower-cased review text reduced to its tokens.
x_train['review_cleaned'] = x_train['review'].apply(text_cleanup)
x_test['review_cleaned'] = x_test['review'].apply(text_cleanup)

In [80]:
# Preview the training split with the new `review_cleaned` column.
x_train.head()

Unnamed: 0,review,review_suffixed,review_cleaned
4851,Hotel is what it is. Clean and functional and ...,hotel_sum is_sum what_sum it_sum is_sum clean_...,hotel is what it is clean and functional and f...
1087,Just checked in for a 4 day trip to London (Bu...,just_sum checked_sum in_sum for_sum a_sum day_...,just checked in for a day trip to london busin...
5865,We don’t usually report on major chain hotels ...,we_sum don_sum t_sum usually_sum report_sum on...,we don t usually report on major chain hotels ...
29083,money grabbing charge credit card checkin chec...,money_sum grabbing_sum charge_sum credit_sum c...,money grabbing charge credit card checkin chec...
3954,"OK hotel, but with the Hilton brand you would ...",ok_sum hotel_sum but_sum with_sum the_sum hilt...,ok hotel but with the hilton brand you would e...


# Creating a Dictionary With Words That Appear in Reviews and an Index

In [81]:
# Flatten every cleaned training review into a single list of lower-cased words.
flat_review = " ".join(x_train['review_cleaned'].values).lower().split()

In [82]:
# Total number of word occurrences collected from the training reviews.
len(flat_review)

2985169

In [86]:
# Collect the words the vocabulary is built from. Both token forms are needed: the plain
# words of `review_cleaned` and the `_sum`-tagged words of `review_suffixed` are each
# indexed later, and any word missing from the vocabulary is mapped to <UNK>.
# Building the list from the suffixed column alone would leave every plain word unknown.
# Only the training split is used, so test-only words still fall back to <UNK>.
flat_review = (
    " ".join(x_train['review_cleaned'].values).lower().split()
    + " ".join(x_train['review_suffixed'].values).lower().split()
)

In [88]:
# De-duplicate the words while keeping first-seen order. dict keys are unique and
# preserve insertion order, so this yields the same list as a manual
# "append if not already present" loop, but in linear time rather than re-scanning the
# growing list for every one of the millions of words.
unique_list = list(dict.fromkeys(flat_review))

In [None]:
# Vocabulary size: number of distinct words collected.
len(unique_list)

In [None]:
# Map every vocabulary word to its position in unique_list.
word_index_dict = {word: position for position, word in enumerate(unique_list)}

In [None]:
# Shift every word index up by 4 so that indices 0-3 are free for the special tokens.
# Not idempotent: re-running this cell shifts the word indices by another 4.
word_index_dict = {word: idx + 4 for word, idx in word_index_dict.items()}
word_index_dict.update({'<PAD>': 0, '<START>': 1, '<UNK>': 2, '<UNUSED>': 3})

# Indexing Words in Reviews Using Dictionary

In [None]:
def index_review_words(text):
    """Encode ``text`` as a list of vocabulary indices.

    ``text`` is lower-cased and split on whitespace; each word is looked up in
    the module-level ``word_index_dict``. Words that are not in the dictionary
    are encoded with the index stored under ``'<UNK>'``.
    """
    return [
        word_index_dict[token] if token in word_index_dict else word_index_dict['<UNK>']
        for token in text.lower().split()
    ]

In [None]:
# Encode the cleaned review text of both splits as lists of vocabulary indices.
x_train['preprocessed_review'] = x_train['review_cleaned'].apply(index_review_words)
x_test['preprocessed_review'] = x_test['review_cleaned'].apply(index_review_words)

In [None]:
# Encode the `_sum`-tagged review text of both splits as lists of vocabulary indices.
# The source column is `review_suffixed` (the column created by add_sum_suffix); no
# `review_summary_suffixed` column exists in these frames, so reading it raised KeyError.
# The output column name is kept unchanged for any later cell that reads it.
x_train['preprocessed_review_summary'] = x_train['review_suffixed'].apply(index_review_words)
x_test['preprocessed_review_summary'] = x_test['review_suffixed'].apply(index_review_words)

In [None]:
# Preview the training split with the indexed-review columns added.
x_train.head()

# 5.8 Combining Indexed Review Summary and Indexed Review Into a Single Column Called All Preprocessed Review