# In [None]:
#Importing the required libraries for preprocessing and sequencing:
import numpy as np  
import pandas as pd 
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split


# Load the review exports, one CSV file per year (2017, 2018, 2019).
df, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in ("./data2017.csv", "./data2018.csv", "./data2019.csv")
)

# Keep only the rows whose 'Review' cell is not the literal
# 'Quotation forthcoming.' placeholder text.
df, df2, df3 = (
    frame[frame['Review'] != 'Quotation forthcoming.']
    for frame in (df, df2, df3)
)

# Convert each filtered frame to a NumPy ndarray, then stack the three years
# row-wise (axis 0) into the single array `data`.
data, data2, data3 = (frame.to_numpy() for frame in (df, df2, df3))
data = np.concatenate([data, data2, data3], axis=0)


# Preparing the data for supervised processing:
# column 0 is used as the model input (the review text) and column 1 as the
# numeric target.
data, results = data[:, 0], data[:, 1]
# Hold out 20% of the rows as a test set. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(data, results, test_size=.2)

# Characters that survive cleaning: ASCII letters and the space.
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def _clean_review(text):
    """Return *text* reduced to lower-case ASCII letters and spaces.

    Every character outside ``whitelist`` is dropped first, then the result
    is lower-cased. A non-string cell (for example the NaN that pandas
    produces for an empty CSV field) is treated as an empty review instead of
    raising ``TypeError`` when it is iterated.
    """
    if not isinstance(text, str):
        return ''
    return ''.join(filter(whitelist.__contains__, text)).lower()


# Apply the same cleaning to both splits in a single pass each.
X_train = np.array([_clean_review(x) for x in X_train])
X_test = np.array([_clean_review(x) for x in X_test])

# Tokenizing: build the word index from the training reviews only, then use
# that same index to turn both splits into lists of integer word ids.
t = Tokenizer()
t.fit_on_texts(X_train)
X_train, X_test = (t.texts_to_sequences(texts) for texts in (X_train, X_test))
# One more than the number of indexed words; this value is passed to the
# Embedding layer as its input dimension further down.
vocab_size = 1 + len(t.word_index)


# Padding and scaling (no training happens in this step).
# Find the longest tokenised review across both splits ...
train_lengths = list(map(len, X_train))
test_lengths = list(map(len, X_test))
max_sentence_len = max(np.max(lengths) for lengths in (train_lengths, test_lengths))

# ... and pad every review at the end ('post') to that common length, then
# store the id matrices as float32.
X_train, X_test = (
    np.asarray(
        sequence.pad_sequences(seqs, maxlen=max_sentence_len, padding='post')
    ).astype('float32')
    for seqs in (X_train, X_test)
)

# Targets are converted to float32 and divided by 100.
# NOTE(review): presumably the raw scores are on a 0-100 scale - confirm.
y_train, y_test = (
    np.asarray(scores).astype('float32') / 100
    for scores in (y_train, y_test)
)


# Building the model: a small feed-forward regressor on top of an embedding.
model = tf.keras.Sequential([
    # 1) Map each word id to a 32-dimensional vector. The layer takes an
    #    integer matrix of size (batch, input_length); vocab_size is its
    #    input dimension.
    tf.keras.layers.Embedding(vocab_size, 32, input_length=X_train.shape[1]),
    # Flatten the per-word vectors into one feature vector per review.
    tf.keras.layers.Flatten(),
    # 2) ReLU hidden layer followed by dropout of 0.3.
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(.3),
    # 3) Sigmoid hidden layer followed by dropout of 0.2.
    tf.keras.layers.Dense(8, activation='sigmoid'),
    tf.keras.layers.Dropout(.2),
    # Single-unit output with ReLU activation.
    # NOTE(review): a ReLU output unit can get stuck at 0 - confirm this
    # activation is intended for the scaled score target.
    tf.keras.layers.Dense(1, activation='relu'),
])
# Only the loss is specified; no optimizer is passed, so Keras' default is used.
model.compile(loss='mean_squared_logarithmic_error')
# Fit on the training split for 7 epochs.
model.fit(X_train, y_train, epochs=7)

# Display the loss on the held-out test split.
print(model.evaluate(X_test, y_test))

# Trying the model on a few hand-written reviews (one of them is empty).
test = [
    'i hate this game and will not play it again the animations are bad and the storyline is boring',
    'this game is amazing the animations are so clean and wonderful incredible experience',
    '',
    'this is the best game i have ever played amazing animations incredible storyline super nice gameplay everything is so good',
]
# Encode with the fitted tokenizer and pad exactly like the training data.
test = sequence.pad_sequences(
    t.texts_to_sequences(test),
    maxlen=max_sentence_len,
    padding='post',
)
# Display the predictions (same scale as the targets, i.e. score / 100),
# followed by the first encoded training review.
print(model.predict(test))
print([X_train[0]])

