In [None]:
# Fix every source of randomness in one place so repeated runs are comparable.
# Each generator is seeded exactly once (seeding the same generator twice through
# different import aliases has no additional effect).
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values set to %d." % seed_value)

In [None]:
# Kaggle notebook boilerplate.
# The environment is defined by the kaggle/python docker image:
# https://github.com/kaggle/docker-python
#
# Input data files live under /kaggle/input; the loop below walks that directory
# tree and prints the full path of every file it finds.

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Anything written to the current directory is saved as notebook output.

In [None]:
# Load the labeled training data (tab-separated, first row is the header).
import pandas as pd

# quoting=3 is csv.QUOTE_NONE: quote characters in the file are kept as literal text.
train = pd.read_csv(
    "/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
print("Train data imported")

In [None]:
# Show the training frame's (rows, columns) shape and its column names.
print(train.shape)
print(train.columns.values)


In [None]:
# Print the raw text of the review stored under index label 0.
print (train["review"][0])

In [None]:
# Print the training DataFrame for a quick visual check.
print(train)

In [None]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup

# Parse a single movie review. The parser is named explicitly: without it,
# BeautifulSoup picks whichever parser happens to be installed (and warns about
# it), so the extracted text could differ from one environment to another.
example1 = BeautifulSoup(train["review"][0], "html.parser")

# Print the review with its HTML markup stripped by get_text(); compare it with
# the raw review printed in the earlier cell.
print(example1.get_text())

In [None]:
import re

# Replace every character that is not an ASCII letter with a single space.
review_text = example1.get_text()
letters_only = re.sub("[^a-zA-Z]", " ", review_text)
print(letters_only)

In [None]:
# Lower-case the cleaned text, then split it on whitespace into a list of words.
words = letters_only.lower().split()
print(words)

In [None]:
import nltk
#nltk.download()  # Download text data sets, including stop words

In [None]:
# Import NLTK's stop-word corpus reader and print its English stop-word list.
# (The corpus must already be available; see the commented-out nltk.download() above.)
from nltk.corpus import stopwords # Import the stop word list
print (stopwords.words("english"))

In [None]:
# Remove stop words from "words".
# Load the English stop-word list once and keep it as a set: calling
# stopwords.words("english") inside the comprehension would rebuild the list for
# every single word, and membership tests on a set are faster than on a list.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(words)

In [None]:
# try to do stemming to group similar terms together
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
# Create one instance of each stemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Stem the same sample words with both stemmers so the outputs can be compared
sample_words = ("cats", "trouble", "troubling", "troubled")
for heading, stemmer in (("Porter Stemmer", porter), ("Lancaster Stemmer", lancaster)):
    print(heading)
    for sample in sample_words:
        print(stemmer.stem(sample))

# A stemmer treats its argument as one word, so a whole sentence cannot be
# stemmed in a single call
print(lancaster.stem("troubled trouble troubling"))

In [None]:
# Stemming works one word at a time, so apply the Porter stemmer to each word
# of the example review individually.
word_stem = [porter.stem(word) for word in words]
print(word_stem)

In [None]:
def review_to_words( raw_review ):
    """Convert one raw movie review into a cleaned, stemmed string of words.

    The review is stripped of HTML markup, every non-letter character is replaced
    by a space, the text is lower-cased and split on whitespace, NLTK's English
    stop words are dropped, and each remaining word is Porter-stemmed.

    Parameters
    ----------
    raw_review : str
        A single raw movie review (may contain HTML markup).

    Returns
    -------
    str
        The stemmed words joined by single spaces (empty if no word survives).

    Notes
    -----
    Uses the notebook-level names `BeautifulSoup`, `re`, `stopwords` and the
    `porter` stemmer instance created in an earlier cell.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not depend
    #    on which optional parsers are installed (and BeautifulSoup does not warn).
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2-3. Replace non-letters with spaces, lower-case, split into words
    words = re.sub("[^a-zA-Z]", " ", review_text).lower().split()

    # 4. Stop words as a set (fast membership tests). The set is cached on the
    #    function object so the corpus is loaded once instead of once per review.
    stops = getattr(review_to_words, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        review_to_words._stops = stops

    # 5-6. Drop stop words, stem what is left, and join back into one string
    return " ".join(porter.stem(w) for w in words if w not in stops)

In [None]:
# Run the full cleaning pipeline on the first training review and show the result.
clean_review = review_to_words( train["review"][0] )
print (clean_review)

In [None]:
# Number of reviews, taken from the size of the "review" column
num_reviews = train["review"].size

print ("Cleaning and parsing the training set movie reviews...\n")

# Cleaned reviews, in the same order as the rows of `train`
clean_train_reviews = []

# Iterate over the column's values directly instead of looking each row up by
# label, so the loop does not depend on the frame's index labels.
for position, raw_review in enumerate(train["review"], start=1):
    # Print a progress message after every 5000 reviews
    if position % 5000 == 0:
        print ("Review %d of %d\n" % ( position, num_reviews ))
    clean_train_reviews.append(review_to_words(raw_review))

In [None]:
# Show the first cleaned training review.
print(clean_train_reviews[0])

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Announce the step before the work starts (the message used to appear only
# after the bag of words had already been built).
print ("Creating the bag of words...\n")

# scikit-learn's bag-of-words tool, limited to a vocabulary of 5000 features.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

# fit_transform() learns the vocabulary from the cleaned training reviews (a list
# of strings) and turns them into feature vectors in one step; toarray() then
# converts the result into a dense NumPy array.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
# Show the shape of the dense training feature array.
print (train_data_features.shape)

In [None]:
# Take a look at the words in the vocabulary learned by the vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm which one the installed version provides.
vocab = vectorizer.get_feature_names()
print (vocab)

In [None]:
import numpy as np

# Column-wise totals: how often each vocabulary word occurs across the training set
dist = train_data_features.sum(axis=0)

# Print one "count word" line per vocabulary entry
for count, tag in zip(dist, vocab):
    print(count, tag)

In [None]:
# Load the test data (tab-separated, first row is the header; quoting=3 is
# csv.QUOTE_NONE, so quote characters are kept as literal text).
test = pd.read_csv(
    "/kaggle/input/word2vec-nlp-tutorial/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Check the size (expected: 25,000 rows and 2 columns) and look at the frame
print(test.shape)
print(test)

In [None]:
# Clean every test review with the same pipeline that was used for training
num_reviews = len(test["review"])
clean_test_reviews = []

print ("Cleaning and parsing the test set movie reviews...\n")
# Iterate over the column's values directly instead of looking each row up by
# label, so the loop does not depend on the frame's index labels.
for position, raw_review in enumerate(test["review"], start=1):
    # Print a progress message after every 5000 reviews
    if position % 5000 == 0:
        print ("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(raw_review))

# Bag of words for the test set, built with the vocabulary already learned from
# the training reviews (transform, not fit_transform), as a dense NumPy array
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

print(test_data_features.shape)
print(test_data_features)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
#!pip install numpy==1.16.1
import numpy as np

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

print("finish import")

In [None]:
# Display the installed TensorFlow version.
tf.__version__

In [None]:
# Display the version of the Keras API bundled with TensorFlow.
# NOTE(review): the models in this notebook are built from the standalone `keras`
# package (see the `from keras...` imports), whose version may differ - confirm.
tf.keras.__version__

In [None]:
# Hyperparameters shared by every model below
batch_size_user = 128   # batch_size passed to fit()
embedding_size = 32     # output dimension of each Embedding layer
num_words = 5000        # input dimension of each Embedding layer
max_tokens = 5000       # input_length of each Embedding layer
epoch_user = 10         # epochs passed to fit()

print("User defined parameters set")
for setting in (batch_size_user, embedding_size, num_words, max_tokens, epoch_user):
    print(setting)

In [None]:
# Model 1: Embedding -> Flatten -> one 250-unit ReLU layer -> sigmoid output.
# NOTE(review): the cells below train this model on the CountVectorizer count
# matrix, so the Embedding layer looks up word counts rather than token ids -
# confirm that this is intended.
model = Sequential()
model.add(Embedding(num_words, embedding_size, input_length=max_tokens))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

In [None]:
# Training inputs are the dense bag-of-words array; targets are the "sentiment" column.
X_train = train_data_features
y_train = train["sentiment"]
print("xy values set")

In [None]:
# Re-apply the fixed seed before training so every model starts from the same
# random state. Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:
# Train model 1 on the bag-of-words features
model.fit(
    X_train,
    y_train,
    epochs=epoch_user,
    batch_size=batch_size_user,
    verbose=1,
)


In [None]:
# Use the trained Keras model (not a random forest) to score the test reviews;
# the model's final layer is a single sigmoid unit.
y_test = model.predict(test_data_features)
print(y_test.shape)
# Element-wise sum over all prediction rows
print(sum(y_test))

In [None]:
# Build the submission frame: every test column except the review text, plus a
# "sentiment" column holding the model's predictions
output = test.drop(columns=['review'])
output["sentiment"] = y_test
print(output)

In [None]:
# Write the predictions to a comma-separated file without the index column.
# quoting=3 is csv.QUOTE_NONE, so no field is wrapped in quotes on output.
output.to_csv( "model1.csv", index=False, quoting=3 )
print("model1.csv printed")

In [None]:
# Model 2: Embedding -> Flatten -> five 50-unit ReLU layers -> sigmoid output.
model2 = Sequential()
model2.add(Embedding(num_words, embedding_size, input_length=max_tokens))
model2.add(Flatten())
model2.add(Dense(50, activation='relu'))
model2.add(Dense(50, activation='relu'))
model2.add(Dense(50, activation='relu'))
model2.add(Dense(50, activation='relu'))
model2.add(Dense(50, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model2.summary()

In [None]:
# Re-apply the fixed seed before training so every model starts from the same
# random state. Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:

# Training inputs are the dense bag-of-words array; targets are the "sentiment" column
X_train = train_data_features
y_train = train["sentiment"]
print("xy values set")

# Train model 2
model2.fit(
    X_train,
    y_train,
    epochs=epoch_user,
    batch_size=batch_size_user,
    verbose=1,
)

# Score the test reviews with the trained Keras model
y_test = model2.predict(test_data_features)
print(y_test.shape)
print(sum(y_test))

# Build the submission frame: every test column except the review text, plus a
# "sentiment" column holding the model's predictions
output2 = test.drop(columns=['review'])
output2["sentiment"] = y_test
print(output2)

In [None]:
# Write the predictions to a comma-separated file without the index column.
# quoting=3 is csv.QUOTE_NONE, so no field is wrapped in quotes on output.
output2.to_csv( "model2.csv", index=False, quoting=3 )
print("model2.csv printed")

In [None]:
# Model 3: Embedding -> Flatten -> Dropout(0.2) -> five 50-unit ReLU layers ->
# sigmoid output.
model3 = Sequential()
model3.add(Embedding(num_words, embedding_size, input_length=max_tokens))
model3.add(Flatten())
# No input_shape here: this is not the first layer, so its input shape comes from
# the Flatten output. The previous hard-coded (160000,) would also have gone stale
# as soon as max_tokens or embedding_size changed.
model3.add(Dropout(0.2))
model3.add(Dense(50, activation='relu'))
model3.add(Dense(50, activation='relu'))
model3.add(Dense(50, activation='relu'))
model3.add(Dense(50, activation='relu'))
model3.add(Dense(50, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model3.summary()

In [None]:
# Re-apply the fixed seed before training so every model starts from the same
# random state. Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:

# Training inputs are the dense bag-of-words array; targets are the "sentiment" column
X_train = train_data_features
y_train = train["sentiment"]
print("xy values set")

# Train model 3
model3.fit(
    X_train,
    y_train,
    epochs=epoch_user,
    batch_size=batch_size_user,
    verbose=1,
)

# Score the test reviews with the trained Keras model
y_test = model3.predict(test_data_features)
print(y_test.shape)
print(sum(y_test))

# Build the submission frame: every test column except the review text, plus a
# "sentiment" column holding the model's predictions
output3 = test.drop(columns=['review'])
output3["sentiment"] = y_test
print(output3)

In [None]:
# Write the predictions to a comma-separated file without the index column.
# quoting=3 is csv.QUOTE_NONE, so no field is wrapped in quotes on output.
output3.to_csv( "model3.csv", index=False, quoting=3 )
print("model3.csv printed")

In [None]:
# Model 4: Embedding -> Flatten -> five 50-unit ReLU layers with Dropout(0.2)
# after the second and fourth -> sigmoid output.
model4 = Sequential()
model4.add(Embedding(num_words, embedding_size, input_length=max_tokens))
model4.add(Flatten())
model4.add(Dense(50, activation='relu'))
model4.add(Dense(50, activation='relu'))
model4.add(Dropout(0.2))
model4.add(Dense(50, activation='relu'))
model4.add(Dense(50, activation='relu'))
model4.add(Dropout(0.2))
model4.add(Dense(50, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model4.summary()

In [None]:
# Re-apply the fixed seed before training so every model starts from the same
# random state. Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:

# Training inputs are the dense bag-of-words array; targets are the "sentiment" column
X_train = train_data_features
y_train = train["sentiment"]
print("xy values set")

# Train model 4
model4.fit(
    X_train,
    y_train,
    epochs=epoch_user,
    batch_size=batch_size_user,
    verbose=1,
)

# Score the test reviews with the trained Keras model
y_test = model4.predict(test_data_features)
print(y_test.shape)
print(sum(y_test))

# Build the submission frame: every test column except the review text, plus a
# "sentiment" column holding the model's predictions
output4 = test.drop(columns=['review'])
output4["sentiment"] = y_test
print(output4)

In [None]:
# Write the predictions to a comma-separated file without the index column.
# quoting=3 is csv.QUOTE_NONE, so no field is wrapped in quotes on output.
output4.to_csv( "model4.csv", index=False, quoting=3 )
print("model4.csv printed")

In [None]:
# Recurrent layers such as LSTM and GRU expect inputs of shape (batch_size, timesteps, input_dim).
# In Keras, `input_shape` leaves out the batch dimension, so pass (timesteps, input_dim).

In [None]:
# Model 5: Embedding -> a single 4-unit GRU -> sigmoid output.
model5 = Sequential()
model5.add(Embedding(num_words, embedding_size, input_length=max_tokens))
# Disabled experiments (Flatten/Dropout front end and a deeper GRU stack):
#model5.add(Flatten())
#model5.add(Dropout(0.2))
#model5.add(GRU(units=64, input_shape=(125000000,1,1),return_sequences=True))
#model5.add(GRU(units=32, return_sequences=True))
#model5.add(GRU(units=16, return_sequences=True))
#model5.add(GRU(units=8, return_sequences=True))
model5.add(GRU(units=4))
model5.add(Dense(1, activation='sigmoid'))
model5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model5.summary()

In [None]:
# Re-apply the fixed seed so every model starts from the same random state.
# Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:
#X_train = train_data_features
#y_train = train["sentiment"]
#print("xy values set")
#model5.fit(X_train, y_train, epochs=epoch_user, batch_size=batch_size_user, verbose=1)

#y_test = model5.predict(test_data_features)
#print(y_test.shape)
#print(sum(y_test))

#output5 = test
#output5 = output5.drop(columns=['review'])
#output5["sentiment"] = y_test
#print(output5)

In [None]:
# Model 6: Embedding -> Conv1D(32 filters, kernel 3) -> MaxPooling1D(2) ->
# Flatten -> Dropout(0.2) -> five 50-unit ReLU layers -> sigmoid output.
model6 = Sequential()
model6.add(Embedding(num_words, embedding_size, input_length=max_tokens))
model6.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model6.add(MaxPooling1D(pool_size=2))
model6.add(Flatten())
model6.add(Dropout(0.2))
model6.add(Dense(50, activation='relu'))
model6.add(Dense(50, activation='relu'))
model6.add(Dense(50, activation='relu'))
model6.add(Dense(50, activation='relu'))
model6.add(Dense(50, activation='relu'))
model6.add(Dense(1, activation='sigmoid'))
model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model6.summary()

In [None]:
# Re-apply the fixed seed before training so every model starts from the same
# random state. Each generator is seeded exactly once.
# NOTE(review): the model above was built before this cell runs - confirm that
# re-seeding here (rather than before building the model) gives the intended
# reproducibility.
seed_value = 2019

# 1. `PYTHONHASHSEED`: Python reads this variable when the interpreter starts, so
#    setting it here cannot change string hashing in the already-running kernel.
#    It is still exported so that processes started from this notebook inherit it.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. TensorFlow's graph-level random seed
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Give Keras a new TensorFlow session restricted to one thread for both
#    intra-op and inter-op parallelism.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Report the value actually used, so the message cannot drift from `seed_value`.
print("Seed values re-set to %d." % seed_value)

In [None]:

# Training inputs are the dense bag-of-words array; targets are the "sentiment" column
X_train = train_data_features
y_train = train["sentiment"]
print("xy values set")

# Train model 6
model6.fit(
    X_train,
    y_train,
    epochs=epoch_user,
    batch_size=batch_size_user,
    verbose=1,
)

# Score the test reviews with the trained Keras model
y_test = model6.predict(test_data_features)
print(y_test.shape)
print(sum(y_test))

# Build the submission frame: every test column except the review text, plus a
# "sentiment" column holding the model's predictions
output6 = test.drop(columns=['review'])
output6["sentiment"] = y_test
print(output6)

In [None]:
# Write the predictions to a comma-separated file without the index column.
# quoting=3 is csv.QUOTE_NONE, so no field is wrapped in quotes on output.
output6.to_csv( "model6.csv", index=False, quoting=3 )
print("model6.csv printed")