# Convolutional Neural Network

In [76]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [77]:
csv = 'touristsentimentpn.csv'
my_df = pd.read_csv(csv,index_col=0)
my_df.head()

Unnamed: 0_level_0,location,review,sentiment
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Syambhunath,It is at the top of valleys mountain. Best pl...,1
2,Syambhunath,This place has a significant importance in Bud...,1
3,Syambhunath,Visited this from the other side on a rainy ev...,1
4,Syambhunath,A beautiful temple situated in the capital wit...,1
5,Syambhunath,"great, beautiful, historic & religious place.....",1


In [78]:
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5401 entries, 0 to 5400
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   location   5401 non-null   object
 1   review     5401 non-null   object
 2   sentiment  5401 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 126.7+ KB


In [79]:
x = my_df.review
y = my_df.sentiment

In [80]:
from sklearn.model_selection import train_test_split

SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [81]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [82]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [83]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

In [84]:
cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(sg=0, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 1192599.94it/s]


In [85]:
%%time
for epoch in range(30):
    model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 1542309.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 5442920.69it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 1821601.47it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 2650764.79it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 2698443.82it/s]
100%|███████████████████████████████████

Wall time: 1.67 s


In [86]:
model_ug_sg = Word2Vec(sg=1,  negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]


In [87]:
%%time
for epoch in range(30):
    model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha -= 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha

100%|██████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 794606.47it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 5612843.39it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 2700373.81it/s]
100%|██████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 369556.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 1282390.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 5401/5401 [00:00<00:00, 631754.03it/s]
100%|███████████████████████████████████

Wall time: 2.06 s


In [88]:
model_ug_cbow.save('w2v_model_ug_cbow.word2vec')
model_ug_sg.save('w2v_model_ug_sg.word2vec')

Let's load the Word2Vec models to extract word vectors from. 

In [89]:
from gensim.models import KeyedVectors
model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')

In [90]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [91]:
len(tokenizer.word_index)


6326

In [92]:
#first five entries of original data
for x in x_train[:5]:
    print (x)

The lovely and enchanted nation of Nepal never disappoints, with its thick wildernesses, overcast mountain ridges, extraordinary untamed life, interesting societies and sparkling geography. The lakes, flickering like mirrors and mirroring …
Giving this place a rating of 5/5

The place is very peaceful due to restrictions on vehicle movements. The place …
Very well managed and a great place carrying religious values
It is one of the most important place of chitwan. It's tourism area of chitwan. Tourists are more visit chitwan national parks.
Beautiful, rural and untouched corner of Nepal! Amazing local Tamang culture with lovely locals ready to race to invite you for a cup of tea. Definitely worth the travel


And the same data prepared as sequential data as below


In [93]:

sequences[:5]


[[1,
  145,
  2,
  2162,
  1750,
  4,
  20,
  299,
  3055,
  17,
  61,
  3056,
  3057,
  3058,
  175,
  2163,
  1501,
  2164,
  96,
  300,
  3059,
  2,
  2165,
  3060,
  1,
  525,
  3061,
  98,
  1502,
  2,
  3062,
  23],
 [1751,
  14,
  3,
  7,
  2166,
  4,
  410,
  410,
  1,
  3,
  6,
  16,
  50,
  690,
  5,
  1752,
  44,
  910,
  3063,
  1,
  3,
  23],
 [16, 91, 350, 2, 7, 30, 3, 3064, 115, 720],
 [13,
  6,
  15,
  4,
  1,
  38,
  214,
  3,
  4,
  139,
  35,
  313,
  102,
  4,
  139,
  148,
  25,
  82,
  10,
  139,
  51,
  597],
 [12,
  3065,
  2,
  911,
  1079,
  4,
  20,
  29,
  136,
  1190,
  117,
  17,
  145,
  393,
  1191,
  5,
  2167,
  5,
  3066,
  9,
  11,
  7,
  1310,
  4,
  576,
  328,
  140,
  1,
  472]]

Each word is represented as a number, and we can see that the number of words in each sentence is matching the length of numbers in the "sequences".

In [94]:
length = []
for x in x_train:
    length.append(len(x.split()))

In [95]:
max(length)


70

In [96]:
x_train_seq = pad_sequences(sequences, maxlen=45)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (5292, 45)


In [97]:
x_train_seq[:5]


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    1,  145,    2, 2162, 1750,    4,   20,  299, 3055,
          17,   61, 3056, 3057, 3058,  175, 2163, 1501, 2164,   96,  300,
        3059,    2, 2165, 3060,    1,  525, 3061,   98, 1502,    2, 3062,
          23],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1751,   14,    3,    7, 2166,    4,  410,  410,    1,    3,
           6,   16,   50,  690,    5, 1752,   44,  910, 3063,    1,    3,
          23],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   16,   91,  350,    2,    7,   30,    3, 3064,  115,
         720],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [98]:
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=45)

In [99]:
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [100]:
np.array_equal(embedding_matrix[6] ,embeddings_index.get('you'))


False

In [101]:
seed = 7

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [102]:
model_ptw2v = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=False)
model_ptw2v.add(e)
model_ptw2v.add(Flatten())
model_ptw2v.add(Dense(256, activation='relu'))
model_ptw2v.add(Dense(1, activation='sigmoid'))
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 2s - loss: 0.6570 - accuracy: 0.9545 - val_loss: 0.6183 - val_accuracy: 0.9815 - 2s/epoch - 10ms/step
Epoch 2/5
166/166 - 1s - loss: 0.5903 - accuracy: 0.9601 - val_loss: 0.5529 - val_accuracy: 0.9815 - 1s/epoch - 8ms/step
Epoch 3/5
166/166 - 1s - loss: 0.5324 - accuracy: 0.9601 - val_loss: 0.4960 - val_accuracy: 0.9815 - 1s/epoch - 8ms/step
Epoch 4/5
166/166 - 1s - loss: 0.4823 - accuracy: 0.9601 - val_loss: 0.4465 - val_accuracy: 0.9815 - 1s/epoch - 8ms/step
Epoch 5/5
166/166 - 1s - loss: 0.4391 - accuracy: 0.9601 - val_loss: 0.4035 - val_accuracy: 0.9815 - 1s/epoch - 8ms/step


<keras.callbacks.History at 0x27c94844880>

In [103]:
model_ptw2v = Sequential()
e = Embedding(100000, 200, input_length=45)
model_ptw2v.add(e)
model_ptw2v.add(Flatten())
model_ptw2v.add(Dense(256, activation='relu'))
model_ptw2v.add(Dense(1, activation='sigmoid'))
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 30s - loss: 0.0535 - accuracy: 0.9739 - val_loss: 0.0131 - val_accuracy: 1.0000 - 30s/epoch - 179ms/step
Epoch 2/5
166/166 - 29s - loss: 0.0266 - accuracy: 1.0000 - val_loss: 0.0122 - val_accuracy: 1.0000 - 29s/epoch - 174ms/step
Epoch 3/5
166/166 - 29s - loss: 0.0249 - accuracy: 1.0000 - val_loss: 0.0112 - val_accuracy: 1.0000 - 29s/epoch - 174ms/step
Epoch 4/5
166/166 - 29s - loss: 0.0232 - accuracy: 1.0000 - val_loss: 0.0104 - val_accuracy: 1.0000 - 29s/epoch - 174ms/step
Epoch 5/5
166/166 - 29s - loss: 0.0216 - accuracy: 1.0000 - val_loss: 0.0098 - val_accuracy: 1.0000 - 29s/epoch - 174ms/step


<keras.callbacks.History at 0x27c84c57250>

In [104]:
model_ptw2v = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=True)
model_ptw2v.add(e)
model_ptw2v.add(Flatten())
model_ptw2v.add(Dense(256, activation='relu'))
model_ptw2v.add(Dense(1, activation='sigmoid'))
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 28s - loss: 0.6563 - accuracy: 0.9548 - val_loss: 0.6173 - val_accuracy: 0.9815 - 28s/epoch - 171ms/step
Epoch 2/5
166/166 - 28s - loss: 0.5893 - accuracy: 0.9601 - val_loss: 0.5519 - val_accuracy: 0.9815 - 28s/epoch - 167ms/step
Epoch 3/5
166/166 - 28s - loss: 0.5315 - accuracy: 0.9601 - val_loss: 0.4950 - val_accuracy: 0.9815 - 28s/epoch - 171ms/step
Epoch 4/5
166/166 - 28s - loss: 0.4815 - accuracy: 0.9601 - val_loss: 0.4456 - val_accuracy: 0.9815 - 28s/epoch - 166ms/step
Epoch 5/5
166/166 - 28s - loss: 0.4383 - accuracy: 0.9601 - val_loss: 0.4029 - val_accuracy: 0.9815 - 28s/epoch - 167ms/step


<keras.callbacks.History at 0x27c8579bfa0>

# Convolutional Neural Network

In [105]:
from keras.layers import Conv1D, GlobalMaxPooling1D


In [106]:
structure_test = Sequential()
e = Embedding(100000, 200, input_length=45)
structure_test.add(e)
structure_test.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
structure_test.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 45, 200)           20000000  
                                                                 
 conv1d_8 (Conv1D)           (None, 44, 100)           40100     
                                                                 
Total params: 20,040,100
Trainable params: 20,040,100
Non-trainable params: 0
_________________________________________________________________


In [107]:
structure_test = Sequential()
e = Embedding(100000, 200, input_length=45)
structure_test.add(e)
structure_test.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
structure_test.add(GlobalMaxPooling1D())
structure_test.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 45, 200)           20000000  
                                                                 
 conv1d_9 (Conv1D)           (None, 44, 100)           40100     
                                                                 
 global_max_pooling1d_7 (Glo  (None, 100)              0         
 balMaxPooling1D)                                                
                                                                 
Total params: 20,040,100
Trainable params: 20,040,100
Non-trainable params: 0
_________________________________________________________________


In [108]:
model_cnn_01 = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=False)
model_cnn_01.add(e)
model_cnn_01.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_01.add(GlobalMaxPooling1D())
model_cnn_01.add(Dense(256, activation='relu'))
model_cnn_01.add(Dense(1, activation='sigmoid'))
model_cnn_01.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 1s - loss: 0.6568 - accuracy: 0.9545 - val_loss: 0.6181 - val_accuracy: 0.9815 - 1s/epoch - 8ms/step
Epoch 2/5
166/166 - 1s - loss: 0.5899 - accuracy: 0.9601 - val_loss: 0.5525 - val_accuracy: 0.9815 - 810ms/epoch - 5ms/step
Epoch 3/5
166/166 - 1s - loss: 0.5319 - accuracy: 0.9601 - val_loss: 0.4955 - val_accuracy: 0.9815 - 671ms/epoch - 4ms/step
Epoch 4/5
166/166 - 1s - loss: 0.4818 - accuracy: 0.9601 - val_loss: 0.4461 - val_accuracy: 0.9815 - 673ms/epoch - 4ms/step
Epoch 5/5
166/166 - 1s - loss: 0.4386 - accuracy: 0.9601 - val_loss: 0.4031 - val_accuracy: 0.9815 - 657ms/epoch - 4ms/step


<keras.callbacks.History at 0x27c86171e20>

In [109]:
model_cnn_02 = Sequential()
e = Embedding(100000, 200, input_length=45)
model_cnn_02.add(e)
model_cnn_02.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_02.add(GlobalMaxPooling1D())
model_cnn_02.add(Dense(256, activation='relu'))
model_cnn_02.add(Dense(1, activation='sigmoid'))
model_cnn_02.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_02.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 29s - loss: 0.0743 - accuracy: 0.9743 - val_loss: 0.0410 - val_accuracy: 0.9815 - 29s/epoch - 172ms/step
Epoch 2/5
166/166 - 28s - loss: 1.2319e-04 - accuracy: 1.0000 - val_loss: 0.0566 - val_accuracy: 0.9815 - 28s/epoch - 166ms/step
Epoch 3/5
166/166 - 27s - loss: 2.5551e-05 - accuracy: 1.0000 - val_loss: 0.0655 - val_accuracy: 0.9815 - 27s/epoch - 166ms/step
Epoch 4/5
166/166 - 28s - loss: 1.2332e-05 - accuracy: 1.0000 - val_loss: 0.0706 - val_accuracy: 0.9815 - 28s/epoch - 169ms/step
Epoch 5/5
166/166 - 29s - loss: 7.3691e-06 - accuracy: 1.0000 - val_loss: 0.0749 - val_accuracy: 0.9815 - 29s/epoch - 173ms/step


<keras.callbacks.History at 0x27c8632c400>

In [110]:
model_cnn_03 = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=True)
model_cnn_03.add(e)
model_cnn_03.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_03.add(GlobalMaxPooling1D())
model_cnn_03.add(Dense(256, activation='relu'))
model_cnn_03.add(Dense(1, activation='sigmoid'))
model_cnn_03.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_03.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

Epoch 1/5
166/166 - 30s - loss: 0.6570 - accuracy: 0.9552 - val_loss: 0.6182 - val_accuracy: 0.9815 - 30s/epoch - 179ms/step
Epoch 2/5
166/166 - 32s - loss: 0.5903 - accuracy: 0.9601 - val_loss: 0.5529 - val_accuracy: 0.9815 - 32s/epoch - 193ms/step
Epoch 3/5
166/166 - 32s - loss: 0.5324 - accuracy: 0.9601 - val_loss: 0.4959 - val_accuracy: 0.9815 - 32s/epoch - 190ms/step
Epoch 4/5
166/166 - 35s - loss: 0.4823 - accuracy: 0.9601 - val_loss: 0.4464 - val_accuracy: 0.9815 - 35s/epoch - 211ms/step
Epoch 5/5
166/166 - 34s - loss: 0.4390 - accuracy: 0.9601 - val_loss: 0.4034 - val_accuracy: 0.9815 - 34s/epoch - 204ms/step


<keras.callbacks.History at 0x27c8645e3a0>