# Building a CNN Model to test Polarity
### Reference: https://realpython.com/python-keras-text-classification/

### Import Packages and Libraries

In [63]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import re
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import warnings
warnings.filterwarnings("ignore")

In [23]:
data_file = "SHOPEE_MAYBELLINE_CLEAN_V2.csv"
data = pd.read_csv(data_file)
data.columns = data.columns.str.strip().str.replace(" ","_")
# data.info()
# data.head()

# data.drop(columns=['Brand','Category','Product_Name','Price','Reviewer','Product_Purchase','Ratings','Date_Of_Review','Response', 'Topic'])
# review_list = data['Review'].tolist()
# polarity_list = data['Polarity'].tolist()

reviews = data['Review']
# polarity = data['Polarity']
# print (reviews)

review_docs = []
for each_reviews in reviews:
    temp = each_reviews.split(" ")
    review_docs.append(temp)
# print (review_docs)

# Make sure all words are in lowercase
reviews_lower = [[each_word.lower() for each_word in each_review] for each_review in review_docs]
# print (reviews_lower)

# Use regular expressions to keep only allphabetical words
reviews_alpha = [[each_word for each_word in each_review if re.search('^[a-z]+$', each_word)] for each_review in reviews_lower]
# print (reviews_alpha)

# Remove stop words
stop_list = stopwords.words('english')
reviews_stop = [[each_word for each_word in each_review if each_word not in stop_list] for each_review in reviews_alpha]
# print (reviews_stop)

# Porter Stemming
stemmer = PorterStemmer()
reviews_stem = [[stemmer.stem(each_word) for each_word in each_review] for each_review in reviews_stop]
# print (reviews_stem)

all_data_cleaned = []
for each_sentence in reviews_stem:
    sentence = ""
    for each_word in each_sentence:
        sentence += each_word + " "
    sentence = sentence[0:-1]
    all_data_cleaned.append(sentence)
print (all_data_cleaned)

polarity_raw = data['Polarity']
polarity_0_and_1 = []
for each_polarity in polarity_raw:
    if int(each_polarity) == int("0"):
        polarity_0_and_1.append(0.5)
    if int(each_polarity) == int("-1"):
        polarity_0_and_1.append(int(0))
    if int(each_polarity) == int("1"):
        polarity_0_and_1.append(int(1))
# print (polarity)


['fast deliveri bubbl wrap', 'yet tri', 'chooos colour dull dont like use anymor', 'get flash deal still worth price', 'ship took one week', 'pretti mix color wear lip heh thank', 'pack leak', 'handi travel', 'good deal consid brand', 'packag arriv sooner expect well packag well', 'wait test', 'overal good buy tri product', 'deliveri realli fast', 'pack small item fuss free neat', 'howev product realli sticki', 'tri colour particular one sticki even use makeup remov lip dri still sticki', 'prolli wont use', 'next day deliveri', 'fast ship excel product', 'worth buy', 'receiv good condit', 'arriv super day order', 'product cake well protect', 'fast deliveri got flash', 'deliv next say', 'fast delivedi item good condit', 'worth smell realli', 'smell good colour strong', 'receiv super quick nice', 'receiv fast', 'item authent', 'yet tri', 'bought sale good discount', 'pretti fast', 'receiv within work day bubbl wrap envelop', 'colour onlin imag true life', 'fast item receiv good condit', 

### Building a Model - Logistic Regression

In [24]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)


vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

logRegClf = LogisticRegression()
logRegClf.fit(X_train, y_train)
logRegClfscore = logRegClf.score(X_test, y_test)

print ("Accuracy of Logistic Regression: ", logRegClfscore*100)

Accuracy:  81.19658119658119


### Building a Model - Logistic Regression with TFIDF

In [26]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)

logRegTFIDFclf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression())])

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2')}

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

logRegTFIDFclf = GridSearchCV(logRegTFIDFclf, parameters, cv=10)
logRegTFIDFclf.fit(X_train, y_train)
logRegTFIDFscore = logRegTFIDFclf.score (X_test, y_test)

print ("Accuracy of Logistic Regression with TFIDF: ", logRegTFIDFscore*100)
logRegTFIDFclf.best_params_
# print(classification_report(y_test, mnbTFIDFclf.predict(X_test), digits=4))

Accuracy of Multinomial Naive Bayes with TFIDF:  77.77777777777779


{'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

### Build a Model - Naive Bayes (Multinomial/Gaussian)


In [27]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

mnbClf = MultinomialNB()
mnbClf.fit(X_train, y_train)
mnbClfscore = mnbClf.score (X_test, y_test)

print ("Accuracy of Multinomial Naive Bayes: ", mnbClfscore*100)

bnbClf = BernoulliNB()
bnbClf.fit(X_train, y_train)
bnbClfscore = bnbClf.score (X_test, y_test)

print ("Accuracy of Bernoulli Naive Bayes: ", bnbClfscore*100)

Accuracy of Multinomial Naive Bayes:  81.19658119658119
Accuracy of Bernoulli Naive Bayes:  74.35897435897436


### Build a Model - Multinomial NB with TFIDF

In [28]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)

mnbTFIDFclf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2'),
              'clf__alpha': [1, 1e-1, 1e-2]}

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

mnbTFIDFclf = GridSearchCV(mnbTFIDFclf, parameters, cv=10)
mnbTFIDFclf.fit(X_train, y_train)
mnbTFIDFscore = mnbTFIDFclf.score (X_test, y_test)

print ("Accuracy of Multinomial Naive Bayes with TFIDF: ", mnbTFIDFscore*100)
mnbTFIDFclf.best_params_
# print(classification_report(y_test, mnbTFIDFclf.predict(X_test), digits=4))

Accuracy of Multinomial Naive Bayes with TFIDF:  82.90598290598291


{'clf__alpha': 0.1,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1)}

### Building a Model - SVM

In [62]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

parameters = {'C':[1,2,3,4,5,6,7,8,14], 
              'gamma':[0.1, 0.01, 0.001, 0.0001], 
              'kernel':['linear', 'poly', 'rbf'], 
              'degree': [1,2,3,4,5]}

svmClf = GridSearchCV(param_grid = parameters, 
                      estimator= SVC(), 
                      scoring='accuracy', 
                      refit= True, 
                      verbose=1)

svmClf.fit(X_train, y_train)
svmClfscore = svmClf.score(X_test, y_test)

print ('Accuracy of SVM: ', svmClfscore*100)
svmClf.best_params_

Fitting 3 folds for each of 540 candidates, totalling 1620 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy of SVM:  82.05128205128204


[Parallel(n_jobs=1)]: Done 1620 out of 1620 | elapsed:    7.6s finished


{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'rbf'}

### Building a Model - SVM with TFIDF

In [43]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)

svmTFIDFclf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(C=3, degree=1, gamma=0.1, kernel='rbf'))])

# parameters = {'C':[1,2,3,4,5,6,7,8,14], 
#               'gamma':[0.1, 0.01, 0.001, 0.0001], 
#               'kernel':['linear', 'poly', 'rbf'], 
#               'degree': [1,2,3,4,5]}

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2')}

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

svmTFIDFclf = GridSearchCV(svmTFIDFclf, parameters, cv=10)
svmTFIDFclf.fit(X_train, y_train)
svmTFIDFscore = svmTFIDFclf.score(X_test, y_test)

print ("Accuracy of SVM with TFIDF: ", svmTFIDFscore*100)
svmTFIDFclf.best_params_

Accuracy of SVM with TFIDF:  77.77777777777779


{'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

### Building a Model - SVM with PCA

In [65]:
reviews = all_data_cleaned
# polarity_negative1_and_1 = data['Polarity']
polarity = data['Polarity']
# print (polarity)
# print (reviews)

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

pca = PCA(n_components = 10)
principalComponents = pca.fit_transform(reviews)

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

parameters = {'C':[1,2,3,4,5,6,7,8,14], 
              'gamma':[0.1, 0.01, 0.001, 0.0001], 
              'kernel':['linear', 'poly', 'rbf'], 
              'degree': [1,2,3,4,5]}

svmClf = GridSearchCV(param_grid = parameters, 
                      estimator= SVC(), 
                      scoring='accuracy', 
                      refit= True, 
                      verbose=1)

svmClf.fit(X_train, y_train)
svmClfscore = svmClf.score(X_test, y_test)

print ('Accuracy of SVM: ', svmClfscore*100)
svmClf.best_params_

ValueError: could not convert string to float: 'fast deliveri bubbl wrap'

### Building a Model - ANN

In [56]:
reviews = all_data_cleaned
polarity = polarity_0_and_1
# polarity = data['Polarity']
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=1000)
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

input_dim = X_train.shape[1]

In [57]:
def create_model():
    # create model
    model = tf.keras.Sequential()
    model.add(layers.Dense(16, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(layers.Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(layers.Dense(2, activation='softmax'))
    
    # compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
    return model

num_epochs = 50
batch_size = 10

history = model.fit(X_train, y_train, epochs = num_epochs, validation_data = (X_test, y_test), batch_size = batch_size)

Train on 349 samples, validate on 117 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
ANNscore = accuracy
print ("Accuracy of ANN:", ANNscore*100)


Accuracy of ANN: 77.77777910232544


### Overview of All Accuracy

In [61]:
print ("Accuracy of Logistic Regression: ", logRegClfscore*100)
print ("Accuracy of Logistic Regression with TFIDF: ", logRegTFIDFscore*100)
print ("Accuracy of Multinomial Naive Bayes: ", mnbClfscore*100)
print ("Accuracy of Multinomial Naive Bayes with TFIDF: ", mnbTFIDFscore*100)
print ("Accuracy of Bernoulli Naive Bayes: ", bnbClfscore*100)
print ('Accuracy of SVM: ', svmClfscore*100)
print ("Accuracy of SVM with TFIDF: ", svmTFIDFscore*100)
print ("Accuracy of ANN:", ANNscore*100)


Accuracy of Logistic Regression:  81.19658119658119
Accuracy of Logistic Regression with TFIDF:  77.77777777777779
Accuracy of Multinomial Naive Bayes:  81.19658119658119
Accuracy of Multinomial Naive Bayes with TFIDF:  82.90598290598291
Accuracy of Bernoulli Naive Bayes:  74.35897435897436
Accuracy of SVM:  82.05128205128204
Accuracy of SVM with TFIDF:  77.77777777777779
Accuracy of ANN: 77.77777910232544


Baseline Model: Naive Bayes - so we tried both Multinomial and Bernoulli
Then tried a few more models like logistic regression, SVM, ..., ...
Then tried MNB with TFIDF

can try PCA word2vec doc2vec

Why TFIDF reduce accuracy? https://datascience.stackexchange.com/questions/13660/in-general-when-does-tf-idf-reduce-accuracy

### Building a Model - First Keras Model

In [4]:
# print (type(reviews))

reviews = all_data_cleaned
polarity = polarity_0_and_1
# polarity = data['Polarity']
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=1000)
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

input_dim = X_train.shape[1]
# print (input_dim)
# input_shape = (len(X_train, ))
# print (input_shape)
model = tf.keras.Sequential()
model.add(layers.Dense(10, input_dim = input_dim , activation='tanh'))
# model.add(layers.Dense(10, input_dim = input_dim , activation='sigmoid'))
# model.add(layers.Dense(1, input_dim = input_shape , activation='relu'))
model.add(layers.Dense(5, activation = 'softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                3110      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 55        
Total params: 3,165
Trainable params: 3,165
Non-trainable params: 0
_________________________________________________________________


In [5]:
model.compile(optimizer=tf.optimizers.Adam(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [6]:
num_epochs = 50
batch_size = 10

history = model.fit(X_train, y_train, epochs = num_epochs, validation_data = (X_test, y_test), batch_size = batch_size)

Train on 349 samples, validate on 117 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [7]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

### num_epochs = 100
# tanh > softmax == 0.7863
# sigmoid > softmax == 0.7778
# tanh > sigmoid > softmax == 0.7778
# tanh > relu > softmax == 0.7607
# tanh > tanh > softmax == 0.7607
# relu > tanh > softmax == 0.7521

### num_epochs = 50
# tanh > softmax == 0.7863
# sigmoid > softmax == 0.7778
# tanh > sigmoid > softmax == 0.7949
# tanh > relu > softmax == 0.7863
# tanh > tanh > softmax == 0.7436
# relu > tanh > softmax == 0.7778

Training Accuracy: 0.8424
Testing Accuracy: 0.7778


### Building a Model - with Word Embeddings

In [9]:
reviews = all_data_cleaned
# print (reviews)
polarity = polarity_0_and_1
# print (polarity)


X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=1000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# y_train= tokenizer.texts_to_sequences(y_train)
# y_test = tokenizer.texts_to_sequences(y_test)

vocab_size = len(tokenizer.word_index) + 1

# print (X_train[0:5])
# print (X_train_num[0:5])

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [10]:
embedding_dim = 50
model2 = tf.keras.Sequential()
model2.add(layers.Embedding(input_dim = vocab_size,
                            output_dim = embedding_dim,
                            input_length = maxlen))
model2.add(layers.Flatten())
model2.add(layers.Dense(10, activation='relu'))
model2.add(layers.Dense(1, activation='sigmoid'))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           15600     
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 65,621
Trainable params: 65,621
Non-trainable params: 0
_________________________________________________________________


In [11]:
model2.compile(optimizer=tf.optimizers.Adam(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [12]:
num_epochs = 50
batch_size = 10

history2 = model2.fit(X_train, y_train, epochs = num_epochs, validation_data = (X_test, y_test), batch_size = batch_size)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'float'>", "<class 'int'>"})

### Building a Model - Using Pretrained Word Embeddings
#### Download the GloVe
#### Another alternative is to train your own word embeddings with the gemsim python package

In [13]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [16]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B/glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2273: character maps to <undefined>

In [17]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
covered_vocabulary = nonzero_elements / vocab_size
print (covered_vocabulary)

NameError: name 'embedding_matrix' is not defined

### Building a Model - CNN

In [33]:
# print (type(reviews))

reviews = all_data_cleaned
polarity = polarity_0_and_1
# polarity = data['Polarity']
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(reviews)
vectorizer.vocabulary_
vectorizer.transform(reviews).toarray()

X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=1000)
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)


vocab_size = len(tokenizer.word_index) + 1
print (vocab_size)
maxlen = 310
# X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
# X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
embedding_dim = 50
# print (input_dim)
# input_shape = (len(X_train, ))
# print (input_shape)
modelCNN = tf.keras.Sequential()
modelCNN.add(layers.Embedding(input_dim = vocab_size,
                           output_dim = embedding_dim,
                           input_length = maxlen))
modelCNN.add(layers.Flatten())
# modelCNN.add(layers.GlobalMaxPool1D())
modelCNN.add(layers.Dense(10, activation='relu'))
modelCNN.add(layers.Dense(1, activation='sigmoid'))
# model.add(layers.Dense(10, input_dim = input_dim , activation='sigmoid'))
# model.add(layers.Dense(1, input_dim = input_shape , activation='relu'))
modelCNN.add(layers.Dense(5, activation = 'softmax'))
modelCNN.summary()

312
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 310, 50)           15600     
_________________________________________________________________
flatten_5 (Flatten)          (None, 15500)             0         
_________________________________________________________________
dense_19 (Dense)             (None, 10)                155010    
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 11        
_________________________________________________________________
dense_21 (Dense)             (None, 5)                 10        
Total params: 170,631
Trainable params: 170,631
Non-trainable params: 0
_________________________________________________________________


In [34]:
modelCNN.compile(optimizer=tf.optimizers.Adam(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history = modelCNN.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

