In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
import string
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense,Dropout
from collections import Counter
import matplotlib.pyplot as plt

In [23]:
def load_doc(filename):
  file=open(filename,'r')
  text=file.read()
  file.close()
  return text
def clean_doc(doc):
  doc=doc.split()
  re_punc=re.compile('[%s]' %re.escape(string.punctuation))
  tokens=[re_punc.sub('',w) for w in doc]
  re_html=re.compile('<.*?>')
  tokens=[re_html.sub('',w) for w in tokens]
  tokens=[word for word in tokens if word.isalpha()]
  stop_words=set(stopwords.words('english'))
  tokens=[w for w in tokens if not w in stop_words]
  tokens=[word for word in tokens if len(word) > 1]
  return tokens

In [24]:
def doc_to_line(filename,vocab):
  doc=load_doc(filename)
  tokens=clean_doc(doc)
  tokens=[w for w in tokens if w in vocab]
  return ' '.join(tokens)

In [25]:
def process_docs(directory,vocab):
  lines=list()
  for filename in os.listdir(directory):
    path =directory+'/'+filename
    line=doc_to_line(path,vocab)
    lines.append(line)
  return lines

In [26]:
def load_clean_dataset(vocab):
  neg=process_docs('txt_sentoken/neg',vocab)
  pos=process_docs('txt_sentoken/pos',vocab)
  docs=neg+pos
  labels=[0 for _ in range(len(neg))] +[1 for _ in range(len(pos))]
  return np.array(docs),np.array(labels)

In [27]:
def prepare_data(train_docs,test_docs,mode):
  tokenizer=Tokenizer()
  tokenizer.fit_on_texts(train_docs)
  X_train=tokenizer.texts_to_matrix(train_docs,mode=mode)
  return X_train

In [28]:
def define_models(n_words):
  model=Sequential()
  model.add(Dense(35,input_shape=(n_words,),activation='relu'))
  model.add(Dropout(.5))
  model.add(Dense(1,activation='sigmoid'))
  model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
  plot_model(model,to_file='model.png',show_shapes=True)
  return model

In [29]:
def predict_sentiment(review,vocab,tokeninzer,model):
  tokens=clean_doc(review)
  tokens=[w for w in tokens if w in vocab]
  line=' '.join(tokens)
  encoded=tokenizer.texts_to_matrix([line],mode='binary')
  yhat=model.predict(encoded,verbose=0)
  print('yhat value:',yhat)
  print(yhat[0])
  percent_pos=yhat[0,0]
  print(percent_pos)
  if round(percent_pos) == 0:
    return (1-percent_pos), 'Negative'
  return percent_pos, 'Positive'

In [32]:
vocab_filename='vocab.txt'
vocab=load_doc(vocab_filename)
vocab=set(vocab.split())
train_docs,ytrain=load_clean_dataset(vocab)
tokenizer=Tokenizer()
tokenizer.fit_on_texts(train_docs)
X_train=tokenizer.texts_to_matrix(train_docs,mode='binary')
n_words=X_train.shape[1]
model=define_models(n_words)
model.fit(X_train,ytrain,epochs=10,verbose=0)

<keras.callbacks.History at 0x7fd97b9e31d0>

In [33]:
model.save('model_without_embedding.h5',save_format='h5')

In [34]:
text='Best movie ever! It was great,I recommend it'
text2='This is bad movie'
percent,sentiment=predict_sentiment(text,vocab,tokenizer,model)
print('Review: [%s]\n Sentiment: %s (%.3f%%)' %(text,sentiment,percent*100))
percent,sentiment=predict_sentiment(text2,vocab,tokenizer,model)
print('Review: [%s]\n Sentiment: %s (%.3f%%)' %(text,sentiment,percent*100))

yhat value: [[0.49587312]]
[0.49587312]
0.49587312
Review: [Best movie ever! It was great,I recommend it]
 Sentiment: Negative (50.413%)
yhat value: [[0.33444047]]
[0.33444047]
0.33444047
Review: [Best movie ever! It was great,I recommend it]
 Sentiment: Negative (66.556%)
