In [5]:
import keras

In [6]:
import string
import re
from numpy import array

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

In [8]:
from os import listdir

In [9]:
#Load the doc

def load_doc(filename):
    """Read a text file and return its whole content as a single string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/read/close sequence did not guarantee.
    with open(filename,'r') as file:
        return file.read()

In [16]:
#Turn a doc into clean tokens
def clean_doc(doc,vocab):
    """Reduce a document to the space-joined words that appear in ``vocab``.

    The text is split on whitespace, every punctuation character listed in
    ``string.punctuation`` is removed from each word, and only the resulting
    words that are members of ``vocab`` are kept (original order preserved).

    Args:
        doc: Raw document text.
        vocab: Container of allowed words, used for membership tests.

    Returns:
        A single string holding the kept words separated by one space.
    """
    punct_pattern=re.compile('[%s]'%re.escape(string.punctuation))
    kept=[]
    for raw_word in doc.split():
        stripped=punct_pattern.sub('',raw_word)
        if stripped in vocab:
            kept.append(stripped)
    return ' '.join(kept)

In [17]:
#load all docs in a directory
def process_docs(directory,vocab,is_train):
    """Load and clean every file of one data split found in ``directory``.

    Files whose name starts with 'cv9' form the held-out split: they are
    skipped when ``is_train`` is truthy and are the only files kept otherwise.

    Args:
        directory: Folder whose entries are listed with ``listdir``.
        vocab: Allowed words, forwarded to ``clean_doc``.
        is_train: Truthy to collect the training files, falsy for the rest.

    Returns:
        A list with one cleaned document string per selected file, in the
        order ``listdir`` yields them.
    """
    cleaned=[]
    for name in listdir(directory):
        held_out=name.startswith('cv9')
        # a file belongs to the requested split when exactly one of
        # "training requested" / "held-out file" is true
        wanted=held_out if not is_train else not held_out
        if wanted:
            raw_text=load_doc(directory+'/'+name)
            cleaned.append(clean_doc(raw_text,vocab))
    return cleaned
    

In [18]:
# Read the vocabulary file and keep its whitespace-separated entries as a set,
# which clean_doc uses for membership tests.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
vocab_filename='C:/Users/Admin/OneDrive/NLP_Mphasis_MLA_Training/vocab1.txt'
vocab=load_doc(vocab_filename)
vocab=set(vocab.split())


In [19]:
# Sanity check: load a single review from the 'pos' folder, clean it against
# the vocabulary and print the tokens that survive.
# NOTE(review): absolute, machine-specific path.

filename='C:/Users/Admin/OneDrive/NLP_Mphasis_MLA_Training/txt_sentoken/pos/cv000_29590.txt'
text=load_doc(filename)
tokens=clean_doc(text,vocab)
print(tokens)

films adapted comic books plenty success whether theyre superheroes batman superman spawn geared toward kids casper arthouse crowd ghost world theres never really comic book like hell starters created alan moore eddie campbell brought medium whole new level mid series called say moore campbell thoroughly subject jack ripper would like saying michael jackson starting look little odd book graphic novel pages long includes nearly consist nothing footnotes words dont dismiss film source get past whole comic book thing might find another stumbling block hells directors albert allen hughes getting hughes brothers direct seems almost ludicrous casting carrot top well anything better direct film thats set ghetto features really violent street crime mad geniuses behind menace ii society ghetto question course whitechapel londons east end filthy place whores called unfortunates starting get little nervous mysterious psychopath carving profession surgical precision first stiff turns copper peter 

In [20]:
#load and clean dataset

def load_clean_dataset(vocab,is_train,data_dir='C:/Users/Admin/OneDrive/NLP_Mphasis_MLA_Training/txt_sentoken'):
    """Load the cleaned negative and positive reviews together with labels.

    Args:
        vocab: Allowed words, forwarded to ``process_docs``.
        is_train: Forwarded to ``process_docs`` to select the split.
        data_dir: Folder that contains the 'neg' and 'pos' sub-folders.
            Defaults to the location this notebook was written against, so
            existing two-argument calls behave exactly as before.

    Returns:
        A ``(docs, labels)`` pair. ``docs`` lists the negative documents
        followed by the positive ones; ``labels`` is an array holding 0 for
        each negative document and 1 for each positive one, in that order.
    """
    neg=process_docs(data_dir+'/neg',vocab,is_train)
    pos=process_docs(data_dir+'/pos',vocab,is_train)
    docs=neg+pos

    #prepare labels: negatives first, matching the order of docs
    labels=array([0]*len(neg)+[1]*len(pos))
    return docs,labels

In [21]:
#fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts passed to ``fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted=Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [22]:
#integer encode and pad documents
def encode_docs(tokenizer,max_length,docs):
    """Convert documents to integer sequences padded to ``max_length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Passed to ``pad_sequences`` as ``maxlen``.
        docs: The texts to encode.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )
    

In [23]:
#Define the model

def model(vocab_size,max_length):
    """Build, compile and summarise the convolutional sentiment classifier.

    Layer stack, in order: 100-dimensional Embedding, Conv1D (32 filters,
    kernel size 8, relu), MaxPooling1D (pool size 2), Flatten, Dense(10, relu)
    and a single sigmoid output unit. The network is compiled with the adam
    optimizer, binary cross-entropy loss and the accuracy metric, and its
    summary is printed before it is returned.

    NOTE: the training cell later rebinds the global name ``model`` to the
    network returned here, which hides this function.

    Args:
        vocab_size: First argument of the Embedding layer.
        max_length: ``input_length`` of the Embedding layer.

    Returns:
        The compiled ``Sequential`` network.
    """
    net=Sequential()

    layers=[
        #input layer
        Embedding(vocab_size,100,input_length=max_length),
        #hidden layers
        Conv1D(filters=32,kernel_size=8,activation='relu'),
        MaxPooling1D(pool_size=2),
        #flatten layer
        Flatten(),
        Dense(10,activation='relu'),
        #output layer
        Dense(1,activation='sigmoid'),
    ]
    for layer in layers:
        net.add(layer)

    #compile the network
    net.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    #summarize the model
    net.summary()
    #plot_model(net,to_file='model.png',show_shapes=True)

    return net

In [24]:
# Load the vocabulary as a set of whitespace-separated entries.
# NOTE(review): this repeats the earlier vocabulary-loading cell with the same
# hard-coded, machine-specific path.
vocab_filename='C:/Users/Admin/OneDrive/NLP_Mphasis_MLA_Training/vocab1.txt'
vocab=load_doc(vocab_filename)
vocab=set(vocab.split())


In [25]:
# Load the training split (is_train=True): cleaned documents plus their 0/1 labels
train_docs,ytrain=load_clean_dataset(vocab,True)


In [26]:
# Fit the tokenizer on the training documents only
tokenizer=create_tokenizer(train_docs)

In [27]:
# Vocabulary size: number of entries in the tokenizer's word_index plus one
# (presumably because index 0 is left for padding - TODO confirm against the Keras docs)
vocab_size=len(tokenizer.word_index)+1
vocab_size

25768

In [28]:
# Longest training document, counted in whitespace-separated tokens;
# passed on below as the padded sequence length

max_length=max([len(s.split()) for s in train_docs])
print('Maximum length:%d'%max_length)


Maximum length:1317


In [29]:
# Integer-encode the training documents and pad them to max_length
xtrain=encode_docs(tokenizer,max_length,train_docs)

In [30]:
# Quick check of the encoded training data's type and shape
type(xtrain),xtrain.shape

(numpy.ndarray, (1800, 1317))

In [31]:
#Train the network
# `model` is rebound below from the builder function to the trained network.
# Keep the builder under its own name first, so this cell can be re-run
# without re-executing the cell that defines the function (a second
# `model(...)` call would otherwise hit the network, not the builder).
import inspect
if inspect.isfunction(model):
    build_model=model
model=build_model(vocab_size,max_length)
model.fit(xtrain,ytrain,epochs=10,verbose=2)
#save the model
model.save('nlp_model_1.h5')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1317, 100)         2576800   
                                                                 
 conv1d (Conv1D)             (None, 1310, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1  (None, 655, 32)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 20960)             0         
                                                                 
 dense (Dense)               (None, 10)                209610    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

  saving_api.save_model(


# Evaluate Model

In [32]:
# Rebuild the train and test splits (is_train=True / False)
train_docs,ytrain=load_clean_dataset(vocab,True)
test_docs,ytest=load_clean_dataset(vocab,False)
# The tokenizer is fitted on the training documents only; the test documents
# are encoded with that same tokenizer below.
tokenizer=create_tokenizer(train_docs)
vocab_size=len(tokenizer.word_index)+1
print('Vocabulary size: ',vocab_size)
# Maximum sequence length, taken from the training documents

max_length=max([len(s.split()) for s in train_docs])
print('Maximum length:%d'%max_length)
# Encode both splits, padded to the training maximum length
xtrain=encode_docs(tokenizer,max_length,train_docs)
xtest=encode_docs(tokenizer,max_length,test_docs)

# Evaluate the in-memory `model` from the training cell; the saved
# 'nlp_model_1.h5' file is not reloaded here.
_,acc=model.evaluate(xtrain,ytrain,verbose=0)
print('train Accuracy:\t',acc)
print()
_,acc=model.evaluate(xtest,ytest,verbose=0)
print('Test Accuracy:\t',acc)

Vocabulary size:  25768
Maximum length:1317
train Accuracy:	 1.0

Test Accuracy:	 0.8550000190734863


# Classify a review as negative or positive


In [35]:
def predict_sentiment(review,vocab,tokenizer,max_length,model):
    """Classify one review and report the score of the chosen label.

    The review is cleaned with ``clean_doc``, encoded with ``encode_docs``
    and passed to ``model.predict``; element ``[0,0]`` of the prediction is
    treated as the positive score.

    Args:
        review: Raw review text.
        vocab: Allowed words, forwarded to ``clean_doc``.
        tokenizer: Forwarded to ``encode_docs``.
        max_length: Forwarded to ``encode_docs``.
        model: Object providing ``predict``.

    Returns:
        ``(score, 'Positive')`` when the positive score does not round to 0,
        otherwise ``(1-score, 'Negative')``.
    """
    cleaned=clean_doc(review,vocab)
    encoded=encode_docs(tokenizer,max_length,[cleaned])

    # a single document was submitted, so read the single score
    prob_positive=model.predict(encoded,verbose=0)[0,0]
    if round(prob_positive)!=0:
        return prob_positive,'Positive'
    return (1-prob_positive),'Negative'

In [36]:
# Try a positive-sounding review.
# NOTE(review): clean_doc splits on whitespace before stripping punctuation, so
# 'flim.I' and 'it,recommecnded' each collapse into one fused token; together
# with the misspellings, those tokens are unlikely to be in the vocabulary -
# TODO confirm whether this explains the recorded 'Negative' output.
text='Everyone will enjoy the flim.I love it,recommecnded'
predict_sentiment(text,vocab,tokenizer,max_length=max_length,model=model)

(0.729116290807724, 'Negative')

In [37]:
# Try a short negative phrase
text='not good'
predict_sentiment(text,vocab,tokenizer,max_length=max_length,model=model)

(0.7327710092067719, 'Negative')