In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import string
import re
import nltk
import pandas as pd
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, MaxPool1D, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
## nltk.download('stopwords')




In [3]:
### Load doc into memory
def load_doc(filename, encoding=None):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, exactly as before.

    Returns:
        The complete text of the file.
    """
    ## the context manager closes the file even if read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [4]:
## Load one negative review as a sample to inspect the raw text.
## NOTE(review): the absolute path below is machine-specific; the notebook
## only runs as-is on a machine where this exact directory exists.
text = load_doc(r"C:\Users\Administrator.DAI-PC2\Downloads\Shubham\NLP\Day8\review_polarity-20240611T075242Z-001\review_polarity\txt_sentoken\neg\cv000_29416.txt")
text

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [5]:
## Turn a doc into clean tokens
def clean_doc(doc):
    """Turn a raw document string into a list of cleaned word tokens.

    The document is split on whitespace and every punctuation character
    (``string.punctuation``) is stripped from each token. A token is kept
    only if it is purely alphabetic, longer than one character and not an
    English stopword.

    Stopwords are matched case-insensitively, so capitalised stopwords such
    as "This" are removed as well. Matching happens after punctuation has
    been stripped, so contractions like "didn't" become "didnt" and are kept.

    Args:
        doc: The raw text of one document.

    Returns:
        A list of the surviving tokens, in document order and with their
        original casing.
    """
    ## prepare regex for character filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    ## the NLTK English stopword list is lowercase, so compare on w.lower()
    swords = set(stopwords.words('english'))
    ## remove punctuation from each whitespace-separated token
    stripped = (re_punc.sub("", w) for w in doc.split())
    ## keep alphabetic, non-stopword tokens longer than one character
    return [
        w for w in stripped
        if w.isalpha() and len(w) > 1 and w.lower() not in swords
    ]

In [6]:
## Preview the cleaned tokens of the sample review loaded above
clean_doc(text)

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'whats',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mindfuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'didnt',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'whats',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',

In [8]:
## Using only 1 function for train and test
def process_docs(directory, is_train):
    """Load and clean every review of one split found in ``directory``.

    Files whose name starts with 'cv9' form the test split; all other files
    form the training split.

    Args:
        directory: Folder containing the review files.
        is_train: Truthy to process the training files, falsy to process
            the 'cv9*' test files.

    Returns:
        A list with one token list (as produced by ``clean_doc``) per
        selected file, in the order ``listdir`` returned the files.
    """
    wants_train = bool(is_train)
    ## a file belongs to the requested split when "is a cv9 file" and
    ## "training requested" disagree
    selected = [
        name for name in listdir(directory)
        if name.startswith('cv9') != wants_train
    ]
    ## create the full path for each file, then load and clean it
    return [clean_doc(load_doc(directory + '/' + name)) for name in selected]

In [9]:
## load and clean a dataset
def load_clean_dataset(is_train, data_dir=None):
    """Load and clean the negative and positive reviews of one split.

    Args:
        is_train: Passed through to ``process_docs``; truthy selects the
            training files, falsy selects the 'cv9*' test files.
        data_dir: Folder that contains the ``neg`` and ``pos`` sub-folders.
            ``None`` (the default) falls back to the original hard-coded
            dataset location, so existing calls keep working.

    Returns:
        A ``(docs, labels)`` tuple. ``docs`` holds the negative documents
        followed by the positive ones; ``labels`` holds 0 for each negative
        document and 1 for each positive document, in the same order.
    """
    import os  # local import: the notebook's import cell only brings in os.listdir

    if data_dir is None:
        data_dir = r"C:\Users\Administrator.DAI-PC2\Downloads\Shubham\NLP\Day8\review_polarity-20240611T075242Z-001\review_polarity\txt_sentoken"
    ## load documents
    neg = process_docs(os.path.join(data_dir, 'neg'), is_train)
    pos = process_docs(os.path.join(data_dir, 'pos'), is_train)
    docs = neg + pos
    ## prepare labels: 0 = negative, 1 = positive
    labels = [0] * len(neg) + [1] * len(pos)
    return docs, labels

In [10]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given documents.

    Args:
        lines: The documents handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [11]:
def encode_docs(tokenizer, max_length, docs):
    """Integer-encode documents and bring them to a common length.

    Args:
        tokenizer: A fitted tokenizer providing ``texts_to_sequences``.
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.
        docs: The documents to encode.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        ('post') of each sequence.
    """
    ## integer encoding followed by post-padding to max_length
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

## Build Neural Network

In [12]:
def define_model(vocab_size, max_length):
    """Build and compile the CNN used for binary sentiment classification.

    Layer stack: Embedding (100-dimensional) -> Conv1D (32 filters, kernel
    size 8, ReLU) -> MaxPool1D (pool size 2) -> Flatten -> Dense(10, ReLU)
    -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows of the embedding matrix.
        max_length: Length of each input sequence (``input_length`` of the
            embedding layer).

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPool1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    ## metrics is given as a list, the form documented for Model.compile;
    ## newer Keras releases reject a bare string here
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [54]:
## Load both splits; the 'cv9*' files are held out as the test split
train_docs, y_train = load_clean_dataset(True)
test_docs, y_test = load_clean_dataset(False)
print(len(train_docs), len(test_docs))

## Fit the vocabulary on the training documents only
tokenizer = create_tokenizer(train_docs)

## one more than the number of known words
## (presumably because index 0 is reserved for padding -- confirm)
vocab_size = 1 + len(tokenizer.word_index)
print("Vocab size :", vocab_size)

## the longest training document defines the padded sequence length
max_length = max(map(len, train_docs))
print("max length :", max_length)

## encode both splits with the same tokenizer and length
x_train, x_test = (
    encode_docs(tokenizer, max_length, split)
    for split in (train_docs, test_docs)
)

x_train.shape, x_test.shape

1810 200
Vocab size : 44277
max length : 1380


((1810, 1380), (200, 1380))

In [14]:
## Build the CNN for the vocabulary size and sequence length computed above
model = define_model(vocab_size, max_length)

## Show the layer stack with output shapes and parameter counts
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1380, 100)         4427700   
                                                                 
 conv1d (Conv1D)             (None, 1373, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1  (None, 686, 32)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 21952)             0         
                                                                 
 dense (Dense)               (None, 10)                219530    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                     

In [15]:
## Fit model on the training split (labels are converted to a NumPy array)
model.fit(x_train, np.array(y_train), epochs=10, batch_size =10)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2ce7fea6890>

In [36]:
## Evaluate on the held-out test split; the result is [loss, accuracy]
model.evaluate(x_test, np.array(y_test), batch_size=1)



[0.4763064682483673, 0.8349999785423279]

## Predict on unseen data

In [17]:
## Two hand-written reviews (one positive, one negative) used to try the model
text1 = 'Best movie ever! It was great, I will definitely recommend it.'
text2 = 'This is a bad movie. Please dont watch it.'

In [49]:
def prediction(tokenizer,data):
    """Classify one raw review text and print the predicted sentiment.

    The text is cleaned with ``clean_doc``, encoded with ``encode_docs`` and
    scored by the model. A score above 0.5 is printed as POSITIVE together
    with the score; otherwise NEGATIVE is printed together with one minus
    the score.

    Note: this function reads the notebook-level globals ``model`` and
    ``max_length``, so the cells defining them must have been run first.

    Args:
        tokenizer: The fitted tokenizer used to encode the text.
        data: The raw review text.

    Returns:
        None. The result is printed.
    """
    ## clean and encode the single document as a batch of one
    tokens = clean_doc(data)
    x_valid = encode_docs(tokenizer, max_length, [tokens])
    ## the first (and only) output of the first (and only) sample
    score = model.predict(x_valid)[0][0]
    if score > 0.5:
        label, confidence = 'POSITIVE : ', score
    else:
        label, confidence = 'NEGATIVE : ', 1 - score
    print(label, confidence)

In [50]:
## Classify the first sample sentence (text1)
prediction(tokenizer,text1)

POSITIVE :  0.58034384


In [51]:
## Classify the second sample sentence (text2)
prediction(tokenizer,text2)

NEGATIVE :  0.5416746139526367
