Date : 15/12/2023

## ANN for Sentiment Analysis

<img src="" alt="">

In [47]:
### --------------------
### Importing Libraries
### --------------------

import string
import re


from os import listdir


from nltk.corpus import stopwords
import nltk


from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense


import numpy as np

In [3]:
# Fetch the NLTK stop-word corpus; clean_doc reads it via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Defining the function to read files one by one

In [4]:
# load doc into memory

def load_doc(filename):
    """Read an entire text file and return its contents as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    str
        The full text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even when read() raises,
    # which the manual open()/close() pair did not.
    with open(filename, 'r') as file:
        return file.read()

In [8]:
# Testing the 'load_doc' function

# Read one sample positive review from the dataset directory.
# NOTE(review): this path has a single 'review_polarity/' component, whereas the
# process_docs / load_clean_dataset cells use 'review_polarity/review_polarity/...'
# - confirm which directory layout is the right one.

text = load_doc('review_polarity/txt_sentoken/pos/cv026_29325.txt')

text



In [11]:
#Turn a doc into clean token
def clean_doc(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace. Every token has its ASCII punctuation
    characters removed and is then kept only if it is purely alphabetic, is
    not in NLTK's English stop-word list, and is longer than one character.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Character class matching any single character from string.punctuation
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw_token in doc.split():
        word = punct_pattern.sub('', raw_token)
        # Drop anything that is not purely alphabetic once punctuation is gone
        if not word.isalpha():
            continue
        # Drop stop words (compared exactly as written, no case folding)
        if word in english_stops:
            continue
        # Drop single-character leftovers
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

In [14]:
# Clean the sample review held in `text` and show how many tokens survive the filters
clean=clean_doc(text)
len(clean)

261

In [19]:
# load doc, clean and return line of tokens
def doc_to_line(filename,vocab):
    """Load a document, clean it, and return its in-vocabulary tokens as one line.

    Parameters
    ----------
    filename : str
        Path of the document to load.
    vocab : collection of str
        Allowed tokens. Tokens not contained in it are dropped.

    Returns
    -------
    str
        The retained tokens joined by single spaces (empty string if none).
    """
    # Load the document and reduce it to cleaned tokens
    doc=load_doc(filename)
    tokens=clean_doc(doc)
    # A list/tuple vocabulary costs a linear scan per token; hash it once so
    # each membership test is constant time. Other containers are used as given.
    if isinstance(vocab,(list,tuple)):
        vocab=frozenset(vocab)
    return ' '.join(w for w in tokens if w in vocab)

In [20]:
# Read the whitespace-separated vocabulary into a list of words.
# The context manager closes the file handle, and `vocab` is only ever bound
# to the word list (never to the file object).
with open('vocab.txt') as vocab_file:
    vocab=vocab_file.read().split()

In [21]:
# Clean the sample review and keep only the tokens found in vocab, joined into one string
doc_to_line('review_polarity/txt_sentoken/pos/cv026_29325.txt',vocab)



In [23]:
# Separating training and test data
def process_train(directory,vocab):
    """Load and clean every training document found in ``directory``.

    A file belongs to the training split when its name does not start with
    'cv9'.

    Parameters
    ----------
    directory : str
        Folder containing the review files.
    vocab : object
        Accepted but not used by this function.

    Returns
    -------
    list of list of str
        One cleaned token list per training file.
    """
    training_files=[name for name in listdir(directory)
                    if not name.startswith('cv9')]
    # Build each path, read the file and reduce it to cleaned tokens
    return [clean_doc(load_doc(directory + '/'+name)) for name in training_files]
    
    
# Test data
def process_test(directory,vocab):
    """Load and clean every test document found in ``directory``.

    A file belongs to the test split when its name starts with 'cv9'.

    Parameters
    ----------
    directory : str
        Folder containing the review files.
    vocab : object
        Accepted but not used by this function.

    Returns
    -------
    list of list of str
        One cleaned token list per test file.
    """
    cleaned_reviews=[]
    for name in listdir(directory):
        # Anything outside the 'cv9' prefix is training data, so skip it
        if not name.startswith('cv9'):
            continue
        raw_text=load_doc(directory + '/'+name)
        cleaned_reviews.append(clean_doc(raw_text))
    return cleaned_reviews

### Function to read all the docs

# load all docs in a directory

def process_docs(directory, vocab, is_train):
    """Load and clean the documents of one split from ``directory``.

    Files whose names start with 'cv9' form the test split; all other files
    form the training split.

    NOTE(review): a second definition of process_docs with the same logic
    appears later in this notebook; if both are run, the later one is the
    definition in effect.

    Parameters
    ----------
    directory : str
        Folder containing the review files.
    vocab : object
        Accepted but not used by this function.
    is_train : bool
        Truthy to select the training split, falsy to select the test split.

    Returns
    -------
    list of list of str
        One cleaned token list per selected file.
    """
    documents = []

    for filename in listdir(directory):
        held_out = filename.startswith('cv9')

        # Skip held-out files when training and non-held-out files when testing
        if bool(is_train) == held_out:
            continue

        # create the full path of the file to open, then load and clean it
        path = directory + '/' + filename
        documents.append(clean_doc(load_doc(path)))

    return documents

In [36]:
def process_docs(directory,vocab,is_train):
    """Return the cleaned token lists for one split of the files in ``directory``.

    Files named 'cv9...' are the test split; every other file is training
    data. ``is_train`` chooses which of the two groups is loaded.

    NOTE(review): ``vocab`` is accepted but never used here, so the returned
    tokens are not restricted to the vocabulary - confirm whether that is
    intended.

    Parameters
    ----------
    directory : str
        Folder containing the review files.
    vocab : object
        Unused.
    is_train : bool
        Truthy for the training split, falsy for the test split.

    Returns
    -------
    list of list of str
        One cleaned token list per selected file.
    """
    def in_requested_split(filename):
        # Test files carry the 'cv9' prefix; training is everything else
        is_test_file = filename.startswith('cv9')
        return not is_test_file if is_train else is_test_file

    return [
        clean_doc(load_doc(directory + '/'+filename))
        for filename in listdir(directory)
        if in_requested_split(filename)
    ]

In [37]:
# is_train=False keeps only the files whose names start with 'cv9' (the test split)
lines = process_docs('review_polarity/review_polarity/txt_sentoken/pos', vocab, False)

len(lines)

100

In [42]:
# load and clean a dataset


def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews of one split together with labels.

    Parameters
    ----------
    vocab : object
        Passed through unchanged to process_docs.
    is_train : bool
        Passed through to process_docs to choose the training or test split.

    Returns
    -------
    tuple
        ``(docs, labels)``: negative documents followed by positive ones, and
        a parallel list holding 0 for each negative and 1 for each positive
        document.
    """
    # Negative reviews are loaded first so they come first in docs and labels
    neg = process_docs('review_polarity/review_polarity/txt_sentoken/neg', vocab, is_train)
    pos = process_docs('review_polarity/review_polarity/txt_sentoken/pos', vocab, is_train)

    docs = neg + pos
    labels = [0] * len(neg) + [1] * len(pos)

    return docs, labels

In [43]:
# Build the training split (is_train=True) and the test split (is_train=False)
train, train_labels = load_clean_dataset(vocab, True)
test, test_labels = load_clean_dataset(vocab, False)

In [44]:
# Sanity check: the training documents and their labels should have equal length
len(train), len(train_labels)

(1800, 1800)

In [45]:
# Sanity check: the test documents and their labels should have equal length
len(test), len(test_labels)

(200, 200)

## Build ANN model

In [49]:
# define the model



def define_model(n_words):
    """Build and compile the feed-forward sentiment classifier.

    The network is a Sequential stack of a 50-unit ReLU Dense layer that
    takes inputs of shape ``(n_words,)`` followed by a single sigmoid output
    unit. It is compiled with binary cross-entropy loss, the Adam optimizer
    and the accuracy metric.

    Side effects: calls ``summary()`` on the model and calls ``plot_model``
    with ``to_file='model.png'``.

    Parameters
    ----------
    n_words : int
        Length of each input vector.

    Returns
    -------
    Sequential
        The compiled model.
    """
    # Layers are created in network order: hidden first, then output
    hidden_layer = Dense(50, input_shape = (n_words,), activation = 'relu')
    output_layer = Dense(1, activation = 'sigmoid')

    network = Sequential()
    for layer in (hidden_layer, output_layer):
        network.add(layer)

    network.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    # Report the architecture as text and as a diagram file
    network.summary()
    plot_model(network, to_file = 'model.png', show_shapes = True)

    return network

In [51]:
# Try define_model with a small input width of 100 features; the return value is not kept

define_model(100)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 5101 (19.93 KB)
Trainable params: 5101 (19.93 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


<keras.src.engine.sequential.Sequential at 0x7fa6973015d0>

In [58]:
# fit a tokenizer

def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    Parameters
    ----------
    lines : list
        The documents passed to ``fit_on_texts``.

    Returns
    -------
    Tokenizer
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted



# Create the tokenizer, fitting it on the training documents only

tokenizer = create_tokenizer(train)

In [59]:
# Encode the training documents as a matrix with one row per document,
# using the tokenizer's 'binary' mode

x_train = tokenizer.texts_to_matrix(train,
                                   mode = 'binary')

x_train.shape

(1800, 44277)

In [62]:
# Encode the test documents with the same tokenizer that was fitted on the
# training documents
x_test = tokenizer.texts_to_matrix(test,
                                   mode = 'binary')

x_test.shape

(200, 44277)

In [63]:
# Inspect the tokenizer's word_docs mapping.
# NOTE(review): this displays the whole mapping; consider showing only a small sample.
tokenizer.word_docs

defaultdict(int,
            {'talent': 174,
             'teenage': 75,
             'work': 640,
             'main': 300,
             'host': 29,
             'walking': 70,
             'scenario': 29,
             'sequence': 243,
             'requirements': 9,
             'newly': 24,
             'film': 1553,
             'dad': 60,
             'life': 707,
             'segment': 30,
             'restless': 17,
             'charisma': 40,
             'luck': 53,
             'breakout': 6,
             'drawn': 62,
             'entirely': 102,
             'imogen': 1,
             'impressive': 161,
             'show': 412,
             'doesnt': 734,
             'end': 661,
             'movie': 1372,
             'textbook': 9,
             'along': 398,
             'characters': 900,
             'goofyembarrassing': 1,
             'comedy': 447,
             'radiate': 4,
             'around': 601,
             'arrival': 30,
             'porn': 30,
        

In [64]:
# Define the network: the input width is the number of columns in the encoded training matrix
n_words=x_train.shape[1]
model=define_model(n_words)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 50)                2213900   
                                                                 
 dense_5 (Dense)             (None, 1)                 51        
                                                                 
Total params: 2213951 (8.45 MB)
Trainable params: 2213951 (8.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
# Train for 10 epochs with batches of 10; the label list is converted to an ndarray.
# NOTE(review): no validation data is passed, so x_test / test_labels are not used by this call.
model.fit(x_train,np.array(train_labels),batch_size=10,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fa68869c550>