In [93]:
import csv
import pandas as pd
import numpy as np
import os
import tensorflow as tf

In [94]:
def read_reviews_from_file(file_path):
    with open(file_path, 'r') as file:
        reviews = file.readlines()
    return [line.strip().split("\t") for line in reviews]

**Fetch all the reviews from these three files and add it in one list named reviews.**

In [95]:
current_directory = os.getcwd()

imdb_review = os.path.join(current_directory, "imdb_labelled.txt")
yelp_review = os.path.join(current_directory, "yelp_labelled.txt")
amazon_review = os.path.join(current_directory, "amazon_cells_labelled.txt")

reviews = []
reviews.extend(read_reviews_from_file(imdb_review))
reviews.extend(read_reviews_from_file(yelp_review))
reviews.extend(read_reviews_from_file(amazon_review))

In [96]:
csv_file = "reviews.csv"
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Review", "Label"])
    writer.writerows(reviews)

print("Reviews written to : ",csv_file)

Reviews written to :  reviews.csv


In [116]:
reviews = pd.read_csv(csv_file)
reviews

Unnamed: 0,Review,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
2995,The screen does get smudged easily because it ...,0
2996,What a piece of junk.. I lose more calls on th...,0
2997,Item Does Not Match Picture.,0
2998,The only thing that disappoint me is the infra...,0


In [117]:
total_row = reviews.shape[0]
count_null = reviews['Label'].isnull().sum()
filled_rows = total_row - count_null
if total_row==filled_rows:
    print("No Null Values")
else:
    print("Null Values!!! Kindly perform some preprocessing")

No Null Values


In [109]:
from tensorflow.keras.layers import SimpleRNN,Dense,Flatten
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences

1.   Tokenization
2.   Convert text to int
3.   Generate sequence of integer for each sentences
4.   check the index of each word

In [118]:
tokenizer = Tokenizer(oov_token="out_of_vocabulary")

In [119]:
tokenizer.fit_on_texts(reviews['Review'])
reviews['Review'] = tokenizer.texts_to_sequences(reviews['Review'])
tokenizer.word_index

{'out_of_vocabulary': 1,
 'the': 2,
 'and': 3,
 'i': 4,
 'a': 5,
 'is': 6,
 'it': 7,
 'to': 8,
 'this': 9,
 'of': 10,
 'was': 11,
 'in': 12,
 'for': 13,
 'not': 14,
 'that': 15,
 'with': 16,
 'my': 17,
 'very': 18,
 'good': 19,
 'on': 20,
 'great': 21,
 'you': 22,
 'but': 23,
 'have': 24,
 'movie': 25,
 'are': 26,
 'as': 27,
 'so': 28,
 'phone': 29,
 'film': 30,
 'be': 31,
 'all': 32,
 'one': 33,
 'had': 34,
 'at': 35,
 'food': 36,
 'like': 37,
 'just': 38,
 "it's": 39,
 'place': 40,
 'time': 41,
 'service': 42,
 'an': 43,
 'were': 44,
 'if': 45,
 'from': 46,
 'really': 47,
 'bad': 48,
 'there': 49,
 'they': 50,
 'we': 51,
 'well': 52,
 'out': 53,
 'has': 54,
 'about': 55,
 'would': 56,
 'or': 57,
 'no': 58,
 'your': 59,
 'only': 60,
 'best': 61,
 "don't": 62,
 'by': 63,
 'even': 64,
 'ever': 65,
 'here': 66,
 'up': 67,
 'also': 68,
 'will': 69,
 'back': 70,
 'when': 71,
 'me': 72,
 'more': 73,
 'than': 74,
 'quality': 75,
 'go': 76,
 'what': 77,
 'love': 78,
 'he': 79,
 'can': 80,
 "i

In [120]:
reviews.head()

Unnamed: 0,Review,Label
0,"[5, 18, 18, 18, 231, 747, 2268, 25, 55, 5, 226...",0
1,"[14, 290, 106, 11, 73, 477, 2, 748, 137, 57, 2...",0
2,"[2271, 2272, 16, 278, 349, 3, 654, 201, 1488, ...",0
3,"[18, 139, 242, 57, 303, 8, 1094, 10]",0
4,"[2, 61, 304, 12, 2, 25, 11, 71, 2274, 6, 479, ...",1


In [57]:
train_set, test_set = train_test_split(reviews,random_state=42,test_size=0.2)

In [58]:
X_train,Y_train = train_set['Review'],train_set['Label']
X_test,Y_test = test_set['Review'],test_set['Label']

In [59]:
max_len_review = max(len(review) for review in X_train)
min_len_review = min(len(review) for review in X_train)

print("Maximum length of review:", max_len_review)
print("Minimum length of review:", min_len_review)

Maximum length of review: 73
Minimum length of review: 1


In [78]:
max_words = 60
X_train = pad_sequences(X_train,padding='post', maxlen=max_words)
X_test = pad_sequences(X_test,padding='post', maxlen=max_words)

In [86]:
X_valid, Y_valid = X_train[:256], Y_train[:256]
X_train_, Y_train_ = X_train[256:], Y_train[256:]

In [84]:
model = Sequential()
model.add(SimpleRNN(32,input_shape=(60,1),activation="tanh",return_sequences=False))
model.add(Dense(1,activation="sigmoid"))
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_5 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [87]:
model.compile( loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train,batch_size=64,epochs=5,verbose=1,validation_data=(X_valid, Y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bbc08c05cf0>

In [88]:
model.evaluate(X_test, Y_test, verbose=0)

[0.6923514008522034, 0.49666666984558105]