In [54]:

import tensorflow as tf

import tensorflow.keras as keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [55]:
df_reviews=pd.read_csv("Dataset/IMDB-Dataset.csv")


In [56]:
df_reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [57]:
train_data, test_data, train_labels, test_labels = train_test_split(df_reviews['review'], df_reviews['sentiment'], test_size=0.2, random_state=42)

In [58]:
train_data

39087    That's what I kept asking myself during the ma...
30893    I did not watch the entire movie. I could not ...
45278    A touching love story reminiscent of In the M...
16398    This latter-day Fulci schlocker is a totally a...
13653    First of all, I firmly believe that Norwegian ...
                               ...                        
11284    `Shadow Magic' recaptures the joy and amazemen...
44732    I found this movie to be quite enjoyable and f...
38158    Avoid this one! It is a terrible movie. So wha...
860      This production was quite a surprise for me. I...
15795    This is a decent movie. Although little bit sh...
Name: review, Length: 40000, dtype: object

In [65]:
word_index=pd.read_csv("assets/word_indexes.csv")

In [66]:
word_index.head()

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


In [67]:
word_index=dict(zip(word_index.Words,word_index.Indexes))

In [68]:
word_index["<PAD>"]=0
word_index["<START>"]=1
word_index["<UNK>"]=2
word_index["<UNUSED>"]=3

In [69]:
def review_encoder(text):
  arr=[word_index[word] if word in word_index else word_index[word.lower()] if word.lower() in word_index else 2 for word in text]
  return arr

In [70]:
train_data=train_data.apply(lambda review:review.split())
test_data=test_data.apply(lambda review:review.split())

In [35]:
#train_data,train_labels=imdb_reviews['review'],imdb_reviews['sentiment']
#test_data, test_labels=test_reviews['review'],test_reviews['sentiment']

In [71]:
train_data=train_data.apply(review_encoder)
test_data=test_data.apply(review_encoder)

In [72]:
train_data.head()

39087    [198, 51, 13, 828, 2254, 546, 315, 4, 111, 2, ...
30893    [13, 122, 24, 106, 4, 436, 2, 13, 100, 24, 106...
45278    [6, 1301, 119, 65, 2781, 7, 2, 4, 1310, 18, 2,...
16398    [14, 2, 4959, 2, 9, 6, 484, 4366, 13256, 1951,...
13653    [86, 7, 2, 13, 5995, 264, 15, 9495, 102, 26, 5...
Name: review, dtype: object

In [73]:
def encode_sentiments(x):
  if x=='positive':
    return 1
  else:
    return 0

In [74]:
train_labels=train_labels.apply(encode_sentiments)
test_labels=test_labels.apply(encode_sentiments)

In [75]:
test_labels

33553    1
9427     1
199      0
12447    1
39489    0
        ..
28567    0
25079    1
18707    1
15200    0
5857     1
Name: sentiment, Length: 10000, dtype: int64

In [76]:
train_data=keras.preprocessing.sequence.pad_sequences(train_data,value=word_index["<PAD>"],padding='post',maxlen=500)
test_data=keras.preprocessing.sequence.pad_sequences(test_data,value=word_index["<PAD>"],padding='post',maxlen=500)

In [79]:
model=keras.Sequential([keras.layers.Embedding(100000,16,input_length=500),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(16,activation='relu'),
                        keras.layers.Dropout(0.5),
                        keras.layers.Dense(1,activation='sigmoid')])

In [80]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [81]:
#training the model
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
history = model.fit(train_data, train_labels, epochs=20, batch_size=512, validation_data=(test_data, test_labels), callbacks=[early_stopping])

Epoch 1/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5223 - loss: 0.6923 - val_accuracy: 0.5950 - val_loss: 0.6873
Epoch 2/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5793 - loss: 0.6840 - val_accuracy: 0.6349 - val_loss: 0.6630
Epoch 3/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6318 - loss: 0.6556 - val_accuracy: 0.7648 - val_loss: 0.6055
Epoch 4/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7139 - loss: 0.5974 - val_accuracy: 0.7715 - val_loss: 0.5377
Epoch 5/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7595 - loss: 0.5366 - val_accuracy: 0.8219 - val_loss: 0.4757
Epoch 6/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7884 - loss: 0.4837 - val_accuracy: 0.8405 - val_loss: 0.4262
Epoch 7/20
[1m79/79[0m [32m━━━━

In [82]:
loss,accuracy=model.evaluate(test_data,test_labels)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397us/step - accuracy: 0.8767 - loss: 0.2960


In [83]:
loss,accuracy=model.evaluate(train_data,train_labels)

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 358us/step - accuracy: 0.9178 - loss: 0.2198


In [52]:
index=np.random.randint(1,1000)
user_review=test_reviews.loc[index]
user_review["review"] = "the product is bad."
print(user_review)

review       bad bad bad.
sentiment               0
Name: 138, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_review["review"] = "bad bad bad."


In [75]:
user_review=review_encoder(text)
user_review=np.array([user_review])
user_review = keras.preprocessing.sequence.pad_sequences(user_review,value=0,padding='post',maxlen=500)
if (model.predict(user_review)>0.5).astype("int32"):
  print("positive sentiment")
else:
  print("negative sentiment")

out = model.predict(user_review)
print(out)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
negative sentiment
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[[0.38127437]]


In [84]:
def predict_sentiment_custom(text, model, max_length):
    # Encode the text using the custom encoding function
    encoded_text = review_encoder(text.split())
    # Pad the sequence
    padded_sequence = keras.preprocessing.sequence.pad_sequences([encoded_text], maxlen=max_length, padding='post')
    # Make prediction
    prediction = model.predict(padded_sequence)
    # Convert prediction to human-readable sentiment
    sentiment = "Positive" if prediction[0] >= 0.5 else "Negative"
    return sentiment, prediction[0][0]



In [48]:
def convert_to_scale(score):
    return score * 5

In [85]:
# Example text

text = "the product was really good"

# Predict sentiment for the example text
sentiment,probability = predict_sentiment_custom(text, model, max_length=500)
print("Sentiment:", sentiment)
print("Probability:", probability)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Sentiment: Positive
Probability: 0.67682564


In [51]:
# Convert the score to a scale of 0 to 5
scaled_score = convert_to_scale(probability)
print("Scaled score out of 5:", scaled_score)


Scaled score out of 5: 1.7421455681324005


In [164]:
#model.save("DL Model.h5")
model.save('DL_Model.keras')

In [89]:
from sklearn.metrics import classification_report, accuracy_score

In [91]:
# Make predictions on the test data
predictions = model.predict(test_data)
# Convert predictions to binary values (0 or 1) based on threshold 0.5
predictions = (predictions > 0.5).astype("int32")

# Generate classification report
accuracy_NN = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print(report)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341us/step
              precision    recall  f1-score   support

           0       0.92      0.83      0.87      4961
           1       0.84      0.93      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

