<a href="https://colab.research.google.com/github/Sinha-Abhinav-13/imdb_aiml/blob/rcnn/imdb_rcnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# File line numbers left out of the load (pandas treats integers in `skiprows`
# as 0-indexed line numbers).
# NOTE(review): presumably these are malformed records in this copy of the
# CSV — confirm against the raw file before reusing the list elsewhere.
rows_to_skip = [
    2418, 5505, 7089, 8684, 11039, 14210, 15011, 17401, 18200, 19792,
    20560, 21344, 22151, 22913, 23711, 24515, 26079, 26908, 27704, 28507,
    29335, 30097, 30877, 31642, 32426, 34067, 34832, 36434, 37224, 38010,
    39625, 42025, 42780, 44315, 45096, 46680, 47479, 49083,
]

# Read the review/sentiment table from the notebook's working directory.
data = pd.read_csv(
    "/content/IMDB_Dataset.csv",
    quotechar='"',
    skiprows=rows_to_skip,
)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
import tensorflow as tf

def preprocess_text(text):
    """Normalise a raw review into lowercase, letters-and-whitespace-only text.

    Steps, in order:
      1. Replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a
         single space so the words on either side do not get glued together.
      2. Delete every character that is not an ASCII letter or whitespace.
         This removes punctuation, but also digits and non-ASCII characters.
      3. Lowercase what is left.

    Args:
        text: A string (or string tensor) accepted by ``tf.strings`` ops.

    Returns:
        The cleaned text as returned by the ``tf.strings`` ops (a string
        tensor); call ``.numpy().decode('utf-8')`` to get a Python ``str``.
    """
    # Raw string literals keep the regex backslashes intact without relying on
    # Python's handling of the invalid "\s" escape sequence.
    text = tf.strings.regex_replace(text, r"<br\s*/?>", " ")
    text = tf.strings.regex_replace(text, r"[^a-zA-Z\s]", "")
    text = tf.strings.lower(text)
    return text

def _clean_review(raw_review):
    """Clean one review with preprocess_text and return it as a Python str."""
    cleaned_tensor = preprocess_text(raw_review)
    return cleaned_tensor.numpy().decode('utf-8')


# Overwrite the review column with its cleaned form, one row at a time.
data['review'] = data['review'].apply(_clean_review)

# Quick look at the cleaned rows.
print(data.head())


                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production   the filming te...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically theres a family where a little boy j...  negative
4  petter matteis love in the time of money is a ...  positive


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every review is encoded to exactly this many token ids.
max_length = 256

# Learn the word index from the review text.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# data is divided into train/validation/test — confirm this is acceptable.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['review'])

# Encode each review as a list of integer ids, then pad/truncate at the end
# so that all rows share the same length.
sequences = tokenizer.texts_to_sequences(data['review'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Binary targets: 1 for 'positive', 0 for anything else.
labels = data['sentiment'].eq('positive').astype('int64').to_numpy()

print(padded_sequences[0])


[  27    4    1   75 1928   44 1054   11  100  143   39 3241  386  467
   25 3101   33   22  201   13   10    6  598   47  575   14   68    1
   87  145   11 3218   68   41 3241   12   28 5347    2  132    4  572
   60  283    7  201   34    1  665  137 1688   68   10    6   20    3
  117   15    1 8112 5593   38   10  117 2484   54 5868   14 5322    5
 1435  381   38  572   28    6 3339    7    1  349  345    4    1  665
    8    6  471 3241   13   11    6    1  344    5    1 6675 2496 1051
    8 2624 1366   19  525   32 4678 2477    4    1 1178  113   30    1
 6830   24 2919    2  398   36    6   20  317   19    1 4851 3521  525
    6  338    5 8163 5040 7559 2447    2  322 9021 7223    2 8485   22
  108  226  238    9   56  129    1  273 1280    4    1  117    6  667
    5    1  188   11    8  264  113   75  257  550 2967  809  179 1241
 4266   15 2469 1086  809 1384  809  147  935  181    1   87  386    9
  119  203 3218   68   13   36 1567    8   12 2179    9  397  129    9
   12 

In [5]:
from sklearn.model_selection import train_test_split

# Two-stage hold-out split. Both stages use the same fraction and seed.
holdout_fraction = 0.2
split_seed = 42

# Stage 1: set aside the test set.
trainval_sequences, test_sequences, trainval_labels, test_labels = train_test_split(
    padded_sequences,
    labels,
    test_size=holdout_fraction,
    random_state=split_seed,
)

# Stage 2: carve the validation set out of what remains.
train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    trainval_sequences,
    trainval_labels,
    test_size=holdout_fraction,
    random_state=split_seed,
)

print(f"Training samples: {len(train_sequences)}")
print(f"Validation samples: {len(val_sequences)}")
print(f"Testing samples: {len(test_sequences)}")


Training samples: 31975
Validation samples: 7994
Testing samples: 9993


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout

# Layer stack for the RCNN: embedding -> 1D convolution + pooling -> two
# stacked LSTMs (each followed by dropout) -> single sigmoid output unit.
rcnn_layers = [
    Embedding(input_dim=10000, output_dim=64),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    # The first LSTM emits the full sequence so the second LSTM can consume it.
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
model = Sequential(rcnn_layers)

# Binary classification setup.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Build with the padded sequence length so summary() can report shapes.
model.build(input_shape=(None, max_length))

model.summary()


In [7]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once validation loss has gone 3 epochs without improving, and roll the
# model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Write the model to disk only when validation loss improves.
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_loss',
    save_best_only=True,
)

training_callbacks = [early_stopping, model_checkpoint]

# Fit on the training split, scoring the validation split after each epoch.
# `epochs` is an upper bound; early stopping may end the run sooner.
history = model.fit(
    train_sequences,
    train_labels,
    epochs=10,
    validation_data=(val_sequences, val_labels),
    batch_size=64,
    callbacks=training_callbacks,
)


Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 389ms/step - accuracy: 0.5188 - loss: 0.6918 - val_accuracy: 0.4965 - val_loss: 0.6929
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 388ms/step - accuracy: 0.4961 - loss: 0.6935 - val_accuracy: 0.5239 - val_loss: 0.6923
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 388ms/step - accuracy: 0.5271 - loss: 0.6845 - val_accuracy: 0.7686 - val_loss: 0.5194
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 391ms/step - accuracy: 0.8551 - loss: 0.3631 - val_accuracy: 0.8854 - val_loss: 0.2788
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 389ms/step - accuracy: 0.9188 - loss: 0.2173 - val_accuracy: 0.8803 - val_loss: 0.2774
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 391ms/step - accuracy: 0.9469 - loss: 0.1561 - val_accuracy: 0.8820 - val_loss: 0.3026
Epoc

In [8]:
# Score the held-out test split; the result unpacks into the loss and the
# accuracy metric the model was compiled with.
test_scores = model.evaluate(test_sequences, test_labels)
loss, accuracy = test_scores
print(f'Test Accuracy: {accuracy:.2f}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 56ms/step - accuracy: 0.8832 - loss: 0.2809
Test Accuracy: 0.89


In [9]:
# Make predictions on reviews the model has never seen.
new_reviews = ["The movie was fantastic!", "I did not like the movie at all."]

# Clean the text exactly as the training reviews were cleaned, so the
# tokenizer sees the same kind of input it was fitted on.
cleaned_reviews = [
    preprocess_text(review).numpy().decode('utf-8') for review in new_reviews
]
new_sequences = tokenizer.texts_to_sequences(cleaned_reviews)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')

predictions = model.predict(new_padded)

# Convert probabilities to labels. The result gets its own name so the
# training `labels` array is not overwritten by this cell.
# Each prediction row holds a single probability, hence the [0] index.
predicted_labels = ['Positive' if pred[0] >= 0.5 else 'Negative' for pred in predictions]
for review, pred, label in zip(new_reviews, predictions, predicted_labels):
    print(f"Review: '{review}'\nPredicted Sentiment: {label} (Probability: {pred[0]:.2f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step
Review: 'The movie was fantastic!'
Predicted Sentiment: Positive (Probability: 0.80)

Review: 'I did not like the movie at all.'
Predicted Sentiment: Negaive (Probability: 0.27)



In [10]:
from sklearn.metrics import classification_report
# Predicted probabilities for the test split, thresholded at 0.5 into 0/1
# class ids before computing per-class precision/recall/F1.
y_pred = model.predict(test_sequences)
y_pred_labels = (y_pred >= 0.5).astype(int)
print(classification_report(test_labels, y_pred_labels))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 56ms/step
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      5020
           1       0.90      0.87      0.89      4973

    accuracy                           0.89      9993
   macro avg       0.89      0.89      0.89      9993
weighted avg       0.89      0.89      0.89      9993

