In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split

In [5]:
# Load dataset
file_path = "/kaggle/input/jan-movie/movie.csv"
df = pd.read_csv(file_path)

In [6]:
# Prepare text and labels
texts = df["text"].astype(str).values
labels = df["label"].values

In [7]:
# Tokenization
max_words = 20000  # Vocabulary size
max_len = 200      # Max sequence length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = np.array(labels)

In [8]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (32000, 200)
X_test shape: (8000, 200)
y_train shape: (32000,)
y_test shape: (8000,)


In [10]:
print("Sample training text sequence:", X_train[0])
print("Corresponding training label:", y_train[0])


Sample training text sequence: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0    11   288    10   234   317     3   174    12  1875   198
    87    72    11   428    10    47     7   138    42    12    17    52
     2    17    14   216   122    11    14    42     6  1463    11    59
  2238   379  1623   484     6    58   353    45   398   278   310    11
  1024    18   144    36   132    13    48   514     5     2    17    80
    23   162    53   792    16   474   197   281   103    24   462  1038
     3     9   396  1722  3080   262  2142     7    79     4   541     5
  3080    11  1701  

In [11]:
# Build RNN-CNN Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),
    Conv1D(64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [12]:
# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
# Train model RNN_CNN
epochs = 5
batch_size = 32
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)

Epoch 1/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 97ms/step - accuracy: 0.7345 - loss: 0.4914 - val_accuracy: 0.8733 - val_loss: 0.3016
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 96ms/step - accuracy: 0.9252 - loss: 0.1997 - val_accuracy: 0.8838 - val_loss: 0.2915
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 96ms/step - accuracy: 0.9619 - loss: 0.1172 - val_accuracy: 0.8799 - val_loss: 0.3209
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 96ms/step - accuracy: 0.9777 - loss: 0.0656 - val_accuracy: 0.8791 - val_loss: 0.4106
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 97ms/step - accuracy: 0.9898 - loss: 0.0336 - val_accuracy: 0.8649 - val_loss: 0.7401


<keras.src.callbacks.history.History at 0x7ef97fd33910>

In [17]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.8663 - loss: 0.7260
Test Loss: 0.7401
Test Accuracy: 0.8649


In [22]:
y_pred_probs = model.predict(X_test)



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step
