In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the train and test review splits from the local Dataset/ folder.
train_csv, test_csv = "Dataset/reviews_train.csv", "Dataset/reviews_test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Preview the first five rows to check which columns were loaded.
df_train.head(n=5)

Unnamed: 0,review,label
0,In Panic In The Streets Richard Widmark plays ...,1
1,If you ask me the first one was really better ...,0
2,I am a big fan a Faerie Tale Theatre and I've ...,1
3,I just finished reading a book about Dillinger...,0
4,Greg Davis and Bryan Daly take some crazed sta...,0


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Learn the word index from the training reviews only. num_words=10000 caps the
# vocabulary used by texts_to_sequences to the most frequent words; since no
# oov_token is configured, any other word is simply left out of the sequence.
train_texts = df_train['review']
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

# Encode every training review as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(train_texts)

In [6]:
# Force every review to exactly 10 token ids: shorter reviews get zeros appended
# (padding='post'); longer reviews are cut down by pad_sequences' default
# truncation setting.
padded_sequences = pad_sequences(
    sequences,
    maxlen=10,
    padding='post',
)

# Wrap the matrix in a DataFrame so it can be inspected as a table.
padded_df_train = pd.DataFrame(data=padded_sequences)
print(padded_df_train)

          0     1     2     3     4     5     6    7     8     9
0        46   539   159  5074  4381    81     8    1   389   552
1         8    16    83    28    91    28   191  191   191   161
2         5    29  2078     2    23    29   951    2    52   438
3       506     9    73    57    45    10  1866  329     1   271
4         1   169   746    16     1   169   646    4  2766  4716
...     ...   ...   ...   ...   ...   ...   ...  ...   ...   ...
24995   371    35     1   223   179    97   355   91   317  8217
24996  1712   107   214     8     3  3725   349   17    84   154
24997   179     9  1784     4    16     3   224    4  1119  1226
24998  1680    69    20    11    39   103    30  126   203  2956
24999    12  2479  1219    77  1273    20     5   64    37  4913

[25000 rows x 10 columns]


In [7]:
# Encode the test reviews with the tokenizer that was fitted on the training
# reviews, then pad them exactly like the training sequences (length 10, zeros
# appended at the end).
sequences_test = tokenizer.texts_to_sequences(df_test['review'])
padded_sequences_test = pad_sequences(
    sequences_test,
    maxlen=10,
    padding='post',
)
padded_df_test = pd.DataFrame(data=padded_sequences_test)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Baseline classifier: embedding -> single LSTM -> sigmoid probability.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128))  # 10000 mirrors the Tokenizer's num_words
model.add(LSTM(units=128))
model.add(Dense(units=1, activation='sigmoid'))

# Labels are binary (0/1), hence binary cross-entropy; accuracy is the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Train the baseline model; validation_split=0.2 holds out 20% of the training
# rows for validation.
# NOTE(review): in the recorded run val_loss rises from epoch 2 onward while the
# training loss keeps falling — the model overfits; consider fewer epochs or
# early stopping.
model.fit(
    x=padded_df_train,
    y=df_train['label'],
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.6321 - loss: 0.6159 - val_accuracy: 0.7182 - val_loss: 0.5421
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.7933 - loss: 0.4377 - val_accuracy: 0.7160 - val_loss: 0.5506
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8423 - loss: 0.3503 - val_accuracy: 0.7044 - val_loss: 0.6126
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8799 - loss: 0.2700 - val_accuracy: 0.7074 - val_loss: 0.7406
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.9088 - loss: 0.2072 - val_accuracy: 0.6956 - val_loss: 0.8681
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.9318 - loss: 0.1632 - val_accuracy: 0.6994 - val_loss: 1.1284
Epoch 7/10
[1m313/313

<keras.src.callbacks.history.History at 0x17d1cc15bb0>

In [10]:
# evaluate() returns [loss, accuracy] here because the model was compiled with
# metrics=['accuracy'], so index 1 is the test accuracy.
# NOTE(review): the recorded test accuracy (~0.93) is far above the validation
# accuracy seen during training (~0.70) — confirm that reviews_test.csv does not
# overlap reviews_train.csv.
results = model.evaluate(x=padded_df_test, y=df_test['label'])
print(f"Test Accuracy: {results[1]}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9801 - loss: 0.0776
Test Accuracy: 0.9273599982261658


In [19]:
# Display the full text of the training review stored under row label 0.
df_train.loc[0, 'review']

"In Panic In The Streets Richard Widmark plays U.S. Navy doctor who has his week rudely interrupted with a corpse that contains plague. As cop Paul Douglas properly points out the guy died from two bullets in the chest. That's not the issue here, the two of them become unwilling partners in an effort to find the killers and anyone else exposed to the disease.<br /><br />As was pointed out by any number of people, for some reason director Elia Kazan did not bother to cast the small parts with anyone that sounds like they're from Louisiana. Having been to New Orleans where the story takes place I can personally attest to that. Richard Widmark and his wife Barbara Bel Geddes can be excused because as a Navy doctor he could be assigned there, but for those that are natives it doesn't work.<br /><br />But with plague out there and the news being kept a secret, the New Orleans PD starts a dragnet of the city's underworld. The dead guy came off a ship from Europe and he had underworld connect

In [14]:
# Classify a single hand-written review with the baseline model.
sample_review = "The movie was best "

# texts_to_sequences expects a list of texts, hence the one-element list.
rew = tokenizer.texts_to_sequences([sample_review])

# Pad exactly as the training data was padded (maxlen=10, padding='post');
# a different length or padding side would feed the model inputs unlike
# anything it was trained on.
rew_padded = pad_sequences(rew, maxlen=10, padding='post')

# rew_padded already has shape (1, 10). The previous version overwrote it with
# column 0 of the training matrix, so the prediction ignored sample_review.
prediction = model.predict(rew_padded)

# predict() returns a (1, 1) array of sigmoid outputs; take the scalar explicitly.
sentiment = "Positive" if prediction[0, 0] > 0.5 else "Negative"
print(f"Sentiment: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Sentiment: Positive


In [47]:
import keras_tuner
from keras_tuner import RandomSearch

def build_model(hp):
    """Build and compile one candidate LSTM sentiment classifier for Keras Tuner.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Search-space handle supplied by the tuner. This function registers and
        samples four hyperparameters from it: 'embedding_dim' (32-256, step 32),
        'lstm_units' (32-256, step 32), 'dropout_rate' (0.2-0.5, step 0.1) and
        'learning_rate' (one of 1e-2, 1e-3, 1e-4).

    Returns
    -------
    A compiled Sequential model (Embedding -> LSTM -> Dropout -> sigmoid Dense)
    using binary cross-entropy loss and the accuracy metric.
    """
    # Adam is not imported by any earlier cell, so import it here to keep the
    # function usable after a fresh kernel restart.
    from tensorflow.keras.optimizers import Adam

    model = Sequential()
    # input_length is intentionally omitted: the old value (256) did not match
    # the 10-token sequences the tuner is trained on, and the argument is
    # deprecated in Keras 3. The layer infers the length from the data.
    model.add(Embedding(input_dim=10000,
                        output_dim=hp.Int('embedding_dim', min_value=32, max_value=256, step=32)))
    model.add(LSTM(units=hp.Int('lstm_units', min_value=32, max_value=256, step=32)))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Random search over the hyperparameters declared in build_model, ranked by
# validation accuracy. Trial results are stored under my_dir/sentiment_analysis.
tuner = RandomSearch(
    hypermodel=build_model,
    objective='val_accuracy',
    max_trials=5,            # number of hyperparameter combinations to try
    executions_per_trial=3,  # models trained per combination
    directory='my_dir',
    project_name='sentiment_analysis',
)

# Every execution trains for 5 epochs with 20% of the training rows held out
# for validation.
tuner.search(padded_df_train, df_train['label'], epochs=5, validation_split=0.2)


Trial 5 Complete [00h 02m 38s]
val_accuracy: 0.701800008614858

Best val_accuracy So Far: 0.7178000013033549
Total elapsed time: 00h 20m 36s


In [49]:
# Ask the tuner for its single best model and keep it for evaluation.
top_models = tuner.get_best_models(num_models=1)
best_model = top_models[0]

In [53]:
# Score the tuned model on the test split; evaluate() yields the loss followed
# by the accuracy metric the model was compiled with.
test_metrics = best_model.evaluate(padded_df_test, df_test['label'])
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8114 - loss: 0.4268
Test Accuracy: 0.7933200001716614


In [87]:
# Classify one review from the test set with the tuned model.
sample_review = df_test['review'][10]

# texts_to_sequences expects a list of texts, hence the one-element list.
rew = tokenizer.texts_to_sequences([sample_review])

# Pad exactly as the training data was padded (maxlen=10, padding='post').
# The previous maxlen=256 with default pre-padding produced inputs unlike the
# 10-token, post-padded sequences the model was trained on.
rew_padded = pad_sequences(rew, maxlen=10, padding='post')

# pad_sequences already returns a NumPy array of shape (1, 10), so no extra
# conversion or reshape is needed before predicting.
prediction = best_model.predict(rew_padded)

# predict() returns a (1, 1) array of sigmoid outputs; take the scalar explicitly.
sentiment = "Positive" if prediction[0, 0] > 0.5 else "Negative"
print(f"Sentiment: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Sentiment: Positive


In [93]:
# Persist the tuned model to disk in HDF5 format (selected by the .h5 extension).
# NOTE(review): recent Keras versions treat .h5 as a legacy format and recommend
# the native .keras format — confirm what downstream loaders expect before changing.
best_model.save(filepath='best_sentiment_model.h5')



In [95]:
import pickle

In [97]:
# Persist the fitted tokenizer so inference code can reuse the same word index.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)