In [15]:
import pandas as pd

# File path
train_file_path = "train.tsv"

# Load dataset
df = pd.read_csv(train_file_path, sep="\t")

# Display first 5 rows
df.head()


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [17]:
import re
import string

# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text

# Apply on dataset
df['Phrase'] = df['Phrase'].apply(preprocess_text)

# Check cleaned text
df.head()


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


In [19]:
from sklearn.model_selection import train_test_split

X = df["Phrase"]
y = df["Sentiment"]

# Split dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training Set: {len(X_train)}")
print(f"Validation Set: {len(X_val)}")
print(f"Test Set: {len(X_test)}")


Training Set: 109242
Validation Set: 23409
Test Set: 23409


In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert to sequences
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=100)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=100)


In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Build model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=100),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')  # 5 sentiment classes
])

# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = model.fit(X_train_seq, y_train, validation_data=(X_val_seq, y_val), epochs=10, batch_size=64)


Epoch 1/10




[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 97ms/step - accuracy: 0.5583 - loss: 1.1115 - val_accuracy: 0.6540 - val_loss: 0.8491
Epoch 2/10
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 97ms/step - accuracy: 0.6847 - loss: 0.7727 - val_accuracy: 0.6715 - val_loss: 0.8097
Epoch 3/10
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 100ms/step - accuracy: 0.7173 - loss: 0.6805 - val_accuracy: 0.6728 - val_loss: 0.8133
Epoch 4/10
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 101ms/step - accuracy: 0.7428 - loss: 0.6152 - val_accuracy: 0.6762 - val_loss: 0.8224
Epoch 5/10
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 101ms/step - accuracy: 0.7608 - loss: 0.5640 - val_accuracy: 0.6695 - val_loss: 0.8593
Epoch 6/10
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 102ms/step - accuracy: 0.7755 - loss: 0.5251 - val_accuracy: 0.6697 - val_loss: 0.9073
Epoch

In [25]:
# Evaluate model
test_loss, test_acc = model.evaluate(X_test_seq, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


[1m732/732[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 20ms/step - accuracy: 0.6543 - loss: 1.2484
Test Accuracy: 0.6510


In [27]:
from tensorflow.keras.layers import Bidirectional

# Build Optimized Model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=100),
    Bidirectional(LSTM(256, return_sequences=True)),  # Increase LSTM units & Bidirectional
    Dropout(0.2),  # Reduce dropout
    LSTM(128),
    Dense(128, activation='relu'),
    Dense(5, activation='softmax')  # 5 sentiment classes
])

# Compile Model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model (More Epochs)
history = model.fit(X_train_seq, y_train, validation_data=(X_val_seq, y_val), epochs=15, batch_size=64)

# Evaluate Model
test_loss, test_acc = model.evaluate(X_test_seq, y_test)
print(f"Optimized Test Accuracy: {test_acc:.4f}")




Epoch 1/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 333ms/step - accuracy: 0.5580 - loss: 1.1037 - val_accuracy: 0.6540 - val_loss: 0.8403
Epoch 2/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m558s[0m 327ms/step - accuracy: 0.6891 - loss: 0.7560 - val_accuracy: 0.6712 - val_loss: 0.8008
Epoch 3/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 338ms/step - accuracy: 0.7229 - loss: 0.6649 - val_accuracy: 0.6741 - val_loss: 0.8119
Epoch 4/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m560s[0m 328ms/step - accuracy: 0.7458 - loss: 0.5992 - val_accuracy: 0.6741 - val_loss: 0.8433
Epoch 5/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 329ms/step - accuracy: 0.7678 - loss: 0.5416 - val_accuracy: 0.6678 - val_loss: 0.8959
Epoch 6/15
[1m1707/1707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 329ms/step - accuracy: 0.7852 - loss: 0.4919 - val_accuracy: 0.6630 - val_loss:

In [29]:
model.save("sentiment_model.h5")




In [31]:
def predict_sentiment(review):
    review_seq = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=100)
    prediction = model.predict(review_seq)
    return prediction.argmax()

print(predict_sentiment("This movie was amazing!"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357ms/step
4


In [33]:
import pickle

# Tokenizer ko pickle file me save karna
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [2]:
model = Sequential([
    ...,
    Dense(3, activation='softmax')  # <-- Check this number!
])


NameError: name 'Sequential' is not defined

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [6]:
from tensorflow.keras.models import load_model

model = load_model("sentiment_model.h5")
model.summary()


