# Day 3 – Model Training: Naive Bayes vs. LSTM

**Goal:** Train and evaluate both traditional ML (Naive Bayes) and deep learning (LSTM) models on cleaned product review data.


In [2]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


## Load Cleaned Data

We’ll use the output from Day 2 (`cleaned_reviews.json`) and convert sentiment scores into categories:
- 1, 2 → Negative (0)
- 3 → Neutral (1)
- 4, 5 → Positive (2)


In [3]:
# Read the Day 2 output. The file is decoded explicitly as UTF-8 instead of
# relying on the platform default encoding (e.g. cp1252 on Windows), which can
# raise or silently mangle non-ASCII characters in the review text.
with open("../data/cleaned_reviews.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Wrap the parsed JSON in a DataFrame for the labelling step that follows.
df = pd.DataFrame(data)

def map_sentiment(score):
    """Collapse a numeric review score into a three-way sentiment label.

    Args:
        score: The review's numeric rating (1-5 in this dataset).

    Returns:
        0 (Negative) when ``score <= 2``, 1 (Neutral) when ``score == 3``,
        and 2 (Positive) for every other value.
    """
    # Guard clause for the negative bucket; everything else is neutral or positive.
    if score <= 2:
        return 0
    return 1 if score == 3 else 2

# Derive the sentiment label from the 'Score' column, then keep only the two
# columns the models need: the cleaned text (input) and the label (target).
df = df.assign(label=df['Score'].apply(map_sentiment))[['clean_text', 'label']]
df.head()


Unnamed: 0,clean_text,label
0,I buy several vitality dog food product find g...,2
1,product arrived label jumbo salt peanutsthe pe...,0
2,confection around century light pillowy citrus...,2
3,look secret ingredient robitussin I believe I ...,0
4,great taffy great price wide assortment yummy ...,2


## Split Data

We’ll split the data into 80% training and 20% test sets, stratified by label so both sets keep the same class proportions.


In [4]:
# Model input is the cleaned text; the target is the 0/1/2 sentiment label.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## Naive Bayes Model (TF-IDF)

We’ll use `TfidfVectorizer` to convert text into features and train a `MultinomialNB` classifier.


In [5]:
# Turn the text into TF-IDF features, capped at 5000 vocabulary terms.
# The vectorizer is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into training.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the fitted estimator, so construction and training chain.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Headline metric is the support-weighted F1; the per-class breakdown is
# printed below it.
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')
print("Naive Bayes F1 Score:", f1_nb)
print(classification_report(y_test, y_pred_nb))


Naive Bayes F1 Score: 0.7513976824256305
              precision    recall  f1-score   support

           0       0.83      0.24      0.37     16407
           1       0.62      0.00      0.00      8528
           2       0.81      1.00      0.89     88756

    accuracy                           0.81    113691
   macro avg       0.76      0.41      0.42    113691
weighted avg       0.80      0.81      0.75    113691



In [6]:
# Make sure the output directory exists so the save cannot fail on a fresh
# checkout.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Classifier: same path and contents as before.
with open(models_dir / "naive_bayes.pkl", "wb") as f:
    pickle.dump(nb_model, f)

# The classifier only understands the feature space produced by the fitted
# TF-IDF vectorizer, so persist that too; without it the pickled model cannot
# be applied to new text.
with open(models_dir / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)


## LSTM Model (Deep Learning)

We’ll use Keras to build a simple LSTM model using tokenized sequences.


In [7]:
# Fixed sequence length fed to the network.
max_len = 100

# Build the word index from the training text only; num_words caps how many
# distinct words are kept when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Both splits are padded with identical settings: every sequence is brought to
# max_len, with padding added at the end ('post').
pad_kwargs = dict(maxlen=max_len, padding='post')
X_train_pad = pad_sequences(X_train_seq, **pad_kwargs)
X_test_pad = pad_sequences(X_test_seq, **pad_kwargs)


In [8]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable under Restart & Run All.
set_random_seed(42)

model = Sequential([
    # Vocabulary size comes from the tokenizer so it is defined in one place
    # and cannot drift from the Embedding's input_dim. `input_length` is
    # omitted: it is deprecated in Keras 3 and the sequence length is taken
    # from the input data.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # Three output units, one per sentiment class (0/1/2).
    Dense(3, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop after 2 epochs without val_loss improvement and roll back to the best
# epoch's weights, so the evaluated/saved model is not a later, worse one.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Keep the History object so the training curves can be inspected later
# (and so its repr is not dumped as cell output).
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
)




Epoch 1/5
[1m6396/6396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m551s[0m 85ms/step - accuracy: 0.7890 - loss: 0.6332 - val_accuracy: 0.8595 - val_loss: 0.3941
Epoch 2/5
[1m6396/6396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m519s[0m 81ms/step - accuracy: 0.8616 - loss: 0.3856 - val_accuracy: 0.8753 - val_loss: 0.3404
Epoch 3/5
[1m6396/6396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 90ms/step - accuracy: 0.8818 - loss: 0.3245 - val_accuracy: 0.8833 - val_loss: 0.3236
Epoch 4/5
[1m6396/6396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m595s[0m 93ms/step - accuracy: 0.8932 - loss: 0.2941 - val_accuracy: 0.8862 - val_loss: 0.3180
Epoch 5/5
[1m6396/6396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 97ms/step - accuracy: 0.9021 - loss: 0.2715 - val_accuracy: 0.8893 - val_loss: 0.3145


<keras.src.callbacks.history.History at 0x2692083fa30>

In [9]:
# The softmax head yields one score per class for each review; the predicted
# label is the index of the largest score in each row.
y_pred_lstm = model.predict(X_test_pad)
y_pred_lstm_final = y_pred_lstm.argmax(axis=1)

# Same metric and report as for the Naive Bayes model, for a like-for-like
# comparison.
f1_lstm = f1_score(y_test, y_pred_lstm_final, average='weighted')
print("LSTM F1 Score:", f1_lstm)
print(classification_report(y_test, y_pred_lstm_final))


[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 19ms/step
LSTM F1 Score: 0.8795013777398348
              precision    recall  f1-score   support

           0       0.76      0.76      0.76     16407
           1       0.60      0.34      0.43      8528
           2       0.93      0.96      0.94     88756

    accuracy                           0.89    113691
   macro avg       0.76      0.69      0.71    113691
weighted avg       0.88      0.89      0.88    113691



In [10]:
# Make sure the output directory exists so the save cannot fail on a fresh
# checkout.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Path and .h5 format are kept unchanged so existing consumers of this
# artefact keep working.
model.save("../models/lstm_model.h5")

# The network consumes integer ids produced by the fitted tokenizer; persist
# it alongside the model, otherwise the saved model cannot be applied to raw
# text later.
with open(models_dir / "tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)




In [11]:
# Weighted F1 of each model, rounded to 4 decimals for the results table.
results = {
    "Naive Bayes": round(f1_nb, 4),
    "LSTM": round(f1_lstm, 4)
}

# Single-row frame: one column per model.
df_result = pd.DataFrame([results])

# Create the results directory if needed so the scores are not lost to a
# missing-directory error after a long training run.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
df_result.to_csv(results_dir / "f1_scores.csv", index=False)

print(df_result)


   Naive Bayes    LSTM
0       0.7514  0.8795


## ✅ Summary

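- Trained Naive Bayes with TF-IDF features (weighted F1 ≈ 0.75 in the recorded run).
- Built an LSTM model with embeddings and padded token sequences (weighted F1 ≈ 0.88 in the recorded run).
- Saved both models and logged F1-scores in `results/f1_scores.csv`.
- Caveat: the data is heavily skewed toward the Positive class, so weighted F1 is dominated by it. In the recorded run Naive Bayes has 0.00 recall on Neutral, and the LSTM reaches only 0.34; check the per-class reports, not just the headline score.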
