# Part 2

## 🔗 Modeling Using LSTM

In this stage, we will build and train an LSTM-based neural network for sentiment analysis on the Amazon Reviews dataset.
We will:
- Preprocess and tokenize the text data.
- Create padded sequences for input into the LSTM.
- Build, train, and evaluate the LSTM model on the labeled sentiment data.


## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer    # TensorFlow's Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences    # makes the sequences fed to the model all the same length

## Read Data

In [None]:
# Load the training CSV of the Amazon reviews dataset and preview its first rows.
df = pd.read_csv(
    '/kaggle/input/amazon-reviews-csv/amazon_reviews_train.csv'
)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of rows per label value, to check class balance.
df.label.value_counts()

## Tokenization

In [None]:
# Raw review texts (features) and their labels (targets) as NumPy arrays.
X, y = df['text'].values, df['label'].values

In [None]:
# Vocabulary cap handed to the tokenizer; also reused as the Embedding input_dim.
num_words = 15000

# Words outside the kept vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=num_words,
)

In [None]:
# Build the word index from every review text.
tokenizer.fit_on_texts(X)

In [None]:
# Convert each review into a list of integer word ids.
X_num_tokens = tokenizer.texts_to_sequences(X)

In [None]:
# Show one raw review next to its integer-encoded form.
# Both are printed explicitly: a bare expression that is not the last
# statement of a cell produces no output.
print(X[100])
print(X_num_tokens[100])

In [None]:
# Token count of every encoded review, kept as a NumPy array for the
# statistics computed in the following cells.
num_tokens = np.array([len(seq) for seq in X_num_tokens])
num_tokens

In [None]:
num_tokens.mean()  # average number of tokens per review

In [None]:
num_tokens.max()  # token count of the longest review

In [None]:
num_tokens.argmax()  # index of the longest review

In [None]:
# Look the longest review up by its computed position instead of a hard-coded
# index, so the cell stays correct if the dataset or tokenizer changes.
X[num_tokens.argmax()]

In [None]:
len(num_tokens)  # total number of encoded reviews

In [None]:
# Fraction of reviews shorter than 166 tokens. The mean of the boolean mask is
# computed inside NumPy; the builtin sum() would loop over every element in Python.
(num_tokens < 166).mean()

In [None]:
max_tokens = 166  # fixed sequence length used by pad_sequences below

In [None]:
sum(num_tokens < max_tokens) # 3.440.397 text rows have less than 166 tokens

In [None]:
sum(num_tokens > max_tokens)  # 150.880 text rows have more than 166 tokens

## Padding

In [None]:
# Bring every sequence to exactly max_tokens ids: shorter ones are zero-padded,
# longer ones are cut. Neither `padding` nor `truncating` is passed, so the
# Keras defaults apply (NOTE(review): documented as 'pre' for both - confirm).
X_pad = pad_sequences(X_num_tokens, maxlen=max_tokens) #add 0000 or cut

In [None]:
# (number of reviews, max_tokens)
X_pad.shape

In [None]:
# Unpadded ids of the longest review, located via argmax rather than a
# hard-coded index tied to one particular dataset snapshot.
np.array(X_num_tokens[num_tokens.argmax()])

## Train / Validation / Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Train (80%) vs. Temp (20% hold-out)

In [None]:
# Stratified 80/20 split: 80% of the rows are used for training, the remaining
# 20% are held out in X_temp / y_temp.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_pad,
    y,
    random_state=42,
    stratify=y,
    test_size=0.20,
)


# Temp → Validation (80% of Temp) & Test (20% of Temp)

In [None]:
# Split the 20% hold-out again: test_size=0.20 sends 20% of X_temp to the test
# set (about 4% of all rows) and leaves 80% of it (about 16%) for validation.
# Stratified on y_temp so both parts keep the label distribution.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.20,  
    stratify=y_temp,
    random_state=0
)


## Modeling

In [None]:
embedding_size = 50  # length of the embedding vector learned for each word id

In [None]:
# Sequence length used as the Embedding input_length in the model below.
max_tokens

In [None]:
from tensorflow.keras.layers import Embedding, Dropout, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential

# Embedding -> three stacked bidirectional LSTMs (48, 24, 12 units) with 20%
# dropout after the embedding and after the first two recurrent layers ->
# one sigmoid unit producing the sentiment probability.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_size,
        input_length=max_tokens,
    ),
    Dropout(0.2),
    # return_sequences=True keeps the per-token outputs for the next LSTM.
    Bidirectional(LSTM(units=48, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(units=24, return_sequences=True)),
    Dropout(0.2),
    # Last recurrent layer: no return_sequences, so it feeds the Dense layer.
    Bidirectional(LSTM(units=12)),
    Dense(1, activation='sigmoid'),
])


In [None]:
# Adam with an explicit learning rate of 0.01.
optimizer = Adam(learning_rate=0.01) 

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy and AUC are
# tracked in that order, which is the order model.evaluate returns them in.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 2 epochs and roll the model back to
# the weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Train for at most 3 epochs in batches of 256, validating on the hold-out
# validation split after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=3,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

## Model Evaluation

In [None]:
# Per-epoch metrics recorded by the fit call, one column per metric.
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
acc_ax, loss_ax = axes
fig.suptitle('Training and Validation Metrics', fontsize=16)

# Accuracy (drawn only when the metric was recorded)
if 'accuracy' in model_loss.columns:
    acc_ax.plot(model_loss['accuracy'], label='Train Accuracy')
    acc_ax.plot(model_loss['val_accuracy'], label='Val Accuracy')
    acc_ax.set_title('Accuracy')
    acc_ax.legend()

# Loss
loss_ax.plot(model_loss['loss'], label='Train Loss')
loss_ax.plot(model_loss['val_loss'], label='Val Loss')
loss_ax.set_title('Loss')
loss_ax.legend()

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# Evaluate on the test split; the three values follow the compile() order:
# loss first, then the 'accuracy' and 'AUC' metrics.
loss, accuracy, auc = model.evaluate(X_test, y_test)

In [None]:
# Report the test metrics with four decimals.
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")

In [None]:
# Force a garbage-collection pass to free unreferenced objects.
import gc
gc.collect()

In [None]:
from sklearn.metrics import precision_recall_curve,confusion_matrix, classification_report

In [None]:
# Sigmoid outputs (predicted probabilities) for every test review.
y_pred_proba = model.predict(X_test)


In [None]:
# Turn the predicted probabilities into hard 0/1 labels with a 0.5 threshold.
# `y_pred` was never defined before this cell, so the reports raised NameError.
y_pred = (y_pred_proba > 0.5).astype(int).ravel()

print(confusion_matrix(y_test, y_pred))
print("-------------------------------------------------------")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points and the area under the curve from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

# Draw the curve against the diagonal of a random classifier.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()


## Save Model and Tokenizer

In [None]:
# Persist the trained model as an HDF5 file in the current working directory.
model.save('review_amazon_sentiment5.h5')

In [None]:
import json

# Serialise the fitted tokenizer to JSON so that inference can reproduce the
# exact word-to-id mapping used during training.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as out_file:
    out_file.write(tokenizer_json)

## New text prediction

In [None]:
from tensorflow.keras.models import load_model
# Reload the saved model from disk.
# NOTE(review): the model was saved with a relative path; this absolute path
# only matches when the working directory is /kaggle/working - confirm.
model_review = load_model('/kaggle/working/review_amazon_sentiment5.h5')

In [None]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# tokenizer_from_json expects the JSON text itself, so the file content is
# passed straight through; parsing it with json.load only to re-serialise it
# with json.dumps was a redundant round-trip.
with open('/kaggle/working/tokenizer.json', 'r') as f:
    tokenizer = tokenizer_from_json(f.read())

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

reviews = [
    "I hated this product, never buying it again!",
    "Beautiful! Fast shipping and a responsive seller",
    "Garbage product, no one should sell such thing",
    "Great price for a product like this, definitely buying it again"]

# Encode and pad the new texts the same way as the training data
# (same tokenizer, same max_tokens length).
tokens = tokenizer.texts_to_sequences(reviews)
tokens_padded = pad_sequences(tokens, maxlen=max_tokens)

# Score with the model reloaded from disk, so this cell really exercises the
# saved artefact instead of the in-memory training model.
pred_probs = model_review.predict(tokens_padded)

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 classes.
pred_classes = (pred_probs > 0.5).astype(int)

# NOTE(review): class 1 is reported as "n" and class 0 as "p"; this presumes
# label 1 means negative in the training CSV - confirm against the dataset.
for i, (prob, label) in enumerate(zip(pred_probs[:, 0], pred_classes[:, 0]), start=1):
    sentiment = "n" if label == 1 else "p"
    print(f"Review {i}: {sentiment} (Confidence: {prob:.3f})")
