Refer to AmazonFoodReviews in the NLP folder.

In [None]:
import sqlite3
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Open the SQLite database that stores the Reviews table.
connection = sqlite3.connect('../../../NLP/AmazonFoodReviews/database.sqlite')

# Load every review whose Score is not 3, then release the database handle
# even if the query fails, so the file is not left open for the kernel's lifetime.
try:
    filtered_data = pd.read_sql_query("""
SELECT * 
FROM Reviews 
WHERE Score != 3
""", connection)
finally:
    connection.close()

In [None]:
def parition(x):
    """Map a numeric review score to a sentiment label.

    Args:
        x: Numeric score to classify.

    Returns:
        'negative' when the score is below 3, otherwise 'positive'.
    """
    return 'negative' if x < 3 else 'positive'
    
# Replace the numeric Score column with 'negative'/'positive' labels.
# NOTE(review): the column is overwritten in place, so this cell is not idempotent —
# re-running it would apply `parition` to the string labels instead of the numeric scores.
filtered_data['Score'] = filtered_data['Score'].map(parition)

In [None]:
# (rows, columns) of the reviews loaded with Score != 3
filtered_data.shape

In [None]:
# Preview the first rows; Score now holds the 'negative'/'positive' labels
filtered_data.head()

## EDA

### Data Cleaning: Deduplication

In [None]:
# Number of missing values in each column
filtered_data.isnull().sum()

In [None]:
# Order the reviews by ProductId (ascending, NaNs last) so that duplicate
# detection below always sees the rows in the same sequence.
sorted_data = filtered_data.sort_values(
    by='ProductId',
    axis=0,
    ascending=True,
    kind='quicksort',
    na_position='last',
)

sorted_data.head()

In [None]:
# Two rows count as duplicates when they agree on every column listed here;
# only the first occurrence (in ProductId order) is kept.
duplicate_key_columns = [
    'UserId',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Time',
    'Summary',
    'Text',
]
deduplication_data = sorted_data.drop_duplicates(subset=duplicate_key_columns, keep='first')

deduplication_data.shape

In [None]:
# Percentage of the filtered rows that remain after deduplication
deduplication_data['Id'].size/filtered_data['Id'].size *100

In [None]:
# Keep only the rows whose HelpfulnessNumerator does not exceed HelpfulnessDenominator
helpfulness_is_consistent = (
    deduplication_data['HelpfulnessNumerator'] <= deduplication_data['HelpfulnessDenominator']
)
final_data = deduplication_data[helpfulness_is_consistent]
final_data.shape

In [None]:
# Number of reviews per Score label in the cleaned data
final_data['Score'].value_counts()

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NOTE(review): presumably needs the NLTK 'stopwords' and 'wordnet' data to be
# downloaded beforehand — confirm for a fresh environment.
# Stored as a set so the per-word membership test in the cleaning loop is fast.
stop_words = set(stopwords.words('english'))
# Lemmatizer instance shared by the corpus cleaning loop and predict_sentiment.
lemmatizer = WordNetLemmatizer()

In [None]:
# Clean every review text and collect the result, overall and split by label.
corpus = []
# NOTE: despite their names, these two lists hold whole cleaned reviews, not single words.
all_positive_words = []
all_negative_words = []

# Extract the columns once; the original re-evaluated final_data['Score'].values
# on every iteration of the loop.
review_texts = final_data['Text'].values
review_scores = final_data['Score'].values

for doc, score in zip(review_texts, review_scores):
    review = re.sub(r'<.*?>', ' ', doc)         # strip HTML tags
    review = re.sub('[^a-zA-Z]', ' ', review)   # keep ASCII letters only
    review = review.lower()
    review = review.split()
    # Drop stop words first, then lemmatize what is left
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

    if score == 'positive':
        all_positive_words.append(review)
    else:
        all_negative_words.append(review)

# RNN Implementation with Two Vectorization Approaches

We'll implement Simple RNN with two different vectorization methods:
1. **Word2Vec**: Word2Vec embeddings trained separately on this review corpus (no externally pre-trained vectors), then fed into the RNN
2. **Embedding Layer**: Keras Embedding layer within the RNN model

Both approaches will be compared for sentiment classification on Amazon Food Reviews.

In [None]:
# Import required libraries for RNN implementation
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
import seaborn as sns

# Record the TensorFlow version used for this run
print("TensorFlow Version:", tf.__version__)

In [None]:
# Prepare the data for modeling
# Convert labels to binary (0 for negative, 1 for positive)
y = final_data['Score'].map({'negative': 0, 'positive': 1}).values
X = corpus  # Our preprocessed text data

# Count positives with the vectorised ndarray.sum(); the builtin sum() would
# walk the array element by element in Python (and did so twice).
positive_count = y.sum()

print(f"Data shape: {len(X)} reviews")
print(f"Positive reviews: {positive_count}")
print(f"Negative reviews: {len(y) - positive_count}")

# Split the data: 80/20, stratified on the label so both splits keep the same
# class ratio, with a fixed random_state for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTraining set: {len(X_train)} reviews")
print(f"Test set: {len(X_test)} reviews")

## Approach 1: Word2Vec + Simple RNN

In this approach, we'll:
1. Create Word2Vec embeddings from our corpus
2. Convert each review to a sequence of word vectors
3. Feed these sequences into a Simple RNN

In [None]:
# Step 1: Create Word2Vec model
# First, tokenize the sentences for Word2Vec training.
# Only the training reviews are used: fitting the embeddings on the full corpus
# would let test-set text influence the features (data leakage) and would be
# inconsistent with the Keras tokenizer, which is also fitted on X_train only.
tokenized_corpus = [review.split() for review in X_train]

# Train Word2Vec model
print("Training Word2Vec model...")
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,  # Embedding dimension
    window=5,         # Context window
    min_count=2,      # Minimum word frequency
    workers=4,        # Number of threads
    sg=0,            # CBOW model (sg=1 for Skip-gram)
    epochs=10,
    # Fixed seed for the initial vectors. NOTE(review): with workers > 1 gensim
    # training is still not fully deterministic — confirm if exact reproducibility matters.
    seed=42,
)

print(f"Word2Vec model trained with {len(w2v_model.wv.index_to_key)} words")
print(f"Embedding dimension: {w2v_model.wv.vector_size}")

# Check some example word vectors
sample_words = ['good', 'bad', 'delicious', 'terrible', 'amazing']
for word in sample_words:
    if word in w2v_model.wv:
        print(f"'{word}' is in vocabulary")
    else:
        print(f"'{word}' not in vocabulary")

In [None]:
# Step 2: Convert reviews to sequences of word vectors
def text_to_word2vec_sequence(text, model, max_length=100):
    """Convert text to a fixed-length sequence of word vectors.

    The text is split on whitespace. Each of the first ``max_length`` words is
    replaced by its vector from ``model.wv``; words missing from the vocabulary
    and unused trailing positions are left as zero vectors.

    Args:
        text: Whitespace-separated, already preprocessed text.
        model: Object exposing ``wv`` with membership test, item lookup and
            ``vector_size`` (e.g. a trained gensim Word2Vec model).
        max_length: Number of timesteps in the returned sequence.

    Returns:
        np.ndarray of shape (max_length, model.wv.vector_size), always float32.
        The original returned float64 whenever a zero vector was inserted and
        the vectors' own dtype otherwise, so dtype depended on the input.
    """
    keyed_vectors = model.wv
    # Pre-allocating the zero matrix handles padding and unknown words in one go.
    sequence = np.zeros((max_length, keyed_vectors.vector_size), dtype=np.float32)

    # Truncate before the lookup so words beyond max_length are never fetched.
    for position, word in enumerate(text.split()[:max_length]):
        if word in keyed_vectors:
            sequence[position] = keyed_vectors[word]

    return sequence

# Set maximum sequence length
MAX_LEN = 100

# Convert training and test data.
# The stacked tensors are stored as float32: the result has shape
# (n_reviews, MAX_LEN, vector_size), so a float64 copy would double an already
# very large allocation.
print("Converting texts to Word2Vec sequences...")
X_train_w2v = np.array(
    [text_to_word2vec_sequence(text, w2v_model, MAX_LEN) for text in X_train],
    dtype=np.float32,
)
X_test_w2v = np.array(
    [text_to_word2vec_sequence(text, w2v_model, MAX_LEN) for text in X_test],
    dtype=np.float32,
)

print(f"Training data shape: {X_train_w2v.shape}")
print(f"Test data shape: {X_test_w2v.shape}")
print(f"Each sequence length: {MAX_LEN}")
print(f"Each word vector dimension: {w2v_model.wv.vector_size}")

In [None]:
# Step 3: Build and train Simple RNN model with Word2Vec embeddings
def create_word2vec_rnn_model(input_shape):
    """Create and compile a Simple RNN binary classifier for Word2Vec sequences.

    Args:
        input_shape: Shape of one input sample, (timesteps, embedding_dim).

    Returns:
        A compiled Keras Sequential model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) ending in a single sigmoid unit.
    """
    model = Sequential([
        # Explicit Input object instead of the deprecated `input_shape=` layer argument.
        tf.keras.Input(shape=input_shape),
        # Skip timesteps whose vector is all zeros. Sequences are zero-padded at the
        # end, so without masking the RNN's final state would be computed over the
        # padding. Unknown words are zero vectors too and are skipped as well.
        tf.keras.layers.Masking(mask_value=0.0),
        SimpleRNN(64, return_sequences=True),  # full sequence feeds the second RNN
        Dropout(0.3),
        SimpleRNN(32),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# Create model
# Each input sample is a (MAX_LEN, vector_size) matrix of word vectors
print("Building Word2Vec RNN model...")
w2v_rnn_model = create_word2vec_rnn_model((MAX_LEN, w2v_model.wv.vector_size))

# Display model architecture
w2v_rnn_model.summary()

In [None]:
# Train the Word2Vec RNN model (the last 20% of the training data is held out for validation)
print("Training Word2Vec RNN model...")
history_w2v = w2v_rnn_model.fit(
    X_train_w2v,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    verbose=1,
)

# Plot training history: loss on the left, accuracy on the right
w2v_curves = history_w2v.history
_, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(w2v_curves['loss'], label='Training Loss')
loss_ax.plot(w2v_curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Word2Vec RNN - Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

accuracy_ax.plot(w2v_curves['accuracy'], label='Training Accuracy')
accuracy_ax.plot(w2v_curves['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Word2Vec RNN - Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

plt.tight_layout()
plt.show()

## Approach 2: Keras Embedding Layer + Simple RNN

In this approach, we'll:
1. Use Keras Tokenizer to convert text to integer sequences
2. Use Keras Embedding layer within the RNN model
3. Train embeddings end-to-end with the RNN

In [None]:
# Step 1: Tokenize and create integer sequences
vocab_size = 10000  # Maximum number of words to keep
MAX_LEN_EMB = 100   # Every sequence is padded/truncated to this length

# The vocabulary is built from the training reviews only
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (and cut) at the end of each sequence so all rows share one length
padding_options = {'maxlen': MAX_LEN_EMB, 'padding': 'post', 'truncating': 'post'}
X_train_padded = pad_sequences(X_train_seq, **padding_options)
X_test_padded = pad_sequences(X_test_seq, **padding_options)

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Using top {vocab_size} words")
print(f"Training data shape: {X_train_padded.shape}")
print(f"Test data shape: {X_test_padded.shape}")

# Show example of tokenization
print(f"\nExample review: {X_train[0][:100]}...")
print(f"Tokenized: {X_train_seq[0][:20]}")
print(f"Padded: {X_train_padded[0][:20]}")

In [None]:
# Step 2: Build Simple RNN model with Embedding layer
def create_embedding_rnn_model(vocab_size, embedding_dim, max_length):
    """Create and compile a Simple RNN binary classifier with a trainable Embedding layer.

    Args:
        vocab_size: Size of the embedding table; token indices must be < vocab_size.
        embedding_dim: Dimension of each learned word vector.
        max_length: Length of every (padded) input sequence of token indices.

    Returns:
        A compiled Keras Sequential model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) ending in a single sigmoid unit.
    """
    model = Sequential([
        # Explicit Input object replaces the deprecated `input_length=` Embedding argument.
        tf.keras.Input(shape=(max_length,)),
        # mask_zero=True marks index 0 as padding so the RNN layers skip those
        # timesteps instead of running over the trailing zeros of padded sequences.
        # NOTE(review): assumes index 0 is used for padding only — confirm against the tokenizer.
        Embedding(vocab_size, embedding_dim, mask_zero=True),
        SimpleRNN(64, return_sequences=True),  # full sequence feeds the second RNN
        Dropout(0.3),
        SimpleRNN(32),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# Create model
embedding_dim = 100  # size of each word vector learned by the Embedding layer
print("Building Embedding RNN model...")
emb_rnn_model = create_embedding_rnn_model(vocab_size, embedding_dim, MAX_LEN_EMB)

# Display model architecture
emb_rnn_model.summary()

In [None]:
# Train the Embedding RNN model (the last 20% of the training data is held out for validation)
print("Training Embedding RNN model...")
history_emb = emb_rnn_model.fit(
    X_train_padded,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    verbose=1,
)

# Plot training history: loss on the left, accuracy on the right
emb_curves = history_emb.history
_, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(emb_curves['loss'], label='Training Loss')
loss_ax.plot(emb_curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Embedding RNN - Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

accuracy_ax.plot(emb_curves['accuracy'], label='Training Accuracy')
accuracy_ax.plot(emb_curves['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Embedding RNN - Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

plt.tight_layout()
plt.show()

## Model Evaluation and Comparison

Let's evaluate both models on the test set and compare their performance.

In [None]:
# Evaluate Word2Vec RNN model
print("=== Word2Vec RNN Model Evaluation ===")
w2v_loss, w2v_accuracy = w2v_rnn_model.evaluate(X_test_w2v, y_test, verbose=0)
# Sigmoid outputs above 0.5 are counted as the positive class (label 1)
w2v_probabilities = w2v_rnn_model.predict(X_test_w2v)
w2v_predictions = (w2v_probabilities > 0.5).astype(int).flatten()

print(f"Test Loss: {w2v_loss:.4f}")
print(f"Test Accuracy: {w2v_accuracy:.4f}")
print("\nClassification Report:")
w2v_report = classification_report(y_test, w2v_predictions, target_names=['Negative', 'Positive'])
print(w2v_report)

# Evaluate Embedding RNN model
print("\n=== Embedding RNN Model Evaluation ===")
emb_loss, emb_accuracy = emb_rnn_model.evaluate(X_test_padded, y_test, verbose=0)
emb_probabilities = emb_rnn_model.predict(X_test_padded)
emb_predictions = (emb_probabilities > 0.5).astype(int).flatten()

print(f"Test Loss: {emb_loss:.4f}")
print(f"Test Accuracy: {emb_accuracy:.4f}")
print("\nClassification Report:")
emb_report = classification_report(y_test, emb_predictions, target_names=['Negative', 'Positive'])
print(emb_report)

In [None]:
# Create confusion matrices
cm1 = confusion_matrix(y_test, w2v_predictions)  # Word2Vec RNN
cm2 = confusion_matrix(y_test, emb_predictions)  # Embedding RNN

class_labels = ['Negative', 'Positive']
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One heatmap per model: (matrix, colour map, panel title)
heatmap_specs = [
    (cm1, 'Blues', 'Word2Vec RNN Confusion Matrix'),
    (cm2, 'Greens', 'Embedding RNN Confusion Matrix'),
]
for ax, (matrix, colormap, title) in zip(axes, heatmap_specs):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colormap,
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# Comparison summary: one row per model with its test-set accuracy and loss
comparison_data = {
    'Model': ['Word2Vec RNN', 'Embedding RNN'],
    'Test Accuracy': [w2v_accuracy, emb_accuracy],
    'Test Loss': [w2v_loss, emb_loss],
    # The Word2Vec vectors are trained in this notebook on the review corpus,
    # not loaded from an external pre-trained model, so the label says so.
    'Approach': ['Corpus-trained Word2Vec', 'End-to-end Embedding']
}

comparison_df = pd.DataFrame(comparison_data)
print("=== Model Comparison Summary ===")
print(comparison_df.to_string(index=False))

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Accuracy comparison
axes[0].bar(comparison_df['Model'], comparison_df['Test Accuracy'], 
           color=['skyblue', 'lightgreen'], alpha=0.7)
axes[0].set_title('Model Accuracy Comparison')
axes[0].set_ylabel('Test Accuracy')
axes[0].set_ylim(0, 1)  # accuracy is a fraction, so pin the axis to [0, 1]
# Write each value just above its bar
for i, v in enumerate(comparison_df['Test Accuracy']):
    axes[0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

# Loss comparison
axes[1].bar(comparison_df['Model'], comparison_df['Test Loss'], 
           color=['coral', 'lightpink'], alpha=0.7)
axes[1].set_title('Model Loss Comparison')
axes[1].set_ylabel('Test Loss')
for i, v in enumerate(comparison_df['Test Loss']):
    axes[1].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Test predictions on sample reviews
def predict_sentiment(text, model_type='embedding'):
    """Predict sentiment for a given raw review text.

    The text is cleaned with the same steps as the training corpus (strip HTML
    tags, keep letters only, lowercase, drop stop words, lemmatize) and then
    scored by one of the two trained models.

    Args:
        text: Raw review text.
        model_type: 'word2vec' selects the Word2Vec RNN; any other value
            (default 'embedding') selects the Embedding RNN.

    Returns:
        Tuple (score, label): score is the model's sigmoid output as a Python
        float, label is 'Positive' when score > 0.5 and 'Negative' otherwise.
    """
    # Preprocess text same as training data. This was duplicated verbatim in both
    # branches; doing it once keeps the two model paths from drifting apart.
    cleaned = re.sub(r'<.*?>', ' ', text)
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)
    words = cleaned.lower().split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    processed_text = ' '.join(words)

    if model_type == 'word2vec':
        # Convert to word2vec sequence and add the batch dimension
        sequence = text_to_word2vec_sequence(processed_text, w2v_model, MAX_LEN)
        sequence = np.expand_dims(sequence, axis=0)
        active_model = w2v_rnn_model
    else:  # embedding
        # Convert to a padded integer sequence
        sequence = tokenizer.texts_to_sequences([processed_text])
        sequence = pad_sequences(sequence, maxlen=MAX_LEN_EMB, padding='post', truncating='post')
        active_model = emb_rnn_model

    # verbose=0 suppresses the per-call progress bar; [0][0] picks the single
    # sigmoid output of the single sample in the batch.
    prediction = float(active_model.predict(sequence, verbose=0)[0][0])

    return prediction, 'Positive' if prediction > 0.5 else 'Negative'

# Test on sample reviews
# Hand-written examples, not taken from the dataset
sample_reviews = [
    "This product is absolutely amazing! I love it so much.",
    "Terrible quality, worst purchase ever. Complete waste of money.",
    "It's okay, nothing special but does the job.",
    "Outstanding flavor and excellent packaging. Highly recommended!",
    "Poor quality control and bad customer service."
]

print("=== Sample Predictions ===")
# Score every sample with both models so their outputs can be compared side by side
for i, review in enumerate(sample_reviews):
    w2v_score, w2v_sentiment = predict_sentiment(review, 'word2vec')
    emb_score, emb_sentiment = predict_sentiment(review, 'embedding')
    
    print(f"\nReview {i+1}: {review}")
    print(f"Word2Vec RNN:   {w2v_sentiment} (score: {w2v_score:.3f})")
    print(f"Embedding RNN:  {emb_sentiment} (score: {emb_score:.3f})")

## Key Insights and Conclusions

### Approach Comparison:

**1. Word2Vec + RNN:**
- **Pros:**
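  - Uses semantic relationships learned by a separate, unsupervised Word2Vec training step
  - Good for capturing word meanings
  - Can leverage external word knowledge if externally pre-trained vectors are loaded instead (this notebook trains Word2Vec on the review corpus itself)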
  
- **Cons:**
  - More complex preprocessing
  - Fixed embedding dimension
  - Separate training steps

**2. Embedding Layer + RNN:**
- **Pros:**
  - End-to-end training
  - Simpler preprocessing
  - Embeddings learned specifically for the task
  - More efficient memory usage
  
- **Cons:**
  - Needs sufficient training data
  - No external knowledge
  - May overfit with small datasets

### Performance Notes:
- Both models show competitive performance for sentiment analysis
- The embedding approach is generally more straightforward for most applications
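- Externally pre-trained Word2Vec vectors can be beneficial when you have limited training data or want to leverage external knowledge (the Word2Vec model in this notebook is trained on the review corpus itself)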
- RNN models capture sequential patterns in the text effectively

### Recommendations:
- For production systems, consider using more advanced architectures like LSTM or GRU
- Experiment with different embedding dimensions and RNN units
- Consider using pre-trained embeddings (GloVe, FastText) for better initialization
- Implement proper cross-validation for robust model evaluation