# Vanessa Williams
# Week 10

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
import numpy as np
import pandas as pd

# Load the dataset
data_path = '/Users/vanessawilliams/Desktop/Vanessa_Williams/hotel_reviews.csv'
data = pd.read_csv(data_path)

# Preprocessing: Use 'Description' column for reviews and encode 'Is_Response' column
data['Is_Response'] = data['Is_Response'].apply(lambda x: 1 if x == 'not happy' else 0)
reviews = data['Description'].values  # Updated to 'Description'
labels = data['Is_Response'].values

# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, maxlen=100)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Model definition
model = models.Sequential([
    layers.Embedding(input_dim=10000, output_dim=64, input_length=100),
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    layers.Dense(24, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluating the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions (optional)
predictions = model.predict(X_test[:10])
print("Predictions on sample test data:", predictions)

Epoch 1/5




[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 50ms/step - accuracy: 0.7700 - loss: 0.4782 - val_accuracy: 0.8645 - val_loss: 0.3323
Epoch 2/5
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 48ms/step - accuracy: 0.8855 - loss: 0.2872 - val_accuracy: 0.8655 - val_loss: 0.3244
Epoch 3/5
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 50ms/step - accuracy: 0.9090 - loss: 0.2359 - val_accuracy: 0.8619 - val_loss: 0.3489
Epoch 4/5
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 49ms/step - accuracy: 0.9274 - loss: 0.1906 - val_accuracy: 0.8551 - val_loss: 0.3695
Epoch 5/5
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 49ms/step - accuracy: 0.9396 - loss: 0.1668 - val_accuracy: 0.8504 - val_loss: 0.3917
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8481 - loss: 0.4046
Test Accuracy: 0.8504
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86

# Sentiment Analysis on Hotel Reviews

## Project Overview
In this assignment, we performed sentiment analysis on hotel reviews to classify them as either "happy" or "not happy." We utilized a neural network built with TensorFlow and Keras for this binary classification task.

## Steps and Approach

### 1. Data Preparation
- We loaded the dataset from the specified CSV file.
- The reviews were stored in the `Description` column, and the labels (`Is_Response`) indicated sentiment, with "not happy" reviews labeled as `1` and "happy" reviews labeled as `0`.

### 2. Text Preprocessing
- We tokenized the text data using Keras’s `Tokenizer`, limited to 10,000 words and a maximum sequence length of 100.
- The text was converted into sequences of integers, with padding applied to ensure uniform sequence lengths.

### 3. Model Building
- We built a Sequential neural network model with:
  - An Embedding layer to convert words into dense vector representations.
  - Two LSTM layers to capture sequential dependencies.
  - A Dense layer with ReLU activation for feature extraction.
  - A final Dense layer with sigmoid activation for binary classification.
- We compiled the model using binary cross-entropy loss and the Adam optimizer.

### 4. Training and Evaluation
- The model was trained for 5 epochs, achieving an accuracy of approximately **93.96%** on the training data.
- On the test data, the model reached a validation accuracy of **85.04%** with a final loss of **0.4046**.

### 5. Prediction
- We generated predictions on a sample subset of the test data to observe the model’s output. Each prediction indicates the probability that a review is "not happy."

## Summary of Results
- **Final Test Accuracy**: 85.04%
- **Model Performance**: The model showed strong performance, with high accuracy on both training and validation datasets.
- **Sample Predictions**: The predicted probabilities illustrate the model's ability to distinguish between "happy" and "not happy" sentiments.
