<a href="https://colab.research.google.com/github/SamhithVkk/dl-project/blob/main/dl-project-sentiment-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries.
# The download loop below splits each entry on ':' and URL-decodes the second
# part, so the URL itself must stay percent-encoded here.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters,
# so it looks like a time-limited signed link — regenerate it if the download
# reports an HTTP error.
DATA_SOURCE_MAPPING = '515k-hotel-reviews-data-in-europe1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4674508%2F7949045%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240424%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240424T155817Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D422421ceb8f2659e01e32baca55a7733b33ed3784db5d4388ccb1ec860fb635998d6a71d4ec4f3c1223b2056d29899763550fe57ed0f74b82d126e035279132a0b1c0a966d9d1238f70dbc9f086c46e589071cc3472a34ea2f3df61a94125eb093b1df8310cae85fbea0998067304198bfddfb895018f3cba83e6e013bdfee9e976e98eb47ff6837dfeee8d5df4242ae5a3ee1765c2bd52604735028ea0314352a9a224ce7c56b9d7dfca4941e4ff632ef636cd1de4e838b48f9682e2285b5dac2f14a866a93e59915dd63ee8393d4c0e57de71648567684e8095b93cfb549db165176529f1cae425db1a87d42e1f595a4157797ab129d5e1b7d0ea4b9996af9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured Kaggle data source and unpack it under
# KAGGLE_INPUT_PATH/<directory>.  A failed download is reported and skipped so
# the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>".  Split only once so a
    # literal ':' further along the entry cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The server may omit Content-Length; fall back to a byte counter
            # without a progress bar instead of failing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_text = content_length if content_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            # Stream the body chunk by chunk until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk and rewind so the archive readers
            # see the complete file from its beginning.
            tfile.flush()
            tfile.seek(0)
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with sources you trust.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive handle so the `tarfile`
                # module is not rebound, and read from the open handle rather
                # than reopening the temporary file by name.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
import string

# Function to clean text
def clean_text(text):
    """Normalize a review string for tokenization.

    The text is lowercased, every character found in ``string.punctuation``
    and every digit character is dropped, and runs of whitespace are
    collapsed to single spaces (leading/trailing whitespace is removed).

    Args:
        text: The raw review text. Non-string values (for example ``None`` or
            a float NaN coming from a missing DataFrame cell) are treated as
            an empty review instead of raising ``AttributeError``.

    Returns:
        The cleaned text; ``""`` when nothing remains or the input was not a
        string.
    """
    if not isinstance(text, str):
        return ""
    # One pass over the lowercased text removes punctuation and digits.
    kept_chars = [
        char
        for char in text.lower()
        if char not in string.punctuation and not char.isdigit()
    ]
    # split() with no arguments also discards the whitespace left behind by
    # the removed characters.
    return ' '.join(''.join(kept_chars).split())

# Read the data
reviews_df = pd.read_csv("/kaggle/input/515k-hotel-reviews-data-in-europe1/Hotel_Reviews.csv")

# Create the review text and label.
# The two text columns are joined with a space: plain concatenation glued the
# last word of the negative part onto the first word of the positive part and
# produced tokens that exist in neither. Missing cells become empty strings so
# the sum never turns into NaN.
reviews_df["review"] = (
    reviews_df["Negative_Review"].fillna("")
    + " "
    + reviews_df["Positive_Review"].fillna("")
)
reviews_df["sentiment"] = np.where(reviews_df["Reviewer_Score"] < 5, 1, 0)  # 1 for negative, 0 for positive

# Clean the text data
reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(reviews_df["review_clean"], reviews_df["sentiment"], test_size=0.2, random_state=42)

# Carve a validation set out of the training data. Early stopping selects the
# best weights on this set, so the test set stays untouched until the final
# evaluation and its score is not biased by model selection.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Tokenize the text data (the vocabulary is learned from the training texts only)
max_features = 10000  # Maximum number of words to keep based on frequency
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
maxlen = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Define the model
# NOTE(review): recurrent_dropout > 0 presumably keeps the LSTM off the fast
# cuDNN kernel, which may explain the long epochs — confirm against the Keras
# LSTM documentation before changing it.
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_dim),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, monitoring the held-out validation set
epochs = 10
batch_size = 64
history = model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val_pad, y_val), callbacks=[early_stopping])

# Evaluate the model on the test set, which played no part in training or
# early stopping
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


2024-03-27 03:46:22.850577: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 03:46:22.850676: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 03:46:22.980223: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m927s[0m 143ms/step - accuracy: 0.9572 - loss: 0.1609 - val_accuracy: 0.9641 - val_loss: 0.1017
Epoch 2/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m921s[0m 143ms/step - accuracy: 0.9651 - loss: 0.0959 - val_accuracy: 0.9648 - val_loss: 0.0967
Epoch 3/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m917s[0m 142ms/step - accuracy: 0.9671 - loss: 0.0893 - val_accuracy: 0.9650 - val_loss: 0.0986
Epoch 4/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m916s[0m 142ms/step - accuracy: 0.9695 - loss: 0.0820 - val_accuracy: 0.9646 - val_loss: 0.0980
Epoch 5/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m944s[0m 146ms/step - accuracy: 0.9703 - loss: 0.0805 - val_accuracy: 0.9640 - val_loss: 0.0993
[1m3224/3224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 42ms/step - accuracy: 0.9647 - loss: 0.0966
Test Loss: 0.0967472642660141
Test Accuracy: 0

In [None]:

# Free-text review to classify with the trained model.
input_review = "This hotel is bad and it is so uncomfortable but it is good for the price they have given than other hotels"

# Apply the same preprocessing used for the training data: clean, map words
# to token ids, then pad to the fixed sequence length.
cleaned_review = clean_text(input_review)
review_seq = tokenizer.texts_to_sequences([cleaned_review])
review_pad = pad_sequences(review_seq, maxlen=maxlen, padding='post')

# The model outputs one sigmoid score per input row; take the single score.
# Label 1 was assigned to negative reviews, so higher scores mean "negative".
scores = model.predict(review_pad)
prediction = scores[0][0]

# Report the predicted class using 0.5 as the decision threshold.
verdict = (
    "The review is predicted to be negative."
    if prediction >= 0.5
    else "The review is predicted to be positive."
)
print(verdict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
The review is predicted to be negative.
