In [None]:
!kaggle datasets download -d tarkkaanko/amazon
!unzip amazon.zip

Dataset URL: https://www.kaggle.com/datasets/tarkkaanko/amazon
License(s): CC-BY-NC-SA-4.0
amazon.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  amazon.zip
replace amazon_reviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Load the Amazon reviews dataset
df = pd.read_csv('amazon_reviews.csv')

# Print the column names to verify
print(df.columns)

# Drop missing values - adjust column names if necessary based on the output above
df.dropna(subset=['reviewText', 'overall'], inplace=True) # Example: Assuming 'reviewID' and 'overall' are the actual column names

# Convert ratings to sentiment labels (negative: 0, neutral: 1, positive: 2)
def map_sentiment(rating):
    if rating == 3:
        return 1  # Neutral
    elif rating > 3:
        return 2  # Positive
    else:
        return 0  # Negative

df['sentiment'] = df['overall'].apply(map_sentiment) # Example: Using 'overall' to calculate sentiment

# Keep only relevant columns
df = df[['reviewText', 'sentiment']] # Example: Assuming 'reviewText' is the relevant column for reviews

Index(['Unnamed: 0', 'reviewerName', 'overall', 'reviewText', 'reviewTime',
       'day_diff', 'helpful_yes', 'helpful_no', 'total_vote',
       'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound'],
      dtype='object')


In [None]:
# Tokenizing and padding text data
max_words = 20000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['reviewText'])

# Convert reviews to sequences
sequences = tokenizer.texts_to_sequences(df['reviewText'])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Prepare labels
labels = df['sentiment'].values

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [None]:
# Build the RNN model
model = tf.keras.models.Sequential()

# Embedding layer
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len))

# Simple RNN layer
model.add(tf.keras.layers.SimpleRNN(64))

# Output layer with 3 classes (negative, neutral, positive)
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()



In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 55ms/step - accuracy: 0.8514 - loss: 0.5183 - val_accuracy: 0.9125 - val_loss: 0.3498
Epoch 2/5
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.9043 - loss: 0.3433 - val_accuracy: 0.9095 - val_loss: 0.3133
Epoch 3/5
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9507 - loss: 0.1690 - val_accuracy: 0.9176 - val_loss: 0.2933
Epoch 4/5
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 66ms/step - accuracy: 0.9886 - loss: 0.0567 - val_accuracy: 0.9176 - val_loss: 0.3141
Epoch 5/5
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 61ms/step - accuracy: 0.9980 - loss: 0.0157 - val_accuracy: 0.9166 - val_loss: 0.3552


In [None]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9176 - loss: 0.3105
Test accuracy: 0.9165818691253662


In [None]:
# def predict_sentiment(review):
#     # Convert the review to a sequence
#     sequence = tokenizer.texts_to_sequences([review])
#     padded_sequence = pad_sequences(sequence, maxlen=max_len)

#     # Predict the sentiment
#     prediction = model.predict(padded_sequence)

#     # Return the sentiment class with the highest probability
#     sentiment = np.argmax(prediction)

#     return ['Negative', 'Neutral', 'Positive'][sentiment]

# # Example usage
# review_text = "The phone is amazing, love the battery life!"
# print(predict_sentiment(review_text))  # Output: Positive

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to extract reviews from an Amazon product page and store them in a dataframe
def scrape_amazon_reviews(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    # Send a request to the URL
    response = requests.get(url, headers=headers)

    # Parse the page content
    soup = BeautifulSoup(response.content, 'lxml')

    # Find all reviews
    # If the reviews are under a different tag or class, update the query
    reviews = soup.find_all('div', {'class': 'a-row a-spacing-small review-data'})


    print(f"Found {len(reviews)} reviews")

    review_texts = []
    for review in reviews:
        review_text = review.get_text().strip()
        review_texts.append(review_text)

    # Create a dataframe with the reviews regardless of whether reviews were found or not
    reviews_df = pd.DataFrame(review_texts, columns=['ReviewText'])

    if not review_texts:
        print("No reviews found. The page structure might have changed.")

    return reviews_df

# Example usage with an Amazon product URL (replace with actual URL)
url = 'https://www.amazon.in/Airdopes-141-Playtime-Resistance-Bluetooth/dp/B09N3ZNHTY/ref=s9_acsd_al_ot_c2_x_2_t?_encoding=UTF8&pf_rd_m=A21TJRUUN4KGV&pf_rd_s=merchandised-search-8&pf_rd_r=N3CV8A3HBP0C3G2MBVZ5&pf_rd_p=c1af48d3-bf91-454b-b64e-bfa6ef7d10a0&pf_rd_t=&pf_rd_i=1388921031'  # Example URL, replace with the correct one
reviews_df = scrape_amazon_reviews(url)

# Display the dataframe
print(reviews_df)

Found 13 reviews
                                           ReviewText
0   Pros>Only sound quality is good.Durable. They'...
1   Pros:1. The level of volume it gets to, is jus...
2   I am writing a true review after around a week...
3   I'm writing this review after 2 years of buyin...
4   Sound quality is good. But within 12 months th...
5   It is a very good product and all the features...
6   Great for listening to music or watching a mov...
7   After 1 year one was not working properly\nRea...
8   I have been using boAt headphones for a year n...
9                             Good Quality\nRead more
10  its a amazing product, worth for money, awesom...
11              Good item, I like it 🔥boat\nRead more
12                              i like it!\nRead more


In [None]:
# Assuming model, tokenizer, and max_len are defined and loaded

# Function to predict sentiment of multiple reviews and store them in the dataframe
def predict_sentiment_for_reviews(reviews_df, model, tokenizer, max_len):
    predictions = []

    for review in reviews_df['ReviewText']:
        sequence = tokenizer.texts_to_sequences([review])
        padded_sequence = pad_sequences(sequence, maxlen=max_len)
        prediction = model.predict(padded_sequence)
        sentiment = np.argmax(prediction)
        sentiment_label = ['Negative', 'Neutral', 'Positive'][sentiment]
        predictions.append(sentiment_label)

    # Add the sentiment predictions to the dataframe
    reviews_df['PredictedSentiment'] = predictions
    return reviews_df

# Predict sentiment for the scraped reviews
reviews_with_sentiment = predict_sentiment_for_reviews(reviews_df, model, tokenizer, max_len=100)

# Display the dataframe with reviews and predicted sentiment
print(reviews_with_sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
                                           ReviewText Predicted