In [22]:
# Build a natural language processing (NLP) model to perform sentiment analysis on social media posts or product reviews
# This File is built and run on Google Colab
# Dataset is Amazon Reviews, containting over 36

In [1]:
import bz2
from tqdm import tqdm
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix,classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,SpatialDropout1D,Embedding
from keras.callbacks import ModelCheckpoint
import pickle

In [2]:
!kaggle datasets download -d bittlingmayer/amazonreviews
!unzip amazonreviews.zip

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 96% 473M/493M [00:03<00:00, 215MB/s]
100% 493M/493M [00:03<00:00, 147MB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [4]:
# Reading the text

def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('/content/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('/content/test.ft.txt.bz2')

In [36]:
train_texts = train_texts[:500000]
train_labels = train_labels[:500000]
test_texts = test_texts[:100000]
test_labels = test_labels[:100000]

In [37]:
# Print the data amount

print('Train Length',len(train_texts))
print('Train Label Length',len(train_labels))
print('Test Length',len(test_texts))
print('Test Label Length',len(test_labels))

Train Length 100000
Train Label Length 100000
Test Length 20000
Test Label Length 20000


In [38]:
# Check any train data value

# Label = 0: Negative
# Label = 1: Positive

n=5
print(train_labels[n])
train_texts[n]

1


"an absolute masterpiece: I am quite sure any of you actually taking the time to read this have played the game at least once, and heard at least a few of the tracks here. And whether you were aware of it or not, Mitsuda's music contributed greatly to the mood of every single minute of the whole game.Composed of 3 CDs and quite a few songs (I haven't an exact count), all of which are heart-rendering and impressively remarkable, this soundtrack is one I assure you you will not forget. It has everything for every listener -- from fast-paced and energetic (Dancing the Tokage or Termina Home), to slower and more haunting (Dragon God), to purely beautifully composed (Time's Scar), to even some fantastic vocals (Radical Dreamers).This is one of the best videogame soundtracks out there, and surely Mitsuda's best ever. ^_^"

In [39]:
# Cleaning the data

def clean_text(text):
    # Remove non-alphanumeric characters and extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert multiple whitespace characters to a single space
    text = re.sub(r'\s+', ' ', text)
    # Lower Case Text
    text = text.lower()
    return text

In [52]:
# Convert Data to Dataframe

train=pd.DataFrame(train_texts)[0].apply(clean_text)
test=pd.DataFrame(test_texts)[0].apply(clean_text)
train.head()

Unnamed: 0,0
0,stuning even for the nongamer this sound track was beautiful it paints the senery in your mind so well i would recomend it even to people who hate vid game music i have played the game chrono cross but out of all of the games i have ever played it has the best music it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras it would impress anyone who cares to listen
1,the best soundtrack ever to anything im reading a lot of reviews saying that this is the best game soundtrack and i figured that id write a review to disagree a bit this in my opinino is yasunori mitsudas ultimate masterpiece the music is timeless and im been listening to it for years now and its beauty simply refuses to fadethe price tag on this is pretty staggering i must say but if you are going to buy any cd for this much money this is the only one that i feel would be worth every penny
2,amazing this soundtrack is my favorite music of all time hands down the intense sadness of prisoners of fate which means all the more if youve played the game and the hope in a distant promise and girl who stole the star have been an important inspiration to me personally throughout my teen years the higher energy tracks like chrono cross times scar time of the dreamwatch and chronomantique indefinably remeniscent of chrono trigger are all absolutely superb as wellthis soundtrack is amazing music probably the best of this composers work i havent heard the xenogears soundtrack so i cant say for sure and even if youve never played the game it would be worth twice the price to buy iti wish i could give it stars
3,excellent soundtrack i truly like this soundtrack and i enjoy video game music i have played this game and most of the music on here i enjoy and its truly relaxing and peacefulon disk one my favorites are scars of time between life and death forest of illusion fortress of ancient dragons lost fragment and drowned valleydisk two the draggons galdorb home chronomantique prisoners of fate gale and my girlfriend likes zelbessdisk three the best of the three garden of god chronopolis fates jellyfish sea burning orphange dragons prayer tower of stars dragon god and radical dreamers unstealable jeweloverall this is a excellent soundtrack and should be brought by those that like video game musicxander cross
4,remember pull your jaw off the floor after hearing it if youve played the game you know how divine the music is every single song tells a story of the game its that good the greatest songs are without a doubt chrono cross times scar magical dreamers the wind the stars and the sea and radical dreamers unstolen jewel translation varies this music is perfect if you ask me the best it can be yasunori mitsuda just poured his heart on and wrote it down on paper


In [54]:
voc_size = 20000
max_length = 100
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(train)

In [55]:
word_index = tokenizer.word_index
with open('/content/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [56]:
# Convert data to sequences for the model

train = tokenizer.texts_to_sequences(train)
train = pad_sequences(train, maxlen=max_length)
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=max_length)

In [57]:
train_lab=np.array([1 if i=='2' else 0 for i in train_labels])
test_lab=np.array([1 if i=='2' else 0 for i in test_labels])

In [58]:
# Define the model, I have used LSTM (Long Short Term Memory) model for NLP Task

model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=64, input_length=max_length))
model.add(LSTM(units=32, return_sequences=True))
model.add(SpatialDropout1D(rate=0.2))
model.add(LSTM(units=32))
model.add(Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 64)           1280000   
                                                                 
 lstm_4 (LSTM)               (None, 100, 32)           12416     
                                                                 
 spatial_dropout1d_2 (Spati  (None, 100, 32)           0         
 alDropout1D)                                                    
                                                                 
 lstm_5 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1300769 (4.96 MB)
Trainable params: 1300769 (4.96 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [59]:
# Start training

checkpoint_cb =ModelCheckpoint("amazon_model.h5", save_best_only=True)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(train, train_lab, epochs=1,
                    validation_split=.1,
                    callbacks=[checkpoint_cb])



  saving_api.save_model(


In [60]:
# Print Loss & Accuracy

loss,accuracy = model.evaluate(test, test_lab)
print("Loss:", loss)
print("Accuracy:", accuracy)

Loss: 1.6905813026824035e-05
Accuracy: 1.0


In [None]:
pd.DataFrame(history.history)

In [61]:
prediction=model.predict(test)
y_pred=np.where(prediction>=.5,1,0)
results=pd.DataFrame()
results['actual'], results['pred']=test_lab, y_pred



In [62]:
text = "WARNING: This is the worst book ever written: As most people readers know, Jack Higgins has written many fine action-packed books. However, this book reads like a twelve-year-old wrote it. The characters are so transparent they could be ghosts.The author and his publishing house must really need the money to rip of the public in this manner. This is the last book I read by this author. One can only rest on his laurels for so long. I'm glad this book was handed down to me or I would have written a really nasty review."
text = clean_text(text)
text = tokenizer.texts_to_sequences([text])
text = pad_sequences(text, maxlen=max_length)
prediction = model.predict(text)
y_pred=np.where(prediction>=.5,1,0)
y_pred



array([[0]])