<a href="https://colab.research.google.com/github/Surajpatra700/Deep-Learning_Project/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ********************************************** RECURRENT NEURAL NETWORK (RNN) *******************************************

In [None]:
# USE CASES OF RNN:
# 1. Autocompletion of text
# 2. Language Translation
# 3. NER: Named Entity Recognition, i.e. recognizing names, companies, times, dates, etc. in the given text
# 4. Sentiment analysis, i.e. assigning a rating (e.g. positive/negative) to something based on its text

In [43]:
# WORD EMBEDDING:

# A word embedding is a numerical representation of words in a way that captures their semantic meaning and relationships
# by placing similar words closer together in a multi-dimensional space.
# It's used in natural language processing to enhance the understanding and processing of text data by turning words into vectors
# with context-rich information.

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [None]:
# Tiny hand-labelled corpus: five positive reviews followed by five negative ones.
positive_reviews = ["nice food", "amazing restaurant", "too good", "just loved it!", "will go again"]
negative_reviews = ["horrible food", "never go there", "poor service", "poor quality", "needs improvement"]
reviews = positive_reviews + negative_reviews

# Labels aligned with `reviews` by position: 1 = positive, 0 = negative.
sentiment = np.array([1] * len(positive_reviews) + [0] * len(negative_reviews))

In [None]:
# Despite its name, one_hot() does not build a one-hot vector: it hashes each word of the
# text to an integer index that is at least 1 and below the given size (30 here).
# Index 0 is never assigned, and two different words may hash to the same index.
one_hot("amazing restaurant",30) # returns one integer index per word in the text

[16, 8]

In [None]:
# Turn every review into its list of hashed word indices (same size of 30 as above).
encoded_reviews = [one_hot(review, 30) for review in reviews]
print(encoded_reviews)

[[26, 21], [16, 8], [29, 13], [15, 29, 25], [1, 9, 15], [17, 21], [23, 9, 1], [27, 13], [27, 11], [6, 8]]


In [None]:
# The encoded reviews have different lengths; pad them all to a common length so they
# can be stacked into a single 2-D array. padding='post' puts the zeros after the words.
max_length = 3
padded_reviews = pad_sequences(
    encoded_reviews,
    padding='post',
    maxlen=max_length,
)
padded_reviews

array([[26, 21,  0],
       [16,  8,  0],
       [29, 13,  0],
       [15, 29, 25],
       [ 1,  9, 15],
       [17, 21,  0],
       [23,  9,  1],
       [27, 13,  0],
       [27, 11,  0],
       [ 6,  8,  0]], dtype=int32)

In [None]:
# Every word index is mapped to a learned vector with this many dimensions.
embeded_vector_size = 4

# Embedding -> Flatten -> single sigmoid unit:
#   * Embedding looks up one vector per word index (30 possible indices),
#   * Flatten concatenates the max_length word vectors of a review into one flat vector,
#   * Dense(1, sigmoid) outputs the probability that the review is positive.
model = Sequential([
    Embedding(30, embeded_vector_size, input_length=max_length, name="embedding"),
    Flatten(),
    Dense(1, activation="sigmoid"),
])

In [None]:
# Features are the padded index sequences; targets are the 0/1 sentiment labels.
X, y = padded_reviews, sentiment

In [None]:
# Binary classification setup: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 4)              120       
                                                                 
 flatten (Flatten)           (None, 12)                0         
                                                                 
 dense (Dense)               (None, 1)                 13        
                                                                 
Total params: 133 (532.00 Byte)
Trainable params: 133 (532.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 50 passes over the ten reviews; verbose=0 silences the per-epoch log.
model.fit(x=X, y=y, epochs=50, verbose=0)

<keras.src.callbacks.History at 0x7e75e437fe20>

In [None]:
# Evaluate the model
# NOTE: X and y are the same ten reviews the model was trained on, so this accuracy only
# shows that the model fits the training data; it says nothing about unseen reviews.

loss, accuracy = model.evaluate(X,y)
accuracy



1.0

In [None]:
# Extract the learned embedding matrix from the layer named "embedding".
# Row i holds the vector for word index i (one row per possible index, 4 values per row).
weights = model.get_layer("embedding").get_weights()[0]
weights

array([[ 0.00922214,  0.00366741, -0.05300897, -0.01109057],
       [ 0.08126983,  0.01227798, -0.01108428, -0.07547533],
       [-0.04564714,  0.02943698,  0.0342488 ,  0.01049475],
       [-0.03906763,  0.0495939 , -0.02085229,  0.01090698],
       [-0.01771837,  0.03307786, -0.0395572 ,  0.04566247],
       [ 0.03396993,  0.04108464, -0.02212662, -0.03565456],
       [-0.02685024, -0.00999167,  0.06065805,  0.0293018 ],
       [ 0.00060464, -0.00534008, -0.03588202,  0.04528115],
       [ 0.02768254, -0.02508093, -0.05448046,  0.02807391],
       [ 0.00869436, -0.06548239,  0.0459359 , -0.02288868],
       [-0.04712031, -0.04516357,  0.02125481,  0.01902356],
       [ 0.00946911,  0.03405423, -0.03616702, -0.00289804],
       [-0.01028123, -0.00669817,  0.04525627, -0.0213056 ],
       [-0.04940357, -0.06197885,  0.00162871, -0.00339111],
       [ 0.00255262, -0.00198647, -0.02685828,  0.02007722],
       [-0.01943984,  0.10052533,  0.08432616, -0.00069059],
       [ 0.0946813 ,  0.

In [None]:
# Learned embedding vector for word index 20.
weights[20]

array([ 0.01275193, -0.00847975,  0.0008108 , -0.01102587], dtype=float32)

In [None]:
# Learned embedding vector for word index 16.
weights[16]

array([ 0.0946813 ,  0.08539467, -0.07075801, -0.09737893], dtype=float32)

In [None]:
# ******************************************************** WORD 2 VEC ***********************************

# Word2Vec is a popular natural language processing (NLP) technique that learns distributed representations (word embeddings)
# of words from large text corpora. It uses neural networks to convert words into dense vectors that capture semantic and
# contextual relationships. Word2Vec has two main architectures: Continuous Bag of Words (CBOW), which predicts
# a target word from its surrounding context words, and Skip-gram, which predicts the surrounding context words from a target word.
# These word embeddings are valuable for various NLP tasks, enabling better understanding of word semantics and improving
# the performance of tasks like text classification, sentiment analysis, and machine translation.

In [44]:
import gensim # gensim is a popular NLP library
import pandas as pd

In [45]:
# The dataset lives on Google Drive, so Drive must be mounted before it can be read.
# Mounting here keeps this cell runnable on a fresh runtime (Restart & Run All);
# calling mount again when Drive is already mounted is harmless.
from google.colab import drive
drive.mount('/content/drive')

# lines=True tells pandas to parse the file as JSON Lines, i.e. one JSON object (one review) per line.
df = pd.read_json("/content/drive/MyDrive/datasets/Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This has to run before any cell that reads from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
# (number of rows, number of columns) of the loaded reviews DataFrame.
df.shape

(194439, 9)

In [27]:
# Raw text of the first review, before any preprocessing.
df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [29]:
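# Manual lowercasing is left disabled: gensim.utils.simple_preprocess (used below) already returns lowercase tokens.
# df['reviewText'] = df["reviewText"].apply(lambda x: x.lower())
# df.reviewText[0]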

"they look good and stick good! i just don't like the rounded shape because i was always bumping it and siri kept popping up and it was irritating. i just won't buy a product like this again"

In [46]:
# simple_preprocess converts a document into a list of lowercase tokens and drops punctuation.
# Tokens that are too short or too long are discarded (by default fewer than 2 or more than
# 15 characters), which is why "I" and the "t" of "don't" are missing from the output.
# It uses gensim.utils.tokenize internally.
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again")

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [32]:
# Tokenize every review: each row becomes a list of lowercase word tokens.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [33]:
# Create an untrained Word2Vec model; the vocabulary and training come in later cells.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the Keras Sequential model.
model = gensim.models.Word2Vec(
    window=10, # maximum distance between the current word and a predicted word within a sentence (up to 10 words on each side)
    min_count=2, # ignore every word that occurs fewer than 2 times in the whole corpus
    workers=4 # number of worker threads used for training
)

In [34]:
# Building the vocabulary scans the tokenized reviews once and collects the unique words
# (with their counts) that the model will learn vectors for.
# progress_per=1000 controls how many reviews are processed between progress log messages.

model.build_vocab(review_text, progress_per=1000)

In [35]:
# Number of training passes over the corpus the model is configured for (not set above, so this is the library default).
model.epochs

5

In [36]:
model.corpus_count # total number of samples (tokenized reviews) seen while building the vocabulary

194439

In [37]:
# Train the word vectors on the tokenized reviews for the configured number of epochs.
# Returns a pair: (effective words trained on, after dropping unknown words; total raw words processed).
model.train(review_text, total_examples=model.corpus_count, epochs = model.epochs)

(61501005, 83868975)

In [38]:
# Persist the trained model to the current working directory so it can be reloaded without retraining.
model.save("./word2vec-Cell_Phones_and_Accessories.model")

In [40]:
# model.wv holds the trained word vectors. most_similar returns the words whose vectors are
# closest to the query word, as (word, cosine similarity) pairs, 10 by default.
model.wv.most_similar("bad") # wv = the model's word vectors

[('shabby', 0.6789193153381348),
 ('terrible', 0.6707874536514282),
 ('horrible', 0.6224038600921631),
 ('good', 0.578141987323761),
 ('funny', 0.5644428133964539),
 ('crappy', 0.5597448348999023),
 ('lame', 0.5380252003669739),
 ('poor', 0.5373563170433044),
 ('okay', 0.5315224528312683),
 ('sad', 0.5285705327987671)]

In [42]:
# Cosine similarity between the vectors of two words (higher means more similar usage in the corpus)

model.wv.similarity(w1="bad",w2="shabby")

0.67891926