In [8]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [10]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## One Hot Encoding

In [6]:
data = pd.read_csv("/content/DATASET.csv")
data.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",POSITIVE
1,Please ignore previous negative rating. This a...,POSITIVE
2,"This pop-up ""Get the best Spotify experience o...",NEGATIVE
3,Really buggy and terrible to use as of recently,NEGATIVE
4,Dear Spotify why do I get songs that I didn't ...,NEGATIVE


In [7]:
sent = data["Review"]

In [12]:
corpus = []
lemmatizer = WordNetLemmatizer()
for i in range(len(sent)):
  rew = re.sub("[^a-zA-Z]"," ",str(sent[i]))
  rew = rew.lower()
  rew = rew.split()
  rew = [lemmatizer.lemmatize(word) for word in rew if word not in stopwords.words("english")]
  rew = " ".join(rew)
  if rew not in corpus:
    corpus.append(rew)

In [13]:
corpus

['great music service audio high quality app easy use also quick friendly support',
 'please ignore previous negative rating app super great give five star',
 'pop get best spotify experience android annoying please let get rid',
 'really buggy terrible use recently',
 'dear spotify get song put playlist shuffle play',
 'player control sometimes disappear reason app restart forgets playing fix issue',
 'love selection lyric provided song listening',
 'still extremely slow changing storage external sd card convinced done purpose spotify know issue done nothing solve time changed sd card faster read write speed samsung brand please add like song never appear search playlist',
 'great app best mp music app ever used one problem play song find song despite app wonderful recommend best',
 'deleting app following reason app failing business model whether streaming service like consumer want pay music fully ad successively upon logging single song much closed app ad number patient way profit 

In [16]:
voc_size = 10000
onehot_repr = [one_hot(word,voc_size) for word in corpus]

In [17]:
onehot_repr[0]

[9175, 5237, 8023, 9622, 1039, 8000, 6589, 9796, 1995, 9287, 4199, 7595, 7939]

## word embedding

In [19]:
max_length = 0
for i in range(len(onehot_repr)):
  if len(onehot_repr[i]) > max_length:
    max_length = len(onehot_repr[i])
print(max_length)

145


### padding

In [20]:
embedded_docs = pad_sequences(onehot_repr,padding="pre",maxlen=max_length)

In [21]:
embedded_docs

array([[   0,    0,    0, ..., 4199, 7595, 7939],
       [   0,    0,    0, ..., 1833, 9591, 1702],
       [   0,    0,    0, ..., 5048,  421, 1363],
       ...,
       [   0,    0,    0, ..., 7578, 3421, 8335],
       [   0,    0,    0, ..., 1345, 4837, 6357],
       [   0,    0,    0, ..., 5237, 2876, 8505]], dtype=int32)

In [22]:
dim = 150

## neural network with embedded layer

In [23]:
model = Sequential()
model.add(Embedding(voc_size,dim,input_length=max_length))
model.compile("adam","mse")



In [24]:
model.summary()

In [25]:
model.predict(embedded_docs[0])

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  


array([[-0.03147097,  0.02162129,  0.03535876, ...,  0.04446328,
         0.04834397, -0.01835736],
       [-0.03147097,  0.02162129,  0.03535876, ...,  0.04446328,
         0.04834397, -0.01835736],
       [-0.03147097,  0.02162129,  0.03535876, ...,  0.04446328,
         0.04834397, -0.01835736],
       ...,
       [-0.01660918, -0.0139712 , -0.01941265, ...,  0.01095706,
         0.01834333, -0.00694867],
       [ 0.00222487, -0.04813289, -0.01382821, ...,  0.01610183,
        -0.0417236 ,  0.03009428],
       [ 0.04976853, -0.01687012,  0.0357174 , ...,  0.03430435,
         0.01692535, -0.0255995 ]], dtype=float32)

In [26]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       9175, 5237, 8023, 9622, 1039, 8000, 6589, 9796, 1995, 9287, 4199,
       7595, 7939], dtype=int32)