In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import types

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional

In [4]:
# Print the version of every module bound to a global name in this notebook
def imports():
    """Yield each module object currently bound to a global name.

    The globals are snapshotted with ``list(...)`` before iterating. This
    generator is consumed lazily by a top-level ``for`` loop whose loop
    variable is itself a global; iterating the live dict would raise
    ``RuntimeError: dictionary changed size during iteration`` the first
    time the cell runs on a fresh kernel.
    """
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            yield val


for module in imports():
    try:
        print(f"{module.__name__} : {module.__version__}")
    except AttributeError:
        # Modules without a ``__version__`` attribute (e.g. ``types``) are
        # skipped; any other error is allowed to surface.
        pass

pandas : 1.2.4
numpy : 1.19.5
tensorflow : 2.3.0
tensorflow.keras : 2.4.0


In [5]:
# Load the user-review CSV into a DataFrame (path has no directory component)
dataset = pd.read_csv('googleplaystore_user_reviews.csv')
dataset.head()  # preview the first rows

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [6]:
# (rows, columns) of the raw data, before any rows are dropped
dataset.shape

(64295, 5)

Since we are only interested in the review text and want to build a sentiment classifier on it, we will use only the `Translated_Review` column as the training feature.

The target column is the sentiment column.

In [7]:
# Number of missing values in each column
dataset.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

A lot of the rows are null, but since we can't impute reviews written by people, we can only drop these rows.

In [8]:
# Feature: review text taken from the rows that have no missing value in any column
X = dataset.dropna().loc[:, 'Translated_Review']

In [9]:
# Class balance of the target among the complete rows
dataset.dropna().Sentiment.value_counts()

Positive    23998
Negative     8271
Neutral      5158
Name: Sentiment, dtype: int64

Encoding the `Sentiment` column into binary form:
- 0 -> Negative <br>
- 1 -> Positive
<br>

Since we are only interested in making the app better, we treat the neutral reviews as negative reviews.

In [10]:
# Binary target: True for 'Positive'; every other sentiment label becomes False
y = dataset.dropna()['Sentiment'].eq('Positive')

In [11]:
# Show the first five review texts, numbered from 1
for idx, review in enumerate(X.head(5)):
    print(f'Review #{idx+1}: "{review}"')

Review #1: "I like eat delicious food. That's I'm cooking food myself, case "10 Best Foods" helps lot, also "Best Before (Shelf Life)""
Review #2: "This help eating healthy exercise regular basis"
Review #3: "Works great especially going grocery store"
Review #4: "Best idea us"
Review #5: "Best way"


The reviews contain a lot of symbols like (. , \ /). These are valuable and hold meaning in English, but they are of no use for, and can even hinder, tokenizing the data. We will remove them from all the reviews.

In [15]:
# Strip the listed punctuation characters, then lower-case every review
X = (
    X.str.replace(r'[.,\/#!$%\^&\*;:{}=\-_`~()]', "", regex=True)
     .str.lower()
)
X

0        i like eat delicious food that's i'm cooking f...
1          this help eating healthy exercise regular basis
3               works great especially going grocery store
4                                             best idea us
5                                                 best way
                               ...                        
64222    most ads older many agents not much owner post...
64223    if photos posted portal load fit purpose i'm s...
64226    dumb app i wanted post property rent give opti...
64227    i property business got link sms happy perform...
64230    useless app i searched flats kondapur hyderaba...
Name: Translated_Review, Length: 37427, dtype: object

In [21]:
# Build the vocabulary from the cleaned reviews
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# +1 because word indices start at 1; index 0 is left for padding
vocab_size = len(tokenizer.word_index) + 1
x_token = tokenizer.texts_to_sequences(X)

# Take the padded length from the tokenized sequences themselves. Counting
# whitespace-separated words (`s.split()`) does not match how the Tokenizer
# splits text (its filter characters also act as separators), so a review
# could tokenize to more ids than that count and be truncated when padded.
max_len = max(len(seq) for seq in x_token)

x_pad = pad_sequences(x_token, maxlen=max_len, padding='post')

In [22]:
model = Sequential()

# mask_zero=True marks the 0-valued padding positions so the LSTM skips them.
# The sequences are post-padded up to max_len, so without masking the forward
# LSTM would read a long run of padding after the real tokens of each review.
model.add(Embedding(vocab_size, 50, input_length=max_len, mask_zero=True))
# 64 units per direction -> 128 features after the two directions are joined
model.add(Bidirectional(LSTM(64, dropout=0.2)))
# Single sigmoid unit: output is the predicted probability of the positive class
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 345, 50)           1273150   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               58880     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,332,159
Trainable params: 1,332,159
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train for three epochs in batches of 128, holding out 20% of the samples for validation
model.fit(
    x_pad,
    y,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/3
 38/234 [===>..........................] - ETA: 5:44 - loss: 0.6540 - accuracy: 0.6367

KeyboardInterrupt: 

In [26]:
def predict_sentiment(reviews):
    """Print the predicted sentiment for one review or a collection of reviews.

    Args:
        reviews: A single review string, or an iterable of review strings.

    Returns:
        None. One line is printed per review with the predicted label and
        the raw model output.

    The text is cleaned the same way as the training reviews (same
    punctuation regex, then lower-casing) before it is tokenized, so words
    containing those characters map to the same vocabulary entries the
    model saw during training.
    """
    if isinstance(reviews, str):
        reviews = [reviews]

    # Keep this cleaning in sync with the preprocessing applied to ``X``.
    cleaned = (
        pd.Series(list(reviews), dtype=object)
        .str.replace(r'[.,\/#!$%\^&\*;:{}=\-_`~()]', "", regex=True)
        .str.lower()
    )
    if cleaned.empty:
        return  # nothing to predict

    test_token = tokenizer.texts_to_sequences(cleaned)
    test_pad = pad_sequences(test_token, maxlen=max_len, padding='post')

    predictions = model.predict(test_pad)

    # Each prediction is a length-1 array holding the sigmoid output
    for prediction in predictions:
        print(f"{'Positive' if prediction>0.5 else 'Negetive'} with suretiy of {prediction}")

# Try the helper on a single hand-written review
review = "The app is very bad good."
predict_sentiment(review)

Positive with suretiy of [0.70087314]
