In [75]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

In [81]:

df = pd.read_csv("/kaggle/input/spotify-app-reviews-2022/reviews.csv")


df["label"] = df["Rating"]
df["text"] = df["Review"]

df = df.drop(["Reply", "Rating", "Total_thumbsup", "Time_submitted", "Review"], axis=1)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   61594 non-null  int64 
 1   text    61594 non-null  object
dtypes: int64(1), object(1)
memory usage: 962.5+ KB


Unnamed: 0,label
count,61594.0
mean,3.155989
std,1.673285
min,1.0
25%,1.0
50%,3.0
75%,5.0
max,5.0


In [57]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

ds = tf.data.Dataset.from_tensor_slices(dict(df))

def normalize(input):
    print(input, "\n\n\n")
    return input["text"], input["label"] / 5

ds = ds.map(normalize)
ds = ds.shuffle(BUFFER_SIZE).prefetch(tf.data.AUTOTUNE)

for example, label in ds.take(1):
    print(example, "\n\n\n", label)

{'label': <tf.Tensor 'args_0:0' shape=() dtype=int64>, 'text': <tf.Tensor 'args_1:0' shape=() dtype=string>} 



tf.Tensor(b'Really good experience \xf0\x9f\x91\x8c no cap', shape=(), dtype=string) 


 tf.Tensor(1.0, shape=(), dtype=float64)


In [52]:
train_ds = ds.take(50000).batch(BATCH_SIZE)
remaining = ds.skip(50000)
val_ds = remaining.take(6000).batch(BATCH_SIZE)
test_ds = remaining.skip(6000).batch(BATCH_SIZE)

In [53]:
max_features = 10000

encoder = tf.keras.layers.TextVectorization(max_tokens=max_features)

def text(input, out):
    return input

text_ds = ds.map(text)

encoder.adapt(text_ds)

vocab = np.array(encoder.get_vocabulary())
vocab_size = len(vocab)
print(vocab_size)
vocab[:20]

10000


array(['', '[UNK]', 'the', 'i', 'to', 'and', 'it', 'app', 'a', 'is',
       'music', 'my', 'for', 'of', 'this', 'but', 'you', 'spotify',
       'songs', 'have'], dtype='<U24')

In [54]:
class SpotifyReviewModel(tf.keras.Model):
    def __init__(self):
        super().__init__(self)
        
        self.encoder = encoder
        self.embedding = tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True)
        self.bidirectional = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)
        
    def call(self, inputs):
        x = inputs
        x = self.encoder(x)
        x = self.embedding(x)
        x = self.bidirectional(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return x
    
model = SpotifyReviewModel()

loss = tf.losses.MeanAbsoluteError()

model.compile(loss=loss, optimizer="adam", metrics=["accuracy"])

for x, y in train_ds.take(1):
    print(x[0])
    print(y.numpy()[0], end="\n\n")
    predictions = model([x[0]])
    print(predictions)

tf.Tensor(b"By far the worst music app. Plays the same 10 songs every time you shuffle, ads are beyond annoying, and you can't even view album anymore without paying. Back to pirating", shape=(), dtype=string)
0.2

tf.Tensor([[-0.00370701]], shape=(1, 1), dtype=float32)


In [72]:
EPOCHS = 5

history = model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [73]:
test_loss, test_acc = model.evaluate(test_ds)

print(test_loss, test_acc)

0.10681448876857758 0.3274937570095062


In [80]:
bad = "Really buggy and terrible to use as of recently"
good = "b'Really good experience. Amazing app and I highly reccomend!"
okay = "It was okay there were definetly some problems but overall a decent app. However the not being able to download kind of sucks"

predictions = model.predict(np.array([bad, okay, good]))

print(np.round(predictions * 5))

[[1.]
 [3.]
 [5.]]


In [84]:
reviews = ["Really buggy and terrible to use as of recently", "b'Really good experience. Amazing app and I highly reccomend!"]

predictions = model.predict(np.array(reviews))

# return values to a 1-5 scale
normalized = np.round(predictions * 5)

print(normalized)
# [[1.], [5.]]

[[1.]
 [5.]]
