In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tflearn

In [4]:
df = pd.read_csv('ign.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


In [5]:
df=df.drop(['url','release_year','release_month','release_day','editors_choice','Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,score_phrase,title,platform,score,genre
0,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,9.0,Platformer
1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,9.0,Platformer
2,Great,Splice: Tree of Life,iPad,8.5,Puzzle
3,Great,NHL 13,Xbox 360,8.5,Sports
4,Great,NHL 13,PlayStation 3,8.5,Sports


In [6]:
df.loc[:, ('positive')] = (df.score > 5.).astype(int)
df.loc[:, ('negative')] = (df.score <= 5.).astype(int)
df.head()

Unnamed: 0,score_phrase,title,platform,score,genre,positive,negative
0,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,9.0,Platformer,1,0
1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,9.0,Platformer,1,0
2,Great,Splice: Tree of Life,iPad,8.5,Puzzle,1,0
3,Great,NHL 13,Xbox 360,8.5,Sports,1,0
4,Great,NHL 13,PlayStation 3,8.5,Sports,1,0


In [75]:
df.title[1]

'LittleBigPlanet PS Vita -- Marvel Super Hero Edition'

In [123]:
from tflearn.data_utils import to_categorical
from collections import Counter

maxLen = 0
total_counts = Counter()
for title in df.title:
    maxLen = len(title) if maxLen < len(title) else maxLen
    for word in title.split(" "):
        total_counts[word]  += 1

print("total words in data set: ", len(total_counts))

total words in data set:  9924


In [131]:
tmp = total_counts.most_common()[:3]
for tpl in tmp:
    word = tpl[0]
    print("deleting word '{}'".format(word))
    del total_counts[word]

deleting word 'of'
deleting word 'The'
deleting word 'the'


In [132]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)
print(vocab[-1], ': ', total_counts[vocab[-1]])

word2idx = { term:(idx+1) for idx, term in enumerate(list(vocab))}

Hallway :  1


In [137]:
def text_to_vector(text):
    retVal = np.zeros((maxLen), dtype=np.int_)
    for idx, word in enumerate(text.split(" ")):
        if word in word2idx.keys():
            retVal[idx] = word2idx[word]
    return(retVal)

In [138]:
word_vectors = np.zeros((len(df), maxLen), dtype=np.int_)
for i, (_, text) in enumerate(df.iterrows()):
    word_vectors[i] = text_to_vector(text[1])
len(word_vectors)

18625

In [139]:
word_vectors[0]

array([1441, 4470, 3661,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0])

In [140]:
labels = df.loc[:, ['positive']]
labels.head()

Unnamed: 0,positive
0,1
1,1
2,1
3,1
4,1


In [144]:
idx2word = {v:k for k, v in word2idx.items()}
txt = ""
for idx in word_vectors[1]:
    if idx < 1:
        break
    txt += idx2word[idx] + " "
txt

'LittleBigPlanet PS Vita -- Marvel Super Hero Edition '

In [145]:
assert(len(word_vectors)==len(labels))

In [146]:
n_records = len(labels)
n_shuffle = np.arange(n_records)
np.random.shuffle(n_shuffle)
test_fraction = 0.9

n_train_split, n_test_split = n_shuffle[:int(n_records*test_fraction)], n_shuffle[int(n_records*test_fraction):]
trainX, trainY = word_vectors[n_train_split,:], to_categorical(labels.values[n_train_split], 2)
testX, testY = word_vectors[n_test_split,:], to_categorical(labels.values[n_test_split], 2)

In [147]:
len(word_vectors)

18625

In [148]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    # layers
    net = tflearn.input_data([None, maxLen])
    net = tflearn.embedding(net, input_dim=len(word_vectors), output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, len(trainY[0]), activation='softmax')
        
    # training
    net = tflearn.regression(net, loss='categorical_crossentropy', optimizer='adam', learning_rate=0.001)
    
    model = tflearn.DNN(net, tensorboard_verbose=0)
    return model

In [149]:
model = build_model()

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.merge.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [151]:
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=32, n_epoch=20)

Training Step: 11328  | total loss: [1m[32m0.46263[0m[0m
| Adam | epoch: 020 | loss: 0.46263 - acc: 0.8273 | val_loss: 0.45417 - val_acc: 0.8312 -- iter: 15085/15085
Training Step: 11328  | total loss: [1m[32m0.46263[0m[0m
| Adam | epoch: 020 | loss: 0.46263 - acc: 0.8273 | val_loss: 0.45417 - val_acc: 0.8312 -- iter: 15085/15085
--


In [152]:
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.829844337091
