In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


import sqlite3
conn = sqlite3.connect('data/capstone.db')

# nltk processing
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

import re

In [None]:
# tblGame , tblMovie, tblReview, tblTVShow

In [4]:
games = pd.read_sql_query("select * from tblGame;", conn)
games['gameID'] = games.index
games.head(5)

Unnamed: 0,name,link,image,developer,genre,rating,rlsDate,summary,gameID
0,Ginger: Beyond the Crystal,/game/playstation-4/ginger-beyond-the-crystal,http://static.metacritic.com/images/products/g...,Drakhar Studio,Action,E,2016-10-25 00:00:00,"A world destroyed by a crystal explosion, a my...",0
1,JoJo's Bizarre Adventure: Eyes of Heaven,/game/playstation-4/jojos-bizarre-adventure-ey...,http://static.metacritic.com/images/products/g...,CyberConnect2,Action,T,2016-06-28 00:00:00,"Featuring a new two-on-two battle system, larg...",1
2,The Crew,/game/playstation-4/the-crew,http://static.metacritic.com/images/products/g...,"Ubisoft Reflections, Ivory Tower",Racing,T,2014-12-02 00:00:00,Strap in for a ride that will find you infiltr...,2
3,Batman: Arkham Knight - A Matter of Family,/game/playstation-4/batman-arkham-knight---a-m...,http://static.metacritic.com/images/products/g...,"Rocksteady Studios, WB Games Montreal",Action Adventure,M,2015-07-14 00:00:00,When The Joker kidnaps Commissioner Gordon and...,3
4,8DAYS,/game/playstation-4/8days,http://static.metacritic.com/images/products/g...,Santa Clara Games,Action,M,2017-02-07 00:00:00,The world has been dominated by dark forces si...,4


In [8]:
reviews = pd.read_sql_query("select * from tblReview;", conn)
reviews

Unnamed: 0,gameID,movieID,tvShowID,author,publication,text,score,date,thumbsUp,thumbsDown,reviewType
0,1.0,,,,Areajugones,Ginger: Beyond the Crystal is a very good plat...,69,2016-11-05 00:00:00,0,0,c
1,1.0,,,,Hobby Consolas,"It\s not a bad game at all, but it has technic...",67,2016-11-02 00:00:00,0,0,c
2,1.0,,,,GameGrin,Ginger: Beyond the Crystal tries to revisit th...,60,2016-11-07 00:00:00,0,0,c
3,1.0,,,,GameSpew,It may not offer much of a challenge or much i...,60,2016-11-05 00:00:00,0,0,c
4,1.0,,,,Digitally Downloaded,"It may sound like I’m being harsh on Ginger, b...",50,2016-11-21 00:00:00,0,0,c
5,2.0,,,,God is a Geek,JoJo’s Bizarre Adventure: Eyes in Heaven featu...,85,2016-07-04 00:00:00,0,0,c
6,2.0,,,,Gaming Age,"All in all, JoJo\s Bizarre Adventure: Eyes of ...",83,2016-07-06 00:00:00,0,0,c
7,2.0,,,,GameCritics,While I’m not familiar with the manga this gam...,75,2016-08-07 00:00:00,0,0,c
8,2.0,,,,IGN Spain,Fun and frantic but not as deep or as beautifu...,75,2016-07-11 00:00:00,0,0,c
9,2.0,,,,Atomix,"It tries to innovate, but it\s a game aimed fo...",75,2016-07-05 00:00:00,0,0,c


In [10]:
# combining reviews into a corpus for each disticnt id
subset = reviews[reviews['reviewType']=='c']
subset = subset[['gameID','text']]
subset = subset.dropna()
subset.head(5)

# get a unique review corpus
review_docs = subset.groupby(['gameID'])['text'].apply(lambda x: ''.join(x)).reset_index()
review_docs.head(5)


Unnamed: 0,gameID,text
0,1.0,Ginger: Beyond the Crystal is a very good plat...
1,2.0,JoJo’s Bizarre Adventure: Eyes in Heaven featu...
2,3.0,The Crew does a lot of things right. Not only ...
3,4.0,Playing as Batgirl is not quite as interesting...
4,5.0,The frantic nature of the gameplay coupled wit...


In [None]:
#### Cleaning up text 

In [27]:
stopwrds = stopwords.words('english')
# aux function to clean up text
def cleaning_text(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = re.sub('[^\w\s]',' ', sentence)
    sentence = re.sub('_',' ', sentence)
    sentence = re.sub('\d+',' ', sentence)
    cleaned = ' '.join([w for w in sentence.split() if not w in stopwrds])
    cleaned = ' '.join([w for w , pos in pos_tag(cleaned.split()) if (pos == 'NN' or pos=='JJ' or pos=='JJR' or pos=='JJS' )])
    cleaned = ' '.join([w for w in cleaned.split() if not len(w)<=2 ])
    cleaned = cleaned.strip()
    return cleaned

In [28]:
# add utf-8 encoding, clean words
review_docs['textClean'] = review_docs.apply(lambda row: cleaning_text(row['text'].encode("utf8")), axis=1)

In [29]:
corpus = list(review_docs.textClean)
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
# tfidf matrix
tfidf_matrix = vectorizer.fit_transform(corpus)
idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print tfidf_matrix.todense()

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [34]:
tfidf_matrix = tfidf_matrix.todense()

In [35]:
def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / np.sqrt(np.dot(v2, v2)) 

In [38]:
# recommendation based on tfidf
from sklearn.metrics.pairwise import linear_kernel
 
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

In [None]:
# save a pickle file for for tf_idf matrix
import pickle

with open('models/tfidf_matrix.pickle', 'wb') as handle:
    pickle.dump(tfidif_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
find_similar(tfidf_matrix,0)

[(1861, 0.66263344663863655),
 (5760, 0.35865918763360233),
 (3160, 0.28981714836649575),
 (3866, 0.25455565924803897),
 (1373, 0.24464133283748662)]