In [None]:
%load_ext Cython
%%time
%%cython

In [367]:
#import needed packages
from gensim.models import doc2vec

import re
import pandas as pd
import numpy as np
import copy

# nltk processing
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [357]:
# Dataset manipulation

# read in toy dataset
filepath = "data/skincare.csv"
data = pd.read_csv(filepath)

# subset for just reviews to build a model; rows missing either the product
# name or the review text are dropped
subset = data[['Product','ReviewContent']].dropna()

# get a unique review corpus: one row per product holding all of its reviews.
# Reviews are joined with a single space so that the last word of one review
# is not fused with the first word of the next one; str() guards against
# non-string cells (e.g. a review that pandas parsed as a number).
review_docs = (
    subset.groupby(['Product'])['ReviewContent']
    .apply(lambda reviews: ' '.join(map(str, reviews)))
    .reset_index()
)


In [354]:
# English stop-word list taken from the NLTK stopwords corpus; cleaning_text()
# drops every token that appears in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally - confirm the environment setup.
stopwrds = stopwords.words('english')

In [368]:
# aux function to clean up text
def cleaning_text(sentence):
    """Reduce a piece of review text to a space-joined string of kept words.

    Steps, in order:
      1. coerce to ``str`` and lower-case;
      2. replace punctuation, underscores and digit runs with spaces;
      3. drop tokens found in the module-level ``stopwrds`` list;
      4. keep only tokens that ``pos_tag`` labels NN, JJ, JJR or JJS
         (presumably singular nouns and adjectives - confirm against the
         tagset in use);
      5. drop tokens of two characters or fewer.

    Parameters
    ----------
    sentence : object
        Text to clean; anything accepted by ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    text = str(sentence).lower()

    # Raw string literals: '\w', '\s' and '\d' are not valid string escapes,
    # so the non-raw form triggers invalid-escape warnings on Python 3.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # Set membership instead of scanning the stop-word list for every token.
    stop_set = frozenset(stopwrds)
    tokens = [w for w in text.split() if w not in stop_set]

    kept_tags = ('NN', 'JJ', 'JJR', 'JJS')
    tokens = [w for w, pos in pos_tag(tokens) if pos in kept_tags]

    tokens = [w for w in tokens if len(w) > 2]
    return ' '.join(tokens)

In [None]:
# clean up review docs
# Apply cleaning_text directly to the text column rather than row-by-row over
# the whole frame: same values, less overhead. The new column is appended
# after the existing ones; MyDocs later reads it by position (iloc[i, 2]).
review_docs['ReviewContentClean'] = review_docs['ReviewContent'].apply(cleaning_text)

In [381]:
# define a few functions for doc2vec processing
def split_sentence(sentence):
    """Lower-case a string and split it into word tokens.

    The text is split on runs of non-word characters (regex ``\\W+``); the
    empty strings that ``re.split`` produces at the edges are discarded.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order ([] for empty input).
    """
    # Raw string: '\W' is not a valid string escape, so the non-raw literal
    # triggers an invalid-escape warning on Python 3.
    return [token for token in re.split(r'\W+', sentence.lower()) if token]

# MyDocs reading from a data frame
class MyDocs(object):
    """Re-iterable stream of labelled documents built from ``review_docs``.

    Every iteration walks the third column (position 2) of the module-level
    ``review_docs`` frame from top to bottom and yields one
    ``doc2vec.LabeledSentence`` per row, tagged with the row position
    rendered as a string.
    """
    def __iter__(self):
        cleaned_reviews = review_docs.iloc[:, 2]
        for position, text in enumerate(cleaned_reviews):
            tokens = split_sentence(text)
            yield doc2vec.LabeledSentence(words=tokens, tags=['%s' % position])

In [382]:
# Train the doc2vec model
# MyDocs defines __iter__, so the corpus can be iterated more than once.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases name it `vector_size` - confirm against the installed version.
# NOTE(review): no seed is passed and workers = 4, so results are presumably
# not reproducible run-to-run - confirm whether that matters here.
mydocs = MyDocs()
model = doc2vec.Doc2Vec(mydocs, size = 200, window = 8, min_count = 5, workers = 4)
# Persist the trained model to disk.
model.save("data/review.model")

In [383]:
# testing similar words
# print(...) with a single argument behaves the same on Python 2 and is valid
# syntax on Python 3, unlike the print statement.
# NOTE(review): newer gensim releases expose this lookup as
# model.wv.most_similar - confirm against the installed version.
print(model.most_similar(positive=["cheap", "lemon"], negative=["acne"], topn=5))

[('friendly', 0.8479662537574768), ('sweet', 0.7924125790596008), ('paste', 0.7893930077552795), ('vanilla', 0.7782337665557861), ('sturdy', 0.7691439986228943)]


In [387]:
# Auxiliary functions for simple recommendation system 

# Calculate cosine similarity between two vectors
def cossim(v1, v2):
    """Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length, accepted by ``np.dot``.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that the
        result can still be sorted and compared safely.
    """
    norm1 = np.sqrt(np.dot(v1, v1))
    norm2 = np.sqrt(np.dot(v2, v2))
    # Guard against division by zero for all-zero vectors.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(v1, v2) / norm1 / norm2

# return the indices of the n largest values in a list, in ascending order of value, skipping the single largest one
def top_n(l, n):
    """Indices of the ``n`` runner-up values of ``l``.

    Positions are ranked by their value in ascending order (stable for ties).
    The last ``n + 1`` positions are taken and the very last one - the
    maximum, which the caller treats as the query product itself - is
    dropped. The result is therefore ordered from lower to higher value.

    Parameters
    ----------
    l : sequence
        Indexable collection of comparable values that supports ``len()``.
    n : int
        Number of indices to return.

    Returns
    -------
    list of int
        At most ``n`` indices into ``l``.
    """
    ranked = list(range(len(l)))
    ranked.sort(key=l.__getitem__)
    # slice end of -1 takes off the own product from the returned index list
    return ranked[-(n + 1):-1]

# return a dataframe with top_n products with cosine similarity score
def recommend(l, n):
    """Build a table of the ``n`` products most similar to the query product.

    Parameters
    ----------
    l : iterable of float
        Similarity score of every row of the module-level ``review_docs``
        frame against the query product, in row order. This argument is what
        gets ranked (previously it was ignored in favour of the global
        ``sim_array``).
    n : int
        Number of products to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Product``, ``index`` (the row label in ``review_docs``) and
        ``cossim`` (the similarity score), ordered from least to most similar
        as produced by ``top_n``.
    """
    # Materialize once so len() and indexing work for any iterable input.
    scores = list(l)

    # positions of the top n similar items (the best match, taken to be the
    # query product itself, is excluded by top_n)
    index = top_n(scores, n)

    # independent copy so the added columns never touch review_docs
    top_products = review_docs.iloc[index][['Product']].copy()
    top_products['index'] = top_products.index

    # Assign scores positionally: row k of top_products is position index[k],
    # so this stays correct even if review_docs does not have a 0..N-1 index.
    top_products['cossim'] = [scores[position] for position in index]

    return top_products

In [397]:
# Basic recommendation system with doc2vec

# Select the index of a product you like
i = 2
# Look the vector up through `i` so that changing the selection above
# actually changes the query vector (it was hard-coded to 2).
input_vec = model.docvecs[i]

# Calculate the cosine similarity between the input vector and every document
# vector of the model. A list (not a lazy map object) is built because top_n
# needs len() and indexing.
sim_array = [cossim(input_vec, v) for v in model.docvecs]

#recommendation
# print(...) with a single argument behaves the same on Python 2 and 3.
print("You liked this product::: %s" % review_docs.iloc[i]['Product'])
recommend(sim_array,5)

You liked this product::: 100% Natural Aqua Tan Self Tanning Spray for Faces


Unnamed: 0,Product,index,cossim
3569,M.A.C. Mineralize Charged Water: Revitalizing ...,3569,0.862907
1446,Coppertone Sport Continuous Spray SPF 30,1446,0.864155
746,Bioelements WARM GLOW FACE TINT,746,0.87307
2442,GloMinerals gloBody Glisten,2442,0.887799
611,Banana Boat UltraMist Dark Tanning Lotion SPF ...,611,0.906903
