In [21]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gensim
from gensim import corpora, models, similarities
import os
import utility_synopsis
import math
import numpy as np
from numpy import linalg as la
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from utility_extract_data import extract_data_from_file_and_generate_train_and_test
import datetime

In [2]:
def extract_item_info(filename, delimiter, genre_delimiter):
    """Parse an item file into a dict keyed by item id.

    Every non-blank line must contain exactly three fields separated by
    ``delimiter``: the item id, the title and a genre list whose entries are
    separated by ``genre_delimiter``. Surrounding whitespace (including the
    trailing newline) is stripped from each field.

    Args:
        filename: path of the item file to read.
        delimiter: separator between the three fields of a line.
        genre_delimiter: separator between the genres inside the third field.

    Returns:
        dict mapping item id (str) to a ``(title, [genre, ...])`` tuple.

    Raises:
        ValueError: if a non-blank line does not split into exactly three
            fields.
    """
    data = {}

    with open(filename, 'r') as f:
        for line in f:
            # A blank line (e.g. an extra newline at the end of the file)
            # carries no record; skip it instead of failing on the unpack.
            if not line.strip():
                continue
            itemId, title, genre_list = [field.strip() for field in line.split(delimiter)]
            data[itemId] = (title, genre_list.split(genre_delimiter))
    return data

In [3]:
# Location of the ml-1m item file plus its field and genre separators.
item_file_name = os.path.join('ml-1m', 'movies.dat')
item_file_delimiter = '::'
genre_delimiter = '|'

# item id -> (title, [genres])
item_info = extract_item_info(
    filename=item_file_name,
    delimiter=item_file_delimiter,
    genre_delimiter=genre_delimiter,
)

In [4]:
def extract_user_item_interaction(filename, delimiter):
    """Read a ratings file and group the interactions by user, oldest first.

    Every non-blank line must contain exactly four fields separated by
    ``delimiter``: user id, item id, rating and timestamp. User and item ids
    are kept as the raw strings read from the file; the rating is converted
    to ``float`` and the timestamp to ``int``.

    Args:
        filename: path of the ratings file to read.
        delimiter: separator between the four fields of a line.

    Returns:
        dict mapping user id (str) to a list of
        ``(item id, rating, timestamp)`` tuples sorted by ascending timestamp.

    Raises:
        ValueError: if a non-blank line does not split into exactly four
            fields, or if the rating/timestamp cannot be converted.
    """
    data = {}

    with open(filename, 'r') as f:
        for line in f:
            # A blank line (e.g. an extra newline at the end of the file)
            # carries no record; skip it instead of failing on the unpack.
            if not line.strip():
                continue
            userId, movieId, rating, timestamp = line.split(delimiter)
            # int() tolerates the trailing newline left on the last field.
            record = (movieId, float(rating), int(timestamp))
            data.setdefault(userId, []).append(record)

    # order by time (list.sort is stable, so ties keep their file order)
    for interactions in data.values():
        interactions.sort(key=lambda record: record[2])
    return data

In [5]:
# Location of the ml-1m ratings file and its field separator.
rating_file_name = os.path.join('ml-1m', 'ratings.dat')
rating_file_delimiter = '::'

# user id -> [(item id, rating, timestamp), ...] ordered by timestamp
user_item_interaction = extract_user_item_interaction(
    filename=rating_file_name,
    delimiter=rating_file_delimiter,
)

In [62]:
# Path of the trained Word2Vec model. The default is the original local path;
# set the WORD2VEC_MODEL_PATH environment variable to load a model stored
# elsewhere without editing this cell.
model_path = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/home/wsyj/dissertation__recommendation_system__experiment_2/dissertation__recommendation_system__experiment/main_modelnum_features=100_min_count=1_window=1_iter=30.model')

# Alternative model kept for reference:
#model = gensim.models.Word2Vec.load('/home/wsyj/dissertation__recommendation_system__experiment_2/dissertation__recommendation_system__experiment/main_modelnum_features=200_min_count=5_window=2.model' )
# NOTE(review): gensim model files are pickle-based -- only load files from a
# trusted source.
model = gensim.models.Word2Vec.load(model_path)

2017-02-26 21:14:27,071 : INFO : loading Word2Vec object from /home/wsyj/dissertation__recommendation_system__experiment_2/dissertation__recommendation_system__experiment/main_modelnum_features=100_min_count=1_window=1_iter=30.model
2017-02-26 21:14:27,079 : INFO : loading wv recursively from /home/wsyj/dissertation__recommendation_system__experiment_2/dissertation__recommendation_system__experiment/main_modelnum_features=100_min_count=1_window=1_iter=30.model.wv.* with mmap=None
2017-02-26 21:14:27,080 : INFO : setting ignored attribute syn0norm to None
2017-02-26 21:14:27,081 : INFO : setting ignored attribute cum_table to None
2017-02-26 21:14:27,081 : INFO : loaded /home/wsyj/dissertation__recommendation_system__experiment_2/dissertation__recommendation_system__experiment/main_modelnum_features=100_min_count=1_window=1_iter=30.model


In [63]:
def user_history2user_repr__simple_average(model, target_user_history): # target_user_history: It should_be_a_list_of_tuples_included_items.
    """Represent a user as the plain average of the vectors of their items.

    Args:
        model: object supporting ``key in model`` and ``model[key]``, where
            ``model[key]`` is the vector of item ``key``.
        target_user_history: iterable of tuples whose first element is an
            item id; any further tuple elements are ignored.

    Returns:
        The element-wise mean (``np.average(..., axis=0)``) of the vectors of
        all history items that are present in ``model``. Items missing from
        ``model`` are skipped.
    """
    # A single list comprehension replaces the filter()/map() chain: it yields
    # a real list on both Python 2 and Python 3, whereas the lazy iterators
    # returned by filter()/map() on Python 3 cannot be averaged by NumPy.
    item_vectors = [model[record[0]]
                    for record in target_user_history
                    if record[0] in model]
    return np.average(item_vectors, axis=0)

#user_history2user_repr__simple_average(model, user_item_interaction['5989'])   

In [64]:
# calculate user representation dict
# user id -> averaged vector of the items found in that user's interactions
user_repr = dict(
    (user, user_history2user_repr__simple_average(model, interactions))
    for user, interactions in user_item_interaction.items()
)

In [23]:
# Inspection only: list every user id that has an entry in user_repr.
# The whole key list is displayed, so the output is long.
user_repr.keys()

['5988',
 '5989',
 '5982',
 '5983',
 '5980',
 '5981',
 '5986',
 '5987',
 '5984',
 '5985',
 '2147',
 '270',
 '271',
 '272',
 '273',
 '274',
 '275',
 '276',
 '277',
 '278',
 '279',
 '3282',
 '3519',
 '3518',
 '3513',
 '3512',
 '3511',
 '3510',
 '3517',
 '3516',
 '3515',
 '3514',
 '2688',
 '2689',
 '2684',
 '2685',
 '2686',
 '2687',
 '2680',
 '2681',
 '2682',
 '2683',
 '99',
 '98',
 '91',
 '90',
 '93',
 '92',
 '95',
 '94',
 '97',
 '96',
 '1177',
 '1176',
 '1175',
 '1174',
 '1173',
 '1172',
 '1171',
 '1170',
 '1179',
 '1178',
 '3430',
 '3431',
 '3432',
 '623',
 '3433',
 '622',
 '3434',
 '1225',
 '873',
 '620',
 '870',
 '627',
 '871',
 '626',
 '2741',
 '625',
 '2740',
 '624',
 '2743',
 '4591',
 '2033',
 '2032',
 '4594',
 '2746',
 '4844',
 '2748',
 '4846',
 '4847',
 '2318',
 '2319',
 '4842',
 '2038',
 '2314',
 '2315',
 '393',
 '392',
 '391',
 '390',
 '397',
 '396',
 '395',
 '394',
 '399',
 '398',
 '2309',
 '2308',
 '4729',
 '4728',
 '4725',
 '2300',
 '4727',
 '4726',
 '4721',
 '4720',
 '4723

In [24]:
# Inspection only: show the averaged vector stored for user '5988'.
user_repr['5988']

array([-0.03484509,  0.00654564,  0.06213293,  0.05387489, -0.00664007,
       -0.09959972,  0.02847232, -0.06708601,  0.08689491,  0.00809671,
        0.06383108,  0.02636334, -0.00210022, -0.05542246,  0.04063263,
       -0.04685632, -0.03018443, -0.05917279,  0.01321097, -0.00438598,
       -0.03424281,  0.06791412, -0.02120749,  0.04363169,  0.02234998,
       -0.05442786,  0.06985757, -0.09017082,  0.04886865,  0.03038269,
       -0.01439436, -0.01206527, -0.07636985, -0.07028658,  0.04366411,
       -0.02848184,  0.03803007,  0.06678109, -0.02531245,  0.07419072,
       -0.01824262, -0.03785534, -0.03472118,  0.04004463,  0.0638015 ,
        0.02812973, -0.00802467, -0.09087835, -0.02159717, -0.02101968,
        0.02718035,  0.01172304,  0.0831147 ,  0.01467071,  0.06600871,
       -0.01777104,  0.07633819, -0.02450319, -0.05750057,  0.04789777,
        0.01380763,  0.02471607,  0.02312904, -0.096926  ,  0.05270377,
        0.02699856, -0.04527079, -0.07561062, -0.07507726, -0.00

In [25]:
# Inspection only: display the set of item ids.
# NOTE(review): all_items is only assigned in a later cell (In [65]); unless it
# is defined elsewhere, this cell raises NameError on a fresh top-to-bottom run.
all_items

{'1200',
 '3724',
 '3725',
 '344',
 '345',
 '346',
 '347',
 '340',
 '341',
 '342',
 '343',
 '348',
 '349',
 '1511',
 '1296',
 '2318',
 '2316',
 '2317',
 '2314',
 '2315',
 '2312',
 '2313',
 '2310',
 '2311',
 '2184',
 '298',
 '299',
 '296',
 '297',
 '294',
 '295',
 '292',
 '293',
 '290',
 '291',
 '3773',
 '3772',
 '3771',
 '3770',
 '3777',
 '3776',
 '3775',
 '3774',
 '3779',
 '3778',
 '270',
 '271',
 '272',
 '273',
 '274',
 '275',
 '276',
 '277',
 '278',
 '279',
 '2268',
 '2269',
 '2262',
 '2263',
 '2260',
 '2261',
 '2266',
 '2267',
 '2264',
 '2265',
 '2442',
 '2443',
 '2440',
 '2441',
 '2446',
 '2447',
 '2444',
 '2445',
 '2448',
 '2449',
 '2189',
 '108',
 '102',
 '103',
 '100',
 '101',
 '106',
 '107',
 '104',
 '105',
 '2046',
 '2047',
 '2044',
 '2045',
 '2042',
 '2043',
 '2040',
 '2041',
 '3198',
 '1297',
 '2048',
 '2049',
 '3150',
 '3155',
 '2038',
 '3159',
 '3519',
 '3518',
 '3513',
 '3512',
 '3511',
 '3510',
 '3517',
 '3516',
 '3515',
 '3514',
 '2688',
 '2689',
 '2685',
 '2686',
 '26

In [29]:
# Inspection only: show the vector stored for item '3724'.
# NOTE(review): item_repr is only assigned in a later cell (In [65]); unless it
# is defined elsewhere, this cell raises NameError on a fresh top-to-bottom run.
item_repr['3724']

array([-0.01638629,  0.07660951, -0.04166952,  0.21206224,  0.13059106,
        0.10559428,  0.08074671,  0.02424181,  0.03864932,  0.01416764,
        0.18459238,  0.12319092,  0.03002882, -0.17116457, -0.09813236,
        0.0255536 ,  0.03420775, -0.0109485 ,  0.01933366, -0.01654197,
       -0.10491117,  0.09822451,  0.02034667, -0.07618506,  0.09050623,
        0.04749753,  0.01029947, -0.07913234,  0.05650635, -0.03792978,
       -0.09261778, -0.15181945, -0.16756082, -0.20551488,  0.05435994,
        0.03303286,  0.02812913,  0.13210936, -0.06779315,  0.12415711,
        0.00837986, -0.04124662, -0.040571  ,  0.119642  ,  0.0918362 ,
        0.0989648 , -0.33841532, -0.14072426,  0.02748891,  0.02107903,
        0.17783499,  0.09334072,  0.06850144,  0.02023416,  0.14187962,
       -0.09905452,  0.11525086, -0.03564117, -0.20950764, -0.05931779,
        0.01041089,  0.0068032 , -0.0195644 , -0.16692513,  0.09984155,
        0.03391441, -0.02226016, -0.09310246, -0.23735467,  0.01

In [65]:
# item representation
# The loaded model is indexed directly by item id, so it doubles as the
# item-id -> vector lookup.
item_repr = model

# Every item id contained in the model's vocabulary.
all_items = set(model.wv.vocab)

In [34]:
# Inspection only: display the full set of item ids taken from the model
# vocabulary (long output).
all_items

{'1200',
 '3724',
 '3725',
 '344',
 '345',
 '346',
 '347',
 '340',
 '341',
 '342',
 '343',
 '348',
 '349',
 '1511',
 '1296',
 '2318',
 '2316',
 '2317',
 '2314',
 '2315',
 '2312',
 '2313',
 '2310',
 '2311',
 '2184',
 '298',
 '299',
 '296',
 '297',
 '294',
 '295',
 '292',
 '293',
 '290',
 '291',
 '3773',
 '3772',
 '3771',
 '3770',
 '3777',
 '3776',
 '3775',
 '3774',
 '3779',
 '3778',
 '270',
 '271',
 '272',
 '273',
 '274',
 '275',
 '276',
 '277',
 '278',
 '279',
 '2268',
 '2269',
 '2262',
 '2263',
 '2260',
 '2261',
 '2266',
 '2267',
 '2264',
 '2265',
 '2442',
 '2443',
 '2440',
 '2441',
 '2446',
 '2447',
 '2444',
 '2445',
 '2448',
 '2449',
 '2189',
 '108',
 '102',
 '103',
 '100',
 '101',
 '106',
 '107',
 '104',
 '105',
 '2046',
 '2047',
 '2044',
 '2045',
 '2042',
 '2043',
 '2040',
 '2041',
 '3198',
 '1297',
 '2048',
 '2049',
 '3150',
 '3155',
 '2038',
 '3159',
 '3519',
 '3518',
 '3513',
 '3512',
 '3511',
 '3510',
 '3517',
 '3516',
 '3515',
 '3514',
 '2688',
 '2689',
 '2685',
 '2686',
 '26

In [15]:
# Inspection only: display the model's vocabulary mapping.
# NOTE(review): another cell reads the vocabulary through model.wv.vocab --
# confirm which accessor the installed gensim version supports.
model.vocab



{'2031': <gensim.models.word2vec.Vocab at 0xa41c9a6c>,
 '1869': <gensim.models.word2vec.Vocab at 0xa41c3d4c>,
 '1868': <gensim.models.word2vec.Vocab at 0xa41c3d8c>,
 '643': <gensim.models.word2vec.Vocab at 0xa41ef26c>,
 '344': <gensim.models.word2vec.Vocab at 0xa475fd4c>,
 '345': <gensim.models.word2vec.Vocab at 0xa475fd6c>,
 '346': <gensim.models.word2vec.Vocab at 0xa475fd8c>,
 '347': <gensim.models.word2vec.Vocab at 0xa475fdac>,
 '340': <gensim.models.word2vec.Vocab at 0xa475fdcc>,
 '341': <gensim.models.word2vec.Vocab at 0xa475fdec>,
 '342': <gensim.models.word2vec.Vocab at 0xa475fe0c>,
 '343': <gensim.models.word2vec.Vocab at 0xa475fe2c>,
 '348': <gensim.models.word2vec.Vocab at 0xa475fe4c>,
 '349': <gensim.models.word2vec.Vocab at 0xa475fe6c>,
 '2318': <gensim.models.word2vec.Vocab at 0xa475ff0c>,
 '2316': <gensim.models.word2vec.Vocab at 0xa475ff4c>,
 '2317': <gensim.models.word2vec.Vocab at 0xa475ff8c>,
 '2314': <gensim.models.word2vec.Vocab at 0xa475ffcc>,
 '2315': <gensim.mode

In [66]:
def calculate_metrics(test, rec):
    """Compute recall, precision and F1 of recommendations, pooled over users.

    Hits and list sizes are summed over all users first and the ratios are
    taken once at the end.

    Args:
        test: dict mapping user -> ``(history, answer)``. Only ``answer`` is
            used; it is a sequence of records whose first element is an item
            id the user is expected to interact with.
        rec: dict mapping user -> list of ``(item, score)`` pairs recommended
            to that user. Must contain every user present in ``test``.

    Returns:
        dict with the keys ``'recall'``, ``'precision'`` and ``'f1'``. A
        metric whose denominator is zero is reported as 0, and F1 is 0
        whenever precision or recall is 0.

    Raises:
        KeyError: if a user of ``test`` is missing from ``rec``.
    """
    starttime = datetime.datetime.now()
    hit = 0

    all__for_recall = 0
    all__for_precision = 0
    for user in test:
        answer = test[user][1]
        tu = [x[0] for x in answer]
        # Set lookup keeps the membership test cheap; the list is retained
        # only so that the recall denominator still counts every answer record.
        tu_lookup = set(tu)
        rank = rec[user] # self.recommend(history, N)
        hit += sum(1 for item, pui in rank if item in tu_lookup)
        all__for_recall += len(tu)
        all__for_precision += len(rank) #Note: In book RSP, the author used 'all += N'

    if 0 == all__for_recall:
        metric_recall = 0
    else:
        metric_recall = hit / (all__for_recall * 1.0)

    if 0 == all__for_precision:
        metric_precision = 0
    else:
        metric_precision = hit / (all__for_precision * 1.0)

    # With zero hits both ratios are 0.0 even though the denominators are
    # non-zero; guard on the ratios themselves so 1./0.0 is never evaluated.
    if metric_precision and metric_recall:
        metric_f1 = 2/(1./metric_precision + 1./metric_recall)
    else:
        metric_f1 = 0

    endtime = datetime.datetime.now()
    # total_seconds() also accounts for the days component of the timedelta.
    interval = (endtime - starttime).total_seconds()
    print('metric calculation: time consumption: %d' % (interval))
    return {'recall': metric_recall, 'precision': metric_precision, 'f1': metric_f1}

In [67]:
# load train and test datasets

# Active dataset: the ml-1m ratings file. Alternatives kept for reference:
#data_filename, delimiter = os.path.sep.join(['ml-10M100K', 'ratings.dat']), '::'
#data_filename, delimiter, data_set = os.path.sep.join(['ml-100k', 'u.data']), '\t', '100K'
data_filename = os.path.join('ml-1m', 'ratings.dat')
delimiter = '::'
data_set = '1M'

# Experiment parameters.
N = 20   # number of recommendations kept per user
seed = 2
K = 10   # not referenced by the cells visible in this notebook
train_percent = 0.8
test_data_inner_ratio = 0.8

test = None
train, original_test = extract_data_from_file_and_generate_train_and_test(
    data_filename,
    train_percent,
    seed,
    delimiter,
    test_data_inner_ratio,
)
#train, test = extract_data_from_file_and_generate_train_and_test(data_filename, 3, 0, seed, delimiter)


sort_by_time: False


In [68]:
# main: core of content-based recommendation
#
# For every user of original_test, each item of the model vocabulary that is
# not in the user's history is scored by the cosine similarity between the
# user vector and the item vector; the N best-scoring items are stored in rec.
#
# NOTE(review): user_repr is built from user_item_interaction (the complete
# ratings file), not only from `history` -- confirm that the user vectors are
# not meant to exclude the `future` items used for evaluation.

total_user_item_comb = 0
rec = {}
for user in original_test:
    history, future = original_test[user]
    history_items = set(x[0] for x in history)
    candidates = all_items - history_items # filtering out those interacted

    total_user_item_comb += len(candidates)

    # The user vector and its norm are the same for every candidate, so they
    # are computed once per user instead of once per (user, item) pair.
    user_vec = user_repr[user]
    user_norm = la.norm(user_vec)

    cand_simi_list = []
    for candy in candidates:
        item_vec = item_repr[candy]
        simi = user_vec.dot(item_vec) / (user_norm * la.norm(item_vec))
        cand_simi_list.append((candy, simi))

    # Highest similarity first.
    cand_simi_list.sort(key=lambda x: -1 * x[1])

    rec[user] = cand_simi_list[:N]

# Single-argument print() produces the same line on Python 2 and Python 3.
print('total_user_item_comb: %d' % total_user_item_comb)

total_user_item_comb: 4387464


In [33]:
# Score the recommendation lists in rec against the answer part of
# original_test (returns recall, precision and f1).
calculate_metrics(original_test, rec)

metric calculation: time consumption: 0


{'f1': 0.00023936329363892048,
 'precision': 0.0009166666666666666,
 'recall': 0.000137654000412962}

In [61]:
# cbow
# NOTE(review): presumably this run scored recommendations built from a
# CBOW-trained Word2Vec model -- confirm which model file was loaded.
calculate_metrics(original_test, rec)

metric calculation: time consumption: 0


{'f1': 0.00011968164681946024,
 'precision': 0.0004583333333333333,
 'recall': 6.8827000206481e-05}

In [69]:
# skip-gram
# NOTE(review): presumably this run scored recommendations built from a
# skip-gram-trained Word2Vec model -- confirm which model file was loaded.
calculate_metrics(original_test, rec)

metric calculation: time consumption: 0


{'f1': 0.0002502434433497805,
 'precision': 0.0009583333333333333,
 'recall': 0.000143911000431733}