In [7]:
import os
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import numpy as np
from utility_extract_data import extract_data_from_file_and_generate_train_and_test
from numpy import linalg as la
import datetime
import sqlite3
import time

In [2]:
def calculate_metrics(test, rec):
    """Compute micro-averaged recall, precision and F1 of recommendations.

    Parameters
    ----------
    test : dict
        Maps user -> sequence whose element ``[1]`` is the list of held-out
        entries; the first field of every entry is the item id.  Element
        ``[0]`` (the user's history) is not needed for scoring.
    rec : dict
        Maps user -> sized iterable of ``(item, score)`` pairs.  Every user
        present in ``test`` must be present here (``KeyError`` otherwise).

    Returns
    -------
    dict
        ``{'recall': ..., 'precision': ..., 'f1': ...}``.  A metric whose
        denominator is zero is reported as ``0``; F1 is ``0`` whenever there
        is no hit at all.
    """
    starttime = datetime.datetime.now()
    hit = 0

    all__for_recall = 0
    all__for_precision = 0
    for user in test:
        answer = test[user][1]
        # A set gives O(1) membership tests; the recall denominator below
        # still counts every held-out entry, exactly as before.
        relevant_items = set(x[0] for x in answer)
        rank = rec[user]
        hit += sum(1 for item, pui in rank if item in relevant_items)
        all__for_recall += len(answer)
        all__for_precision += len(rank)  # Note: In book RSP, the author used 'all += N'

    if 0 == all__for_recall:
        metric_recall = 0
    else:
        metric_recall = hit / (all__for_recall * 1.0)

    if 0 == all__for_precision:
        metric_precision = 0
    else:
        metric_precision = hit / (all__for_precision * 1.0)

    # With zero hits both precision and recall are 0, so the harmonic mean
    # below would divide by zero; F1 is defined as 0 in that case.
    if 0 == hit or 0 == all__for_recall or 0 == all__for_precision:
        metric_f1 = 0
    else:
        metric_f1 = 2/(1./metric_precision + 1./metric_recall)

    endtime = datetime.datetime.now()
    interval = (endtime - starttime).seconds
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('metric calculation: time consumption: %d' % (interval))
    return {'recall': metric_recall, 'precision': metric_precision, 'f1': metric_f1}

In [3]:
# Genre vocabulary; the position of a name in this list is its index in the
# genre vectors built from it.
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

In [5]:
# Two-way lookup between a genre name and its position in `genres`.
genres_index_dict = {name: position for position, name in enumerate(genres)}
index_genres_dict = dict(enumerate(genres))

In [4]:
genres_index_dict

{'Action': 0,
 'Adventure': 1,
 'Animation': 2,
 "Children's": 3,
 'Comedy': 4,
 'Crime': 5,
 'Documentary': 6,
 'Drama': 7,
 'Fantasy': 8,
 'Film-Noir': 9,
 'Horror': 10,
 'Musical': 11,
 'Mystery': 12,
 'Romance': 13,
 'Sci-Fi': 14,
 'Thriller': 15,
 'War': 16,
 'Western': 17}

In [9]:

def extract_genres(filename, delimiter, genre_delimiter):
    """Read an item file and return the genre list of every item.

    Each non-blank line is expected to look like
    ``<itemId><delimiter><title><delimiter><genres>`` where ``<genres>`` is a
    ``genre_delimiter``-separated list.

    Parameters
    ----------
    filename : str
        Path of the item file.
    delimiter : str
        Separator between the id, title and genre fields.
    genre_delimiter : str
        Separator between the genres of one item.

    Returns
    -------
    dict
        Maps the (stripped) item id string to its list of genre names.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than three fields.
    """
    data = {}

    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # The id is the first field and the genres the last one; splitting
            # from both ends keeps a title that itself contains the delimiter
            # from breaking the parse.
            itemId, remainder = line.split(delimiter, 1)
            title, genre_list = remainder.rsplit(delimiter, 1)

            data[itemId.strip()] = genre_list.strip().split(genre_delimiter)
    return data


In [5]:
# Location and field separators of the item (movie) file.
item_file_name = os.path.join('ml-1m', 'movies.dat')
item_file_delimiter = '::'
genre_delimiter = '|'

# item id -> list of genre names
item_info = extract_genres(item_file_name, item_file_delimiter, genre_delimiter)


In [52]:
# NOTE: this prints the whole mapping (one entry per item in the item file).
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(item_info)

{'3734': ['Drama'], '3724': ['Drama', 'War'], '3725': ['Animation', 'Musical'], '3798': ['Thriller'], '3726': ['Action', 'Thriller'], '344': ['Comedy'], '345': ['Comedy', 'Drama'], '346': ['Drama', 'Musical'], '347': ['Drama'], '340': ['Adventure', 'Drama'], '341': ['Drama'], '342': ['Comedy', 'Romance'], '343': ["Children's"], '2918': ['Comedy'], '348': ['Comedy'], '349': ['Action', 'Adventure', 'Thriller'], '3002': ['Documentary'], '2919': ['Drama', 'Romance'], '1653': ['Drama', 'Sci-Fi', 'Thriller'], '2318': ['Comedy'], '2319': ['Comedy'], '2316': ['Drama', 'Romance'], '2317': ['Comedy'], '2314': ['Drama'], '2315': ['Horror', 'Thriller'], '2312': ['Drama'], '2313': ['Drama'], '2310': ['Drama'], '2311': ['Mystery', 'Sci-Fi'], '298': ['Comedy'], '299': ['Drama'], '296': ['Crime', 'Drama'], '297': ['Drama'], '294': ['Comedy', 'Romance'], '295': ['Comedy', 'Romance'], '292': ['Action', 'Drama', 'Thriller'], '293': ['Crime', 'Drama', 'Romance', 'Thriller'], '290': ['Crime', 'Drama'], '29

In [25]:
index_genres_dict[7]

'Drama'

In [6]:
def tmp_set(vec, i, val):
    """Store ``val`` at index ``i`` of ``vec`` in place.

    Exists so that an item assignment can be performed from inside an
    expression (e.g. a lambda).  Always returns ``None``.
    """
    vec[i] = val
    return None

In [7]:
def generate_item_repr(item_info, genre_index=None):
    """Build a multi-hot genre vector for every item.

    Parameters
    ----------
    item_info : dict
        Maps item id -> iterable of genre names.
    genre_index : dict, optional
        Maps genre name -> position in the vector.  Positions are assumed to
        be ``0 .. len(genre_index) - 1``.  Defaults to the notebook-level
        ``genres_index_dict``.

    Returns
    -------
    dict
        Maps item id -> integer numpy array with a 1 at the position of every
        genre the item has and 0 elsewhere.

    Raises
    ------
    KeyError
        If an item lists a genre that is not in ``genre_index``.
    """
    if genre_index is None:
        genre_index = genres_index_dict
    vector_size = len(genre_index)

    item_repr = {}
    for item, item_genres in item_info.items():
        f = np.zeros(vector_size, dtype=int)
        # An explicit loop instead of map(): map() is lazy on Python 3, so
        # using it only for its side effects would leave the vector all zero.
        for genre in item_genres:
            f[genre_index[genre]] = 1
        item_repr[item] = f
    return item_repr

In [8]:
item_repr = generate_item_repr(item_info)

In [40]:
item_repr

{'593': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 '2031': array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '1869': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 '1868': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 '643': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '592': array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '344': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '345': array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '346': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 '347': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '340': array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '341': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '342': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 '343': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 '3

In [54]:
item_repr['1']

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [9]:

# Universe of candidate items: every item that has a genre vector.
all_items = set(item_repr.keys())
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(all_items))

3883


In [10]:
def generate_user_repr(user_history, item_repr, item_repr_size):
    """Build a user profile as the rating-weighted average of item vectors.

    Parameters
    ----------
    user_history : iterable
        Entries that unpack as ``(item, rate, timestamp)``; ``rate`` must be
        numeric (it scales the item vector and is passed to ``int()``).
    item_repr : dict
        Maps item id -> numpy vector.
    item_repr_size : int
        Length of the item vectors; used for the all-zero profile returned
        when there is nothing to average.

    Returns
    -------
    numpy.ndarray
        ``sum(item_repr[item] * rate) / sum(int(rate))`` over the history, or
        a zero vector of length ``item_repr_size`` when the history is empty
        or the ratings sum to zero (instead of a NaN/inf result).
    """
    # Explicit unpacking in comprehensions replaces the tuple-parameter
    # lambda, which is a syntax error on Python 3.
    weighted_vectors = [item_repr[item] * rate for item, rate, timestamp in user_history]
    rate_total = sum(int(rate) for item, rate, timestamp in user_history)
    if not weighted_vectors or rate_total == 0:
        return np.zeros(item_repr_size)
    user_repr = np.sum(weighted_vectors, axis=0) * 1. / rate_total
    return user_repr
                                                                                                                    

In [12]:
# Ratings file in use (MovieLens 1M) together with its field separator and a
# short data-set tag.
data_filename = os.path.join('ml-1m', 'ratings.dat')
delimiter = '::'
data_set = '1M'
# Alternative data sets:
#   data_filename, delimiter = os.path.sep.join(['ml-10M100K', 'ratings.dat']), '::'
#   data_filename, delimiter, data_set = os.path.sep.join(['ml-100k', 'u.data']), '\t', '100K'

# Experiment parameters.
N = 20                       # length of each recommendation list
seed = 2
K = 10
train_percent = 0.8
test_data_inner_ratio = 0.8
test = None

# Earlier call form, kept for reference:
#   train, test = extract_data_from_file_and_generate_train_and_test(data_filename, 3, 0, seed, delimiter)
train, original_test = extract_data_from_file_and_generate_train_and_test(
    data_filename, train_percent, seed, delimiter, test_data_inner_ratio)


sort_by_time: False


In [13]:

# One genre-profile vector per test user, built from that user's history part.
user_repr = {}
for user, (history, future) in original_test.items():
    user_repr[user] = generate_user_repr(history, item_repr, len(genres))
    
    

In [14]:
len(user_repr)

1200

In [16]:
# main: core of content-based recommendation
#
# For every test user, score each item the user has not interacted with by
# the cosine similarity between the user profile and the item's genre vector,
# and keep the N best-scoring items.

total_user_item_comb = 0
rec = {}

# An item's norm does not depend on the user, so compute each one only once
# instead of once per (user, candidate) pair.
item_norm = dict((item, la.norm(vec)) for item, vec in item_repr.items())

for user in original_test:
    history, future = original_test[user]
    history_items = set(x[0] for x in history)
    candidates = all_items - history_items # filtering out those interacted

    total_user_item_comb += len(candidates)

    user_vec = user_repr[user]
    user_norm = la.norm(user_vec)  # invariant across this user's candidates

    cand_simi_list = []
    for candy in candidates:
        denominator = user_norm * item_norm[candy]
        if denominator > 0:
            simi = user_vec.dot(item_repr[candy]) / denominator
        else:
            # A zero vector has no direction; score 0 instead of producing
            # NaN, which would make the sort order below undefined.
            simi = 0.0
        cand_simi_list.append((candy, simi))

    # Stable descending sort: same ordering as sorting on the negated score.
    cand_simi_list.sort(key=lambda x: x[1], reverse=True)

    rec[user] = cand_simi_list[:N]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('total_user_item_comb: %d' % total_user_item_comb)

total_user_item_comb: 4619056


In [17]:
calculate_metrics(original_test, rec)

metric calculation: time consumption: 0


{'f1': 0.02121629193617704,
 'precision': 0.08125,
 'recall': 0.01220115003660345}

In [73]:
calculate_metrics(original_test, rec)

metric calculation: time consumption: 0


{'f1': 0.02121629193617704,
 'precision': 0.08125,
 'recall': 0.01220115003660345}

In [11]:
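# Time-overhead experiment: re-run the content-based pipeline for several
# train_percent values and record the metrics and elapsed time in SQLite.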


def tmp_set(vec, i, val):
    """Store ``val`` at index ``i`` of ``vec`` in place.

    Exists so that an item assignment can be performed from inside an
    expression (e.g. a lambda).  Always returns ``None``.
    """
    vec[i] = val
    return None

def extract_genres(filename, delimiter, genre_delimiter):
    """Read an item file and return the genre list of every item.

    Each non-blank line is expected to look like
    ``<itemId><delimiter><title><delimiter><genres>`` where ``<genres>`` is a
    ``genre_delimiter``-separated list.

    Parameters
    ----------
    filename : str
        Path of the item file.
    delimiter : str
        Separator between the id, title and genre fields.
    genre_delimiter : str
        Separator between the genres of one item.

    Returns
    -------
    dict
        Maps the (stripped) item id string to its list of genre names.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than three fields.
    """
    data = {}

    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # The id is the first field and the genres the last one; splitting
            # from both ends keeps a title that itself contains the delimiter
            # from breaking the parse.
            itemId, remainder = line.split(delimiter, 1)
            title, genre_list = remainder.rsplit(delimiter, 1)

            data[itemId.strip()] = genre_list.strip().split(genre_delimiter)
    return data

    
def generate_item_repr(item_info, genre_index=None):
    """Build a multi-hot genre vector for every item.

    Parameters
    ----------
    item_info : dict
        Maps item id -> iterable of genre names.
    genre_index : dict, optional
        Maps genre name -> position in the vector.  Positions are assumed to
        be ``0 .. len(genre_index) - 1``.  Defaults to the notebook-level
        ``genres_index_dict``.

    Returns
    -------
    dict
        Maps item id -> integer numpy array with a 1 at the position of every
        genre the item has and 0 elsewhere.

    Raises
    ------
    KeyError
        If an item lists a genre that is not in ``genre_index``.
    """
    if genre_index is None:
        genre_index = genres_index_dict
    vector_size = len(genre_index)

    item_repr = {}
    for item, item_genres in item_info.items():
        f = np.zeros(vector_size, dtype=int)
        # An explicit loop instead of map(): map() is lazy on Python 3, so
        # using it only for its side effects would leave the vector all zero.
        for genre in item_genres:
            f[genre_index[genre]] = 1
        item_repr[item] = f
    return item_repr




def generate_user_repr(user_history, item_repr, item_repr_size):
    """Build a user profile as the rating-weighted average of item vectors.

    Parameters
    ----------
    user_history : iterable
        Entries that unpack as ``(item, rate, timestamp)``; ``rate`` must be
        numeric (it scales the item vector and is passed to ``int()``).
    item_repr : dict
        Maps item id -> numpy vector.
    item_repr_size : int
        Length of the item vectors; used for the all-zero profile returned
        when there is nothing to average.

    Returns
    -------
    numpy.ndarray
        ``sum(item_repr[item] * rate) / sum(int(rate))`` over the history, or
        a zero vector of length ``item_repr_size`` when the history is empty
        or the ratings sum to zero (instead of a NaN/inf result).
    """
    # Explicit unpacking in comprehensions replaces the tuple-parameter
    # lambda, which is a syntax error on Python 3.
    weighted_vectors = [item_repr[item] * rate for item, rate, timestamp in user_history]
    rate_total = sum(int(rate) for item, rate, timestamp in user_history)
    if not weighted_vectors or rate_total == 0:
        return np.zeros(item_repr_size)
    user_repr = np.sum(weighted_vectors, axis=0) * 1. / rate_total
    return user_repr
      

###    
    


###

# Ratings file in use (MovieLens 1M), its field separator and a short
# data-set tag that becomes part of the metrics table name.
data_filename, delimiter, data_set = os.path.sep.join(['ml-1m', 'ratings.dat']), '::', '1M'
# Alternative data sets:
#data_filename, delimiter = os.path.sep.join(['ml-10M100K', 'ratings.dat']), '::'
#data_filename, delimiter, data_set = os.path.sep.join(['ml-100k', 'u.data']), '\t', '100K'

N = 20                      # length of each recommendation list
seed = 2
train_percent = 0.8         # rebound by the experiment loop for every run
test_data_inner_ratio = 0.8
test = None


# Training ratios to evaluate, one experiment run per value.
train_percent_list = [0.5, 0.6, 0.7, 0.8,]# 0.9]
test_fixed_ratio = 0.2

###

# One metrics table per (N, data set) combination.
table_name_prefix = 'metrics__chap4_exp_X_time_complexity__content_based__N_%d__da_%s'
table_name = table_name_prefix % (N, data_set)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('table_name: %s' % table_name)


# Open (or create) the metrics database and make sure the result table exists.
cx = sqlite3.connect('my_metrics.db')
cur = cx.cursor()

# The table name is compared as a *value* here, so bind it as a parameter
# instead of splicing it into the SQL text.
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?;", (table_name,))
ret = cur.fetchall()
if 0 == len(ret):
    # An identifier cannot be bound as a parameter, so the table name (built
    # from notebook constants) is still interpolated into the DDL.
    sql = '''create table %s (
_row_ID integer	primary key autoincrement,

size integer,
min_count integer,
window integer,

train_percent decimal(30, 28),

precision decimal(30, 28),
recall decimal(30, 28),
f1 decimal(30, 28),

train_overhead integer,
test_overhead integer,
overall_overhead integer,

CreatedTime TimeStamp NOT NULL DEFAULT (datetime('now','localtime'))
);''' % (table_name)
    cur.execute(sql)
    cx.commit()

###

# Run the whole content-based pipeline once per training ratio, time it, and
# store one row of metrics per run.

# Values are bound as parameters (no float -> text round trip); only the
# table name, which is an identifier, has to be interpolated.
insert_sql = ('insert into %s (size, min_count, window, train_percent, precision, recall, f1, '
              'train_overhead, test_overhead, overall_overhead) '
              'values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)') % (table_name)

try:
    for train_percent in train_percent_list:
        starttime = time.time()               # start of the timed region

        train, original_test = extract_data_from_file_and_generate_train_and_test(data_filename,
                                                                                  train_percent,
                                                                                  seed,
                                                                                  delimiter,
                                                                                  test_data_inner_ratio,
                                                                                  test_fixed_ratio=test_fixed_ratio)

        # Item side: genre lists -> multi-hot vectors (kept inside the timed
        # region, as before).
        item_file_name, item_file_delimiter, genre_delimiter = os.path.sep.join(['ml-1m', 'movies.dat']), '::', '|'
        item_info = extract_genres(item_file_name, item_file_delimiter, genre_delimiter)

        item_repr = generate_item_repr(item_info)

        all_items = set(item_repr.keys())
        print(len(all_items))

        # User side: one profile vector per test user, from the history part.
        user_repr = {}
        for user in original_test:
            history, future = original_test[user]
            user_repr[user] = generate_user_repr(history, item_repr, len(genres))

        # main: core of content-based recommendation

        # An item's norm does not depend on the user: compute each one once
        # per run instead of once per (user, candidate) pair.
        item_norm = dict((item, la.norm(vec)) for item, vec in item_repr.items())

        total_user_item_comb = 0
        rec = {}
        for user in original_test:
            history, future = original_test[user]
            history_items = set(x[0] for x in history)
            candidates = all_items - history_items # filtering out those interacted

            total_user_item_comb += len(candidates)

            user_vec = user_repr[user]
            user_norm = la.norm(user_vec)  # invariant across this user's candidates

            cand_simi_list = []
            for candy in candidates:
                denominator = user_norm * item_norm[candy]
                if denominator > 0:
                    simi = user_vec.dot(item_repr[candy]) / denominator
                else:
                    # Zero vector: score 0 instead of NaN, which would make
                    # the sort order below undefined.
                    simi = 0.0
                cand_simi_list.append((candy, simi))

            # Stable descending sort: same ordering as sorting on the negated score.
            cand_simi_list.sort(key=lambda x: x[1], reverse=True)

            rec[user] = cand_simi_list[:N]

        print('total_user_item_comb: %d' % total_user_item_comb)

        metrics = calculate_metrics(original_test, rec)

        endtime = time.time()
        total_overhead = endtime - starttime
        print('content-based total time consumption: %g' % (total_overhead))

        print(metrics)

        precision, recall, f1 = metrics['precision'], metrics['recall'], metrics['f1']

        # -1 marks the columns that do not apply to this method.
        cur.execute(insert_sql,
                    (-1, -1, -1, train_percent, precision, recall, f1, -1, -1, total_overhead))

        cx.commit()
finally:
    # Release the cursor and the connection even if a run fails part-way.
    cur.close()
    cx.close()

table_name: metrics__chap4_exp_X_time_complexity__content_based__N_20__da_1M
len(train): 3009
len(test): 604
sort_by_time: False
3883
total_user_item_comb: 2326241
metric calculation: time consumption: 0
content-based total time consumption: 18.8098
{'recall': 0.014857674913540835, 'precision': 0.09246688741721855, 'f1': 0.025601650240660093}
len(train): 3608
len(test): 725
sort_by_time: False
3883
total_user_item_comb: 2790630
metric calculation: time consumption: 0
content-based total time consumption: 22.0611
{'recall': 0.014452749433985671, 'precision': 0.09641379310344828, 'f1': 0.025137329293619472}
len(train): 4217
len(test): 846
sort_by_time: False
3883
total_user_item_comb: 3256413
metric calculation: time consumption: 0
content-based total time consumption: 25.709
{'recall': 0.013029287070051266, 'precision': 0.08682033096926714, 'f1': 0.022658214181049772}
len(train): 4840
len(test): 967
sort_by_time: False
3883
total_user_item_comb: 3722074
metric calculation: time consumpt