In [26]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import bokeh
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
import scipy.special


In [27]:
from bokeh.io import output_notebook, show
output_notebook()

In [28]:
pd.set_option("display.max_columns", 1000)


In [29]:
log = pd.read_csv("./data/log.tsv", sep="\t")
events = pd.read_csv("./data/events.tsv", sep="\t")
users = pd.read_csv("./data/users.tsv", sep="\t")


In [30]:
log.loc[:,"user_id_str"] = log["user_id"].apply(lambda x:str(x))
log.loc[:,"event_id_str"] = log["event_id"].apply(lambda x:str(x))
print(log.shape)
log.head()

(1452089, 9)


Unnamed: 0,user_id,event_id,time_stamp,action_type,num_of_people,payment_method,total_price,user_id_str,event_id_str
0,1,6261,2015-03-19 09:15:50,3,1.0,クレカ,4000.0,1,6261
1,1,127600,2017-06-05 12:33:17,1,,,,1,127600
2,1,127600,2017-06-05 12:37:09,1,,,,1,127600
3,1,125296,2017-06-06 14:01:27,1,,,,1,125296
4,1,125128,2017-06-06 14:03:57,1,,,,1,125128


In [36]:
#量が多いから一時的に減らす
log = log.sample(n = 10000
                 , random_state=0)
print(log.shape)

(10000, 9)


In [37]:
users_interactions_count_df = log.groupby(["user_id", "event_id"]).size().groupby("user_id").size()
print(users_interactions_count_df.shape)
users_interactions_count_df.head()

(7904,)


user_id
1     8
6     2
13    1
21    1
22    1
dtype: int64

In [38]:
#ユーザーで層化してtrain_test_splitするには各人最低２回記録されてないといけない
#tutorialにしたがって5回にしておく

users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5 ].reset_index()[["user_id"]]
print(users_with_enough_interactions_df.shape)
display(users_with_enough_interactions_df.head())

(72, 1)


Unnamed: 0,user_id
0,1
1,1923
2,3570
3,4217
4,4822


In [39]:
interactions_from_selected_users_df = log.merge(users_with_enough_interactions_df, how ="right", left_on = "user_id", right_on = "user_id")
print(interactions_from_selected_users_df.shape)
display(interactions_from_selected_users_df.head())

(479, 9)


Unnamed: 0,user_id,event_id,time_stamp,action_type,num_of_people,payment_method,total_price,user_id_str,event_id_str
0,17886,93583,2017-01-12 16:05:32,2,,,,17886,93583
1,17886,109228,2017-04-21 10:31:19,2,,,,17886,109228
2,17886,132634,2017-06-19 16:54:26,2,,,,17886,132634
3,17886,132639,2017-05-26 01:41:15,2,,,,17886,132639
4,17886,143570,2017-06-20 01:57:37,1,,,,17886,143570


In [40]:
#複数回action_typeが記録された時に"smooth"にするらしい（よくわからない）
def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df.groupby(["user_id","event_id"])["action_type"].sum().apply(smooth_user_preference).reset_index()
print(interactions_full_df.shape)
display(interactions_full_df.head())

(475, 3)


Unnamed: 0,user_id,event_id,action_type
0,1,109245,1.0
1,1,119556,1.0
2,1,128589,1.0
3,1,133128,1.0
4,1,133827,1.0


In [41]:
#ユーザーで層化
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df, stratify = interactions_full_df["user_id"],
                                                              test_size = 0.20, random_state=42)
print(interactions_train_df.shape)
print(interactions_test_df.shape)

(380, 3)
(95, 3)


In [42]:
#user_idをindexにする
interactions_full_indexed_df = interactions_full_df.set_index('user_id')
interactions_train_indexed_df = interactions_train_df.set_index('user_id')
interactions_test_indexed_df = interactions_test_df.set_index('user_id')
interactions_test_indexed_df.head()

Unnamed: 0_level_0,event_id,action_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
22247,132253,1.0
17690,64706,1.584963
18051,158049,1.0
23169,141866,1.0
16777,91240,1.584963


In [43]:
def get_items_interacted(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]["event_id"]
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [48]:
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100


class ModelEvaluator:
    
    def get_not_interacted_items_sample(self, person_id, sample_size, seed =42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(log["user_id"])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
    
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        try:
            index = next(i for i,c in enumerate(recommended_items) if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index
    
    def evaluate_model_for_user(self, model, person_id):
        
        
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        
        
        if type(interacted_values_testset["event_id"]) == pd.Series:
            
            person_interacted_items_testset = set(interacted_values_testset["event_id"])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset["event_id"])])
            
            
        interacted_items_count_testset = len(person_interacted_items_testset)
        
        person_recs_df = model.recommend_items(person_id, items_to_ignore = get_items_interacted(person_id, interactions_train_indexed_df),
                                              topn = 1000)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        
        for item_id in person_interacted_items_testset:
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                        sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, seed = item_id%(2**32))
            
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))
            
            #print(person_recs_df[person_recs_df["event_id"].isin(items_to_filter_recs)])
            
            valid_recs_df = person_recs_df[person_recs_df["event_id"].isin(items_to_filter_recs)]
            
            valid_recs = valid_recs_df["event_id"].values
            
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10
            
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)
            
        person_metrics = {"hit@5_count": hits_at_5_count,
                             "hit@10_count": hits_at_10_count,
                             "interacted_count": interacted_items_count_testset,
                             "recall@5": recall_at_5,
                             "recall@10": recall_at_10}
            
        return person_metrics
        
    def evaluate_model(self, model):
        print ("Ruuning evaluation for users")
            
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            if idx % 100 == 0 and idx > 0:
                print("%d userd processed" % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics["_person_id"] = person_id
            people_metrics.append(person_metrics)
        print("%d users processed" % idx)
            
        detailed_results_df = pd.DataFrame(people_metrics).sort_values("interacted_count", ascending=False)
            
        global_recall_at_5 = detailed_results_df["hit@5_count"].sum()/float(detailed_results_df["interacted_count"].sum())
        global_recall_at_10 = detailed_results_df["hit@10_count"].sum()/float(detailed_results_df["interacted_count"].sum())
            
        global_metrics = {"modelName": model.get_model_name(),
                             "recall@5": global_recall_at_5,
                              "recall@10": global_recall_at_10
                             }
        return global_metrics, detailed_results_df
        
model_evaluator = ModelEvaluator()
            
            

# popularity推薦

In [49]:
#人気のeventを推薦
item_popularity_df = interactions_full_df.groupby("event_id")["action_type"].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,event_id,action_type
0,154646,2.584963
1,10364,2.0
2,109965,2.0
3,139907,2.0
4,54998,2.0
5,56561,2.0
6,126512,2.0
7,155314,2.0
8,150141,2.0
9,89834,2.0


In [50]:
class PopularityRecommender:
    
    MODEL_NAME = "Popularity"
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        recommendations_df = self.popularity_df[~self.popularity_df["event_id"].isin(items_to_ignore)].sort_values("action_type", ascending = False).head(topn)
        
        
        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
                
            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'event_id', 
                                                          right_on = 'evnt_id')[['action_type', 'event_id']]
            
            
        return recommendations_df
        
popularity_model = PopularityRecommender(item_popularity_df, log)

In [51]:
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print("\nGlobal metrics:\n%s" % pop_global_metrics)
pop_detailed_results_df.head(10)

Ruuning evaluation for users
71 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 1.0, 'recall@10': 1.0}


Unnamed: 0,_person_id,hit@10_count,hit@5_count,interacted_count,recall@10,recall@5
45,3570,3,3,3,1.0,1.0
2,18051,3,3,3,1.0,1.0
28,8153,3,3,3,1.0,1.0
41,6834,3,3,3,1.0,1.0
37,18941,2,2,2,1.0,1.0
30,29217,2,2,2,1.0,1.0
44,22067,2,2,2,1.0,1.0
31,17079,2,2,2,1.0,1.0
18,1,2,2,2,1.0,1.0
50,22406,2,2,2,1.0,1.0
