In [1]:
import pandas as pd
from scipy import sparse
from implicit.nearest_neighbours import CosineRecommender

In [2]:
# Input CSV locations, relative to the notebook's working directory.
train_data_path, test_data_path = 'data/train.csv', 'data/test.csv'

In [3]:
# Load the train and test CSV files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Popularity baseline

In [4]:
# Count, per item, the rows with like == 1, most-liked items first.
item_popularity = (
    train_df.loc[train_df['like'] == 1]
    .groupby('item_id')[['user_id']]
    .count()
    .sort_values('user_id', ascending=False)
)
# Ids of the 60 most-liked items, used as the popularity fallback pool.
top_60 = item_popularity.index[:60]
item_popularity.head(3)

Unnamed: 0_level_0,user_id
item_id,Unnamed: 1_level_1
76,97
22,77
35,76


In [6]:
# For every user_id, collect all item ids from the training data into a list
# (every training row counts, whatever its 'like' value).
user_watched = (
    train_df
    .groupby('user_id')['item_id']
    .agg(list)
)
user_watched.head(3)

user_id
0    [125, 29, 96, 213, 212, 89, 54, 1, 108, 214, 2...
1    [54, 20, 0, 27, 108, 213, 210, 4, 132, 5, 3, 2...
2    [163, 205, 229, 231, 222, 227, 221, 6, 224, 23...
Name: item_id, dtype: object

In [7]:
def array_difference(first, second):
    """ Return elements from first array but not in second

        first: iterable of candidate elements
        second: iterable of elements to exclude

        Returns a list that keeps the order (and any duplicates) of first.
    """
    try:
        # A set gives constant-time membership tests instead of scanning
        # the whole of second for every element of first.
        excluded = set(second)
        return [item for item in first if item not in excluded]
    except TypeError:
        # Unhashable elements (e.g. lists) cannot go into a set:
        # fall back to the plain linear membership test.
        return [item for item in first if item not in second]


def recommend_popular(user_id, N=20):
    """ Recommend N popular items except already watched items

        user_id: real user id, i.e. a label of the user_watched index
        N: maximum number of items to return

        Returns a list of at most N item ids taken from top_60 in its order;
        the list is shorter than N when fewer unwatched popular items remain.
    """
    # user_watched is indexed by user_id, so the lookup must be by label:
    # positional .iloc only gives the same row when ids are exactly 0..n-1.
    # A user absent from the training data simply has an empty history.
    if user_id in user_watched.index:
        watched_history = user_watched.loc[user_id]
    else:
        watched_history = []
    recommends = array_difference(top_60, watched_history)
    return recommends[:N]

In [9]:
# One row per test user: the popular recommendations followed by the user id.
recommendation_baseline = []
for user_id in test_df['user_id']:
    user_recs = [*recommend_popular(user_id), user_id]  # user_id goes in the last column
    recommendation_baseline.append(user_recs)

In [10]:
# Header: '0'..'19' for the recommended items, then the 'user_id' column.
column_names = list(map(str, range(20))) + ['user_id']
baseline_df = pd.DataFrame(data=recommendation_baseline, columns=column_names)

Score on the public leaderboard: 0.5392 (popularity baseline, saved below as `submit_01.csv`).

In [11]:
# Save the popularity-baseline recommendations; the DataFrame index is not written.
baseline_df.to_csv('submit_01.csv', index=False)

# Collaborative filtering with `implicit`

In [12]:
def revert_dict(dic):
    """ Swap keys and values of a dictionary (k:v becomes v:k).
        When several keys share one value, the key seen last wins.
    """
    reverted = {}
    for key, value in dic.items():
        reverted[value] = key
    return reverted


def make_sparse(data_df, user_col='user_id', items_col='item_id', target_col='like'):
    """ Converting pandas interaction DataFrame to csr sparse matrix
        data_df: DataFrame with user/item interactions; it is copied, so the
                 caller's frame is left untouched
        user_col: name of user ids column
        items_col: name of item ids column
        target_col: name of target column, if you have implicit feedback, just create column
                    with constant value

        Returns a tuple (matrix, user_model_real_dict, item_model_real_dict):
        matrix is a users x items CSR matrix holding the target values, and
        each dict maps a matrix row / column index to the original user / item id.
    """
    # Work on the function argument only (never on a notebook-level global),
    # so the result does not depend on which cells were executed before.
    data_df = data_df.copy()
    data_df[user_col] = data_df[user_col].astype('category')
    data_df[items_col] = data_df[items_col].astype('category')
    data_df[target_col] = data_df[target_col].astype('float')

    user_categories = data_df[user_col].cat.categories
    item_categories = data_df[items_col].cat.categories
    # Category codes are the matrix indices; enumerate() maps them back to real ids.
    user_model_real_dict = dict(enumerate(user_categories))
    item_model_real_dict = dict(enumerate(item_categories))

    # The explicit shape keeps the matrix dimensions in sync with the two dicts.
    data_coo = sparse.coo_matrix(
        (data_df[target_col],
         (data_df[user_col].cat.codes, data_df[items_col].cat.codes)),
        shape=(len(user_categories), len(item_categories)),
    )

    return data_coo.tocsr(), user_model_real_dict, item_model_real_dict

In [13]:
# Only rows with like == 1 are used as feedback for the model.
feedback_df = train_df.loc[train_df['like'].eq(1)].copy()

# Sparse matrix plus the index -> real id maps for users and items.
train_csr, user_model_real, item_model_real = make_sparse(feedback_df)
# Inverse user map: real user id -> matrix row index.
user_real_model = revert_dict(user_model_real)

In [14]:
# Cosine-similarity recommender from implicit.nearest_neighbours; 2 is its first
# positional argument (presumably the neighbourhood size K — confirm in the implicit docs).
model = CosineRecommender(2)
# The model is fitted on the transpose of the matrix returned by make_sparse.
# NOTE(review): the orientation fit() expects depends on the installed implicit
# version — confirm before upgrading the library.
model.fit(train_csr.T)

HBox(children=(FloatProgress(value=0.0, max=203.0), HTML(value='')))




In [15]:
recommendations = []
N_recommendations = 20
for user_id in test_df['user_id']:
    # Users with no like == 1 rows in train are missing from user_real_model
    # (and from train_csr); they get the popular-items fallback only.
    user_model_id = user_real_model.get(user_id)  # encoding user_id
    if user_model_id is None:
        predict = []
    else:
        predict = model.recommend(user_model_id, train_csr, N=N_recommendations,
                                  filter_already_liked_items=True)

    # NOTE(review): unpacking (item, score) pairs assumes model.recommend returns
    # a list of tuples — confirm against the installed implicit version.
    predict_decoded = [item_model_real[itm] for itm, score in predict]  # decoding item_id

    # user_watched is indexed by user_id: look the user up by label, not by position.
    if user_id in user_watched.index:
        watched_history = user_watched.loc[user_id]
    else:
        watched_history = []
    predict_decoded = array_difference(predict_decoded, watched_history)  # filter already watched

    # Top up with popular items when the model produced too few unseen ones.
    need_more = N_recommendations - len(predict_decoded)
    if need_more > 0:
        more_items = array_difference(top_60, predict_decoded)  # filter already recommended
        more_items = array_difference(more_items, watched_history)  # filter already watched
        predict_decoded.extend(more_items[:need_more])

    predict_decoded.append(user_id)  # add user_id column
    recommendations.append(predict_decoded)

In [16]:
# Build the submission table; column_names is the header list created for the
# baseline submission ('0'..'19' plus 'user_id').
final_recs = pd.DataFrame(recommendations, columns=column_names)

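Score on the public leaderboard: 0.5452 (cosine item-item model with popularity fallback, saved below as `submit_02.csv`).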

In [17]:
# Save the model-based recommendations; the DataFrame index is not written.
final_recs.to_csv('submit_02.csv', index=False)