In [4]:
import pandas as pd
import numpy as np

In [5]:
from tqdm import tqdm
import pickle
from collections import Counter

In [6]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [127]:
sample = pd.read_csv("sample_submission.csv")
interactions = pd.read_csv("interactions.csv")
items = pd.read_csv("items.csv")
users = pd.read_csv("users.csv")

In [5]:
interactions['start_date'] = pd.to_datetime(interactions['start_date'])
users.fillna(0,inplace=True)
#interactions.start_date.max() - pd.DateOffset(30)


In [73]:
user_ind = np.array([])
for i in sample.Id:
    user_ind = np.append(user_ind,np.where(interactions.user_id==i)[0])
merged = (interactions.iloc[user_ind].merge(items,left_on='item_id',right_on='id'))
int_item = (interactions.merge(items,left_on='item_id',right_on='id'))
int_user = interactions.merge(users,on='user_id')
int_user = int_user.merge(items,left_on='item_id',right_on='id')
int_item.authors.fillna('nan',inplace=True)

In [7]:
def count_genres_authors(data):
    counter_genre = Counter()
    counter_author = Counter()
    for i in tqdm(range(data.shape[0])):
        genres = str(data.iloc[i].genres).split(',')
        authors = str(data.iloc[i].authors).split(',')
        counter_genre.update(genres)
        counter_author.update(authors)
    return counter_genre,counter_author

In [8]:
counter_genre,counter_author = count_genres_authors(items)

100%|██████████| 63758/63758 [00:15<00:00, 4198.14it/s]


In [9]:
def make_author_genre(counter_genre,counter_author,data):
    author_genre = pd.DataFrame(0,index=list(counter_author.keys()),columns=list(counter_genre.keys()))
    for i in tqdm(range(data.shape[0])):
        genres = str(data.iloc[i].genres).split(',')
        authors = str(data.iloc[i].authors).split(',')
        for genre in genres:
            for author in authors:
                author_genre.loc[author,genre] += 1
    return author_genre

In [52]:
author_genre = make_author_genre(counter_genre,counter_author,merged)

100%|██████████| 91642/91642 [01:14<00:00, 1226.01it/s]


In [53]:
author_genre.to_csv('author_genre.csv')

In [56]:
author_genre = pd.read_csv('author_genre.csv',index_col=0)

In [112]:
%%time
user_list_less_8 = []
user_list_more_8 = []
for i,user in enumerate(sample.Id.values):
    if merged.loc[merged.user_id==user].shape[0]<10:
        user_list_less_8.append(i)
    else:
        user_list_more_8.append(i)

CPU times: user 1.49 s, sys: 8.35 ms, total: 1.5 s
Wall time: 1.5 s


In [13]:
len(user_list_less_8),len(user_list_more_8)

(875, 2199)

In [14]:
def find_users_attr(user):
    counter_genre = Counter()
    counter_author = Counter()
    row = merged.loc[merged.user_id==user].sort_values(by='start_date')
    
    for i in row.genres.to_list():
        counter_genre.update(i.split(','))
    
    for i in row.authors.to_list():
        counter_author.update(str(i).split(','))
        
    top1_genre = counter_genre.most_common(2)[0][0]
    top2_genre = counter_genre.most_common(2)[1][0]
    
    top1_author = counter_author.most_common(1)[0][0]
    if len(counter_author)==sum(counter_author.values()):
        top1_author = str(row.iloc[-1].authors).split(',')[0]
    
    last1_author = str(row.iloc[-1].authors).split(',')[0]
    
    if len(counter_author)==1:
        last2_author = str(row.iloc[-1].authors).split(',')[0]
        return top1_genre,top2_genre,top1_author,last1_author,last2_author
    
    last2_author = str(row.iloc[-2].authors).split(',')[0]
    
    return top1_genre,top2_genre,top1_author,last1_author,last2_author

In [15]:
male_top_15 = int_user.loc[int_user.start_date>'2019-12-01'].loc[int_user.sex==0].\
    groupby('item_id')['item_id'].count().sort_values(ascending=False).head(15).index.to_list()
female_top_15 = int_user.loc[int_user.start_date>'2019-12-01'].loc[int_user.sex==1].\
    groupby('item_id')['item_id'].count().sort_values(ascending=False).head(15).index.to_list()
all_top_15 = int_user.loc[int_user.start_date>'2019-12-01'].\
    groupby('item_id')['item_id'].count().sort_values(ascending=False).head(15).index.to_list()

In [16]:
def recommend_n_popular(n,user,user_activity,pred):
    user_items = user_activity.item_id.values
    gender = users.loc[users.user_id==user].sex.values
    prediction = []
    
    if len(gender) == 0:
        for item in all_top_15:
            if len(prediction)==n:
                break
            if item not in user_items and item not in pred:
                prediction.append(item)  
                
    elif gender == 0:
        for item in male_top_15:
            if len(prediction)==n:
                break
            if item not in user_items and item not in pred:
                prediction.append(item) 
                
    elif gender == 1:
        for item in female_top_15:
            if len(prediction)==n:
                break
            if item not in user_items and item not in pred:
                prediction.append(item)
                
    return pred + prediction

In [23]:
def make_author_most_pop(data,gender=None):
    author_most_pop = dict()
    
    if gender is None:
        for author in tqdm(list(counter_author.keys())):
            author_most_pop[author] = data.loc[data.authors == author].\
                        groupby('item_id')['item_id'].\
                        count().sort_values(ascending=False).index.to_list()
        return author_most_pop
    
    for author in tqdm(list(counter_author.keys())):
        author_most_pop[author] = data.loc[(data.sex==gender)&(data.authors == author)].\
                        groupby('item_id')['item_id'].\
                        count().sort_values(ascending=False).index.to_list()
         
    return author_most_pop

In [70]:
author_most_pop = make_author_most_pop(int_item)

100%|██████████| 18950/18950 [23:41<00:00, 13.33it/s]


In [74]:
author_most_pop_male = make_author_most_pop(int_user,0)

100%|██████████| 18950/18950 [19:16<00:00, 16.39it/s]


In [77]:
author_most_pop_female = make_author_most_pop(int_user,1)

100%|██████████| 18950/18950 [19:19<00:00, 16.34it/s]


In [71]:
save_obj(author_most_pop,'author_most_pop')

In [75]:
save_obj(author_most_pop_male,'author_most_pop_male')

In [78]:
save_obj(author_most_pop_female,'author_most_pop_female')

In [113]:
for user in tqdm(sample.iloc[user_list_less_8].Id.values):
    pred = recommend_n_popular(10,user,merged.loc[merged.user_id==user],[])
    answ = " ".join([str(i) for i in pred])
    sample.loc[sample.Id==user,'Predicted'] = answ

100%|██████████| 1977/1977 [00:03<00:00, 576.54it/s]


In [60]:
def add_item_to_pred(items,user_items,pred):
    for item in items:
        if (item not in pred) and (item not in user_items):
            pred.append(item)
            break
    return pred

In [114]:
for user in tqdm(sample.iloc[user_list_more_8].Id.values):
    pred = []
    gender = users.loc[users.user_id==user].sex.values
    top1_genre,top2_genre,top1_author,last1_author,last2_author = find_users_attr(user)
    user_activity = merged.loc[merged.user_id==user]
    user_items = user_activity.item_id.to_list()
    
    if top1_genre=='nan':
        top1_genre = str(top2_genre)
    if top2_genre=='nan':
        top2_genre = str(top1_genre)
    
    if len(gender)==0:
        most_pop = author_most_pop
    elif gender==0:
        most_pop=author_most_pop_male
    elif gender==1:
        most_pop=author_most_pop_female
    
    
    authors_items = most_pop[last1_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
    pred = add_item_to_pred(authors_items,user_items,pred)
    pred = add_item_to_pred(authors_items,user_items,pred)
    
    authors_items = most_pop[last2_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
    pred = add_item_to_pred(authors_items,user_items,pred)
    
    authors_items = most_pop[top1_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
            
    top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(1).index[0]
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top_genre_author)]
    else :
        authors_items = most_pop[top_genre_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
            
    top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(2).index[1]
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top_genre_author)]
    else :
        authors_items = most_pop[top_genre_author]
    
    pred = add_item_to_pred(authors_items,user_items,pred)
    
    
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(last1_author)]
    else :
        authors_items = most_pop[last1_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
    
    
    
    top_genre_author = author_genre.loc[:,top2_genre].sort_values(ascending=False).head(1).index[0]
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top_genre_author)]
    else :
        authors_items = most_pop[top_genre_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
            
    top_genre_author = author_genre.loc[:,top2_genre].sort_values(ascending=False).head(2).index[1]
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top_genre_author)]
    else :
        authors_items = most_pop[top_genre_author]
    pred = add_item_to_pred(authors_items,user_items,pred)
            
    top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(3).index[2]
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top_genre_author)]
    else :
        authors_items = most_pop[top_genre_author]
    pred = add_item_to_pred(authors_items,user_items,pred)   
    
    if str(top_genre_author) =='nan':
        authors_items = author_most_pop[str(top1_author)]
    else :
        authors_items = most_pop[top1_author]
    
    pred = add_item_to_pred(authors_items,user_items,pred)
    
    if len(pred)!=10:
        authors_items = most_pop[last1_author]
        pred = add_item_to_pred(authors_items,user_items,pred)
            
    if len(pred)!=10:
        top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(2).index[0]
        if str(top_genre_author) =='nan':
            authors_items = author_most_pop[str(top_genre_author)]
        else :
            authors_items = most_pop[top_genre_author]
        pred = add_item_to_pred(authors_items,user_items,pred)
                           
    if len(pred)!=10:
        second_popular = user_activity.groupby('authors')['authors'].count().\
                        sort_values(ascending=False).index.to_list()[1].split(',')[0]
        authors_items = most_pop[second_popular]
        pred = add_item_to_pred(authors_items,user_items,pred)
                
    if len(pred)!=10:
        top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(3).index[2]
        if str(top_genre_author) =='nan':
            authors_items = author_most_pop[str(top_genre_author)]
        else :
            authors_items = most_pop[top_genre_author]
        pred = add_item_to_pred(authors_items,user_items,pred)
                
    if len(pred)!=10:
        top_genre_author = author_genre.loc[:,top2_genre].sort_values(ascending=False).head(3).index[2]
        if str(top_genre_author) =='nan':
            authors_items = author_most_pop[str(top_genre_author)]
        else :
            authors_items = most_pop[top_genre_author]
        pred = add_item_to_pred(authors_items,user_items,pred)
    
    if len(pred)!=10:
        top_genre_author = author_genre.loc[:,top1_genre].sort_values(ascending=False).head(4).index[3]
        if str(top_genre_author) =='nan':
            authors_items = author_most_pop[str(top_genre_author)]
        else :
            authors_items = most_pop[top_genre_author]
        pred = add_item_to_pred(authors_items,user_items,pred)
                
    if len(pred)!=10:   
        pred.append(all_top_15[0])
        
    answ = " ".join([str(i) for i in pred])
    sample.loc[sample.Id==user,'Predicted'] = answ

100%|██████████| 1097/1097 [00:15<00:00, 70.30it/s]


In [115]:
sample.to_csv('content_based_v2.csv',index=False)