### Hybrid Popularity Model for Solving Cold-start Problem in Recommendation System
---
Based on: https://dl.acm.org/doi/pdf/10.1145/3427423.3427425

In [6]:
import pandas as pd
import numpy as np

# from kmodes.kmodes import KModes
# from fcmeans import FCM

ModuleNotFoundError: No module named 'pandas'

## Toy Example
---

In [None]:
# Table 1. Toy example of rating data
df = pd.read_csv('data/toy_example.csv', index_col=0)
zero_mask = df == 0
df

In [None]:
# Table 2. Toy example of item popularity model 𝑰𝑰
ip = df.sum(axis=0)
max_ip = max(ip)
ip_norm = ip / max_ip

ip_df = pd.DataFrame([ip, ip_norm], index=["Item Popularity", "Item Popularity (Normalized)"])
ip_df

In [None]:
# Table 3. Toy example of user popularity model
up = df.sum(axis=1)
max_up = max(up)
up_norm = up / max_up

up_df = pd.DataFrame([up, up_norm], index=["Item Popularity", "Item Popularity (Normalized)"])
up_df.T

In [None]:
# Table 4. Toy example of user-item popularity model
xv, yv = np.meshgrid(up_norm, ip_norm, indexing='ij')
uip = xv / yv
uip[zero_mask] = 0
uip = pd.DataFrame(uip, columns=df.columns, index=df.index)
uip

In [None]:
# Table 5. Toy example of hybrid popularity model 
alpha = 0.5

hp = alpha * ip_norm + (1 - alpha) * uip[:3].sum()

hp_df = pd.DataFrame([hp], index=["Hybrid Popularity"]).T
hp_df.sort_values("Hybrid Popularity", ascending=False)

# 5.1 Dataset and Experiment Setup
---
- Source: https://grouplens.org/datasets/movielens/100k/

In [115]:
# Ratings
raw = pd.read_csv('data/raw/u.data', delimiter='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings = pd.pivot_table(raw, index='user_id', columns=['item_id'], values='rating')
ratings = ratings.fillna(0)
print(ratings.shape)
ratings.head()

(943, 1682)


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
all_ratings = pd.pivot_table(raw, index="user_id", columns="item_id", values="rating")

In [118]:
# Split train/test
np.random.seed(123)
training_ids = np.random.choice(all_ratings.index, np.int32(937*0.9), replace=False)
train_ratings = all_ratings[all_ratings.index.isin(training_ids)]
test_ratings = all_ratings[~all_ratings.index.isin(training_ids)]

no_rating_mask = train_ratings.isna()
print(test_ratings.shape)

(100, 1682)


In [119]:
# Item popularity
ip = train_ratings.sum(axis=0)
max_ip = max(ip)
ip_norm = ip / max_ip

In [121]:
# Item popularity
up = train_ratings.sum(axis=1)
# up = up.sort_values(ascending=False)[:40]
max_up = max(up)
up_norm = up / max_up

In [122]:
# User Item Popularity
xv, yv = np.meshgrid(up_norm, ip_norm, indexing='ij')
uip = xv / yv
uip = pd.DataFrame(uip, columns=train_ratings.columns, index=train_ratings.index)
uip[no_rating_mask] = 0
uip

  uip = xv / yv


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.674077,2.768899,4.268263,1.569579,4.218245,12.412306,0.807078,1.372135,1.000807,3.395820,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.157879,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.795355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.345275,1.418285,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.526494,0.000000,0.000000,0.000000,0.000000,0.000000,0.630376,1.071719,0.781689,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.137585,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
938,0.242311,0.000000,0.000000,0.000000,0.000000,0.000000,0.290121,0.000000,0.359761,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.213003,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.061092,0.000000,0.000000,0.000000,0.000000,0.000000,0.073147,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
# HPop
alpha = 1
hp = (alpha * ip_norm) + ((1 - alpha) * uip.sum())

In [125]:
# NDCG@1
N = 20 # number of recommended items
IDCG = np.sum(1 / (np.log2(1+np.arange(1, N+1))))
# Calculated for all users and items
hpop_top_N = hp.sort_values(ascending=False).head(N)
# Top 10 Recommended Items 
hpop_top_N.index

Index([ 50, 100, 181, 258, 127,   1, 174, 286,  98, 288,  56, 300, 294, 172,
       121,   7, 313, 237, 117, 222],
      dtype='int64', name='item_id')

In [126]:
ndcg = []
precision = []
recall = []
for ir, iuser in test_ratings.iterrows():
    hu = iuser[iuser.notna()].index
    nominator = len(set(hpop_top_N.index).intersection(set(hu)))
    # precision
    precision.append(nominator / N)
    # recall
    recall.append(nominator / len(hu))
    # DCGu(N)
    dcg_u = np.sum([n in hu for n in hpop_top_N.index] / (np.log2(np.arange(N)+2)))
    ndcg.append(dcg_u / IDCG)

In [127]:
100*np.mean(recall)

13.374884134306148

In [128]:
100*np.mean(precision)

42.4

In [129]:
np.mean(ndcg)

0.4430790147814235

In [None]:
# NDCG@1
dict = 
for alpha in [1,0.9,0]:
    hp = (alpha * ip_norm) + ((1 - alpha) * uip.sum())
    ndcg_mean = []
    precision_mean = []
    recall_mean = []
    for N in range(1,21):
        IDCG = np.sum(1 / (np.log2(1+np.arange(1, N+1))))
        # Calculated for all users and items
        hpop_top_N = hp.sort_values(ascending=False).head(N)
        # Top 10 Recommended Items 
        hpop_top_N.index
        ndcg = []
        precision = []
        recall = []
        for ir, iuser in test_ratings.iterrows():
            hu = iuser[iuser.notna()].index
            nominator = len(set(hpop_top_N.index).intersection(set(hu)))
            # precision
            precision.append(nominator / N)
            # recall
            recall.append(nominator / len(hu))
            # DCGu(N)
            dcg_u = np.sum([n in hu for n in hpop_top_N.index] / (np.log2(np.arange(N)+2)))
            ndcg.append(dcg_u / IDCG)
        ndcg_mean.append(100 * np.mean(ndcg))
        precision_mean.append(100 * np.mean(precision))
        recall_mean.append(100 * np.mean(recall))

In [3]:
z = {'k':[1,2,3]}
z['k'].append([1,1,1])
z

{'k': [1, 2, 3, [1, 1, 1]]}

In [5]:
np.mean(z['k'])

NameError: name 'np' is not defined