In [1]:
!pip install implicit



In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ndcg_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('hse_train.csv')
train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,1511701649
1,258671,115615,1511841435
2,258671,176624,1512105022
3,240498,45484,1511605442
4,240498,39504,1511756830


In [3]:
train.shape

(4842338, 3)

In [4]:
train.timestamp = pd.to_datetime(train["timestamp"], unit='s')

In [5]:
train.timestamp.min(), train.timestamp.max()

(Timestamp('2017-11-24 16:00:00'), Timestamp('2017-12-02 23:59:59'))

In [6]:
max_timestamp = train['timestamp'].max()
train['days_ago'] = (max_timestamp - train['timestamp']).dt.days
train.head()

Unnamed: 0,user_id,item_id,timestamp,days_ago
0,258671,74254,2017-11-26 13:07:29,6
1,258671,115615,2017-11-28 03:57:15,4
2,258671,176624,2017-12-01 05:10:22,1
3,240498,45484,2017-11-25 10:24:02,7
4,240498,39504,2017-11-27 04:27:10,5


То есть собраны данные за неделю

In [7]:
ALL_USERS = train['user_id'].unique().tolist()
ALL_ITEMS = train['item_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

train['user_ids'] = train['user_id'].map(user_map)
train['item_ids'] = train['item_id'].map(item_map)

In [8]:
row = train['user_ids'].values
col = train['item_ids'].values
data = np.ones(train.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))

In [9]:
def to_user_item_coo(df, shape):
    """ Turn a dataframe with transactions into a COO sparse users x items matrix"""
    row = df['user_ids'].values
    col = df['item_ids'].values
    data = np.ones(df.shape[0])
    return coo_matrix((data, (row, col)), shape=shape)


def split_data(df, validation_days=1):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>
    """
    df_train = df[df['days_ago'] > validation_days]
    df_val = df[df['days_ago'] <= validation_days]
    print(f"Train shape: {df_train.shape} - Validation shape: {df_val.shape}")
    print(f"Train users: {df_train['user_ids'].nunique()} - Validation users: {df_val['user_ids'].nunique()}")
    print(f"Train items: {df_train['item_ids'].nunique()} - Validation items: {df_val['item_ids'].nunique()}")
    print(f"Validation days: {df_val['timestamp'].min()} - {df_val['timestamp'].max()}")
    print(f"Train days: {df_train['timestamp'].min()} - {df_train['timestamp'].max()}")
    return df_train, df_val

def get_val_matrices(df, validation_days=1):
    df_train, df_val = split_data(df, validation_days=validation_days)
    
    # Оставляем в df_val только пользователей, которые были в df_train
    train_users = set(df_train['user_ids'].unique())
    df_val = df_val[df_val['user_ids'].isin(train_users)]
    
    shape = (len(ALL_USERS), len(ALL_ITEMS))
    
    coo_train = to_user_item_coo(df_train, shape=shape)
    coo_val = to_user_item_coo(df_val, shape=shape)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {
        'coo_train': coo_train,
        'csr_train': csr_train,
        'csr_val': csr_val
    }


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with NDCG@20
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = AlternatingLeastSquares(factors=factors, 
                                  iterations=iterations, 
                                  regularization=regularization, 
                                  random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    ndcg20 = ndcg_at_k(model, csr_train, csr_val, K=20, show_progress=show_progress)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> NDCG@20: {ndcg20}")
    return ndcg20

In [10]:
matrices = get_val_matrices(train, validation_days=1)

Train shape: (3783404, 6) - Validation shape: (1058934, 6)
Train users: 667411 - Validation users: 329948
Train items: 179795 - Validation items: 167634
Validation days: 2017-12-01 00:00:00 - 2017-12-02 23:59:59
Train days: 2017-11-24 16:00:00 - 2017-11-30 23:59:59


In [11]:
import warnings
warnings.simplefilter(action='ignore')

In [12]:
%%time
best_ndcg20 = 0
regularization = 0.01
for factors in [50, 100, 500]:
    for iterations in [10, 15, 20]:
        ndcg20 = validate(matrices, factors, iterations, regularization, show_progress=True)
        if ndcg20 > best_ndcg20:
            best_ndcg20 = ndcg20
            best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
            print(f"Best NDCG@20 found. Updating: {best_params}")

100%|██████████| 10/10 [07:12<00:00, 43.22s/it]
100%|██████████| 295378/295378 [04:30<00:00, 1093.40it/s]


Factors:  50 - Iterations: 10 - Regularization: 0.010 ==> NDCG@20: 0.0034024333193364395
Best NDCG@20 found. Updating: {'factors': 50, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 15/15 [10:21<00:00, 41.43s/it]
100%|██████████| 295378/295378 [04:15<00:00, 1155.65it/s]


Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> NDCG@20: 0.003435213064699296
Best NDCG@20 found. Updating: {'factors': 50, 'iterations': 15, 'regularization': 0.01}


100%|██████████| 20/20 [14:08<00:00, 42.45s/it]
100%|██████████| 295378/295378 [04:37<00:00, 1065.21it/s]


Factors:  50 - Iterations: 20 - Regularization: 0.010 ==> NDCG@20: 0.0034199797412224307


100%|██████████| 10/10 [07:52<00:00, 47.22s/it]
100%|██████████| 295378/295378 [05:06<00:00, 962.68it/s] 


Factors: 100 - Iterations: 10 - Regularization: 0.010 ==> NDCG@20: 0.0035270107097415703
Best NDCG@20 found. Updating: {'factors': 100, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 15/15 [12:20<00:00, 49.35s/it]
100%|██████████| 295378/295378 [05:06<00:00, 964.35it/s] 


Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> NDCG@20: 0.003568705990196665
Best NDCG@20 found. Updating: {'factors': 100, 'iterations': 15, 'regularization': 0.01}


100%|██████████| 20/20 [16:22<00:00, 49.12s/it]
100%|██████████| 295378/295378 [05:00<00:00, 982.87it/s] 


Factors: 100 - Iterations: 20 - Regularization: 0.010 ==> NDCG@20: 0.003583068982511578
Best NDCG@20 found. Updating: {'factors': 100, 'iterations': 20, 'regularization': 0.01}


100%|██████████| 10/10 [13:38<00:00, 81.89s/it]
100%|██████████| 295378/295378 [15:03<00:00, 327.06it/s]


Factors: 500 - Iterations: 10 - Regularization: 0.010 ==> NDCG@20: 0.00425750949217909
Best NDCG@20 found. Updating: {'factors': 500, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 15/15 [21:09<00:00, 84.61s/it]
100%|██████████| 295378/295378 [14:36<00:00, 336.86it/s]


Factors: 500 - Iterations: 15 - Regularization: 0.010 ==> NDCG@20: 0.004259534714370213
Best NDCG@20 found. Updating: {'factors': 500, 'iterations': 15, 'regularization': 0.01}


100%|██████████| 20/20 [26:19<00:00, 78.97s/it]
100%|██████████| 295378/295378 [12:06<00:00, 406.85it/s]


Factors: 500 - Iterations: 20 - Regularization: 0.010 ==> NDCG@20: 0.004256413105545322
CPU times: total: 18h 56min 40s
Wall time: 3h 20min 12s


# ALS. Бейзлайн 2

In [29]:
train_users = train['user_id'].unique()
train_items = train['item_id'].unique()
max_timestamp = train['timestamp'].max()
train['days_ago'] = (max_timestamp - train['timestamp']).dt.days
train['weight'] = 1 / (1 + train['days_ago'] / 30)

In [30]:
train.head()

Unnamed: 0,user_id,item_id,timestamp,days_ago,weight
0,258671,74254,2017-11-26 13:07:29,6,0.833333
1,258671,115615,2017-11-28 03:57:15,4,0.882353
2,258671,176624,2017-12-01 05:10:22,1,0.967742
3,240498,45484,2017-11-25 10:24:02,7,0.810811
4,240498,39504,2017-11-27 04:27:10,5,0.857143


In [31]:
#Кодируем user_id и item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
train['user_id'] = user_encoder.fit_transform(train['user_id'])
train['item_id'] = item_encoder.fit_transform(train['item_id'])

In [32]:
train['user_id'].nunique(), train['user_id'].max(), train['item_id'].nunique(), train['item_id'].max()

(701981, 701980, 180599, 180598)

In [36]:
interactions = csr_matrix(
    (train['weight'].values, 
     (train['user_id'], train['item_id'])),
    shape=(train['user_id'].max()+1, train['item_id'].max()+1)
)

In [154]:
model = AlternatingLeastSquares(factors=100, iterations=15, regularization=0.01)
model.fit(interactions.T)



  0%|          | 0/15 [00:00<?, ?it/s]

In [64]:
item_counts = train['item_id'].value_counts()
popular_items = item_encoder.transform(item_counts.head(100).index.values)

recommendations = []

for user_id in tqdm(range(interactions.shape[0])):
    # Получаем рекомендации
    recommended = model.recommend(user_id, interactions[user_id], N=20, filter_already_liked_items=True)[0]
    # Преобразование обратно к оригинальным ID
    original_items = item_encoder.inverse_transform(recommended[:20])
    original_user_id = user_encoder.inverse_transform([user_id])[0]
    
    # Сохранение результатов
    for item_id in original_items:
        recommendations.append({'user_id': original_user_id, 'items': item_id})
df_submission = pd.DataFrame(recommendations)

100%|██████████| 701981/701981 [4:01:08<00:00, 48.52it/s]  


In [65]:
df_submission

Unnamed: 0,user_id,items
0,0,137568
1,0,3665
2,0,28020
3,0,102173
4,0,177158
...,...,...
14039615,701980,58593
14039616,701980,171750
14039617,701980,154143
14039618,701980,110938


In [66]:
df_submission.to_csv('als.csv', index=False)