In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
# Load the training tables for both domains (zvuk first, then smm).
# Later cells read the columns user_id, item_id, rating and timestamp from them.
train_zvuk, train_smm = (
    pd.read_parquet(path)
    for path in ('train_zvuk.parquet', 'train_smm.parquet')
)

In [3]:
def _mean_rating_by(ratings, key, column_name):
    """Return a frame with ``key`` and the mean ``rating`` per ``key`` value, stored under ``column_name``."""
    return (
        ratings.groupby(key)['rating']
        .mean()
        .reset_index()
        .rename(columns={'rating': column_name})
    )


def _fill_with_column_mean(frame, column):
    """Replace missing values of ``frame[column]`` with that column's own mean (modifies ``frame``)."""
    frame[column] = frame[column].fillna(frame[column].mean())


def process_datasets(train_smm, train_zvuk,
                     timestamp_origin=1673740803033000000,
                     timestamp_divisor=10**6):
    """Build mean-rating lookup tables and normalise both training frames in place.

    Side effects (callers rely on these): for BOTH input frames the 'timestamp'
    column is shifted by ``timestamp_origin`` and divided by ``timestamp_divisor``,
    and the frame is then sorted by 'timestamp' in place. Calling the function
    twice on the same frames therefore shifts/scales the timestamps twice.

    Parameters
    ----------
    train_smm, train_zvuk : pandas.DataFrame
        Frames with the columns 'user_id', 'item_id', 'rating' and 'timestamp'.
    timestamp_origin : int, optional
        Value subtracted from every timestamp.
        NOTE(review): the default looks like a nanosecond Unix timestamp - confirm.
    timestamp_divisor : int, optional
        Value the shifted timestamps are divided by (the column becomes float).

    Returns
    -------
    (user_meta, item_meta_zvuk, item_meta_smm)
        user_meta: one row per user seen in either frame, with the columns
        'mean_user_rating_smm' and 'mean_user_rating_zvuk'; a user missing from
        one domain gets that domain's average of per-user means.
        item_meta_zvuk / item_meta_smm: 'item_id' and 'mean_item_rating' per domain.
    """
    for interactions in (train_smm, train_zvuk):
        interactions['timestamp'] -= timestamp_origin
        interactions['timestamp'] /= timestamp_divisor

    # Outer merge keeps users that appear in only one of the two domains.
    user_meta = pd.merge(
        _mean_rating_by(train_smm, 'user_id', 'mean_user_rating_smm'),
        _mean_rating_by(train_zvuk, 'user_id', 'mean_user_rating_zvuk'),
        on='user_id',
        how='outer',
    )
    for column in ('mean_user_rating_smm', 'mean_user_rating_zvuk'):
        _fill_with_column_mean(user_meta, column)

    item_meta_smm = _mean_rating_by(train_smm, 'item_id', 'mean_item_rating')
    item_meta_zvuk = _mean_rating_by(train_zvuk, 'item_id', 'mean_item_rating')
    # A per-item mean is only missing when every rating of that item is missing.
    _fill_with_column_mean(item_meta_zvuk, 'mean_item_rating')
    _fill_with_column_mean(item_meta_smm, 'mean_item_rating')

    # In-place sort: the caller's frames end up ordered by the rescaled timestamp.
    train_smm.sort_values(by='timestamp', inplace=True)
    train_zvuk.sort_values(by='timestamp', inplace=True)

    return user_meta, item_meta_zvuk, item_meta_smm

In [4]:
# Load the test tables; only their user_id column is used later, to restrict
# the training data to users that appear in the test set.
test_zvuk, test_smm = map(pd.read_parquet, ("test_zvuk.parquet", "test_smm.parquet"))

In [5]:
# Build the per-user and per-item mean-rating lookup tables.
# Note: process_datasets also modifies train_smm / train_zvuk in place (it rescales
# their 'timestamp' column and sorts them by it), so re-running this cell without
# reloading the training data rescales the timestamps a second time.
user_meta, item_meta_zvuk, item_meta_smm = process_datasets(train_smm, train_zvuk)

In [6]:
# Keep only the training rows of users that also occur in the matching test set.
# The lookup tables built earlier were computed on the full training data.
train_smm = train_smm.loc[train_smm['user_id'].isin(test_smm['user_id'])]
train_zvuk = train_zvuk.loc[train_zvuk['user_id'].isin(test_zvuk['user_id'])]

In [7]:
# XGBRanker's `group` argument lists the sizes of CONSECUTIVE row blocks, one per
# query (here: per user). process_datasets() leaves the frames ordered by timestamp,
# so the rows must first be reordered so that each user's rows are contiguous and
# the users appear in the same (ascending user_id) order as the sizes below.
# A stable sort keeps the timestamp order inside each user. The left merges applied
# to these frames afterwards preserve this row order.
train_smm = train_smm.sort_values('user_id', kind='stable')
train_zvuk = train_zvuk.sort_values('user_id', kind='stable')

# Rows per user, in ascending user_id order (groupby sorts its keys).
groups_smm = train_smm.groupby('user_id', sort=True).size().to_numpy()
groups_zvuk = train_zvuk.groupby('user_id', sort=True).size().to_numpy()

In [8]:
# Attach the user-level and item-level mean ratings to every training row, keep
# only the columns used for training, and give the domain-specific user feature
# the common name 'mean_user_rating'. Left merges keep the existing row order.
train_smm = (
    train_smm
    .merge(user_meta, on='user_id', how='left')
    .merge(item_meta_smm, on='item_id', how='left')
    .loc[:, ["user_id", "mean_user_rating_smm", "mean_item_rating", "rating"]]
    .rename(columns={"mean_user_rating_smm": "mean_user_rating"})
)

train_zvuk = (
    train_zvuk
    .merge(user_meta, on='user_id', how='left')
    .merge(item_meta_zvuk, on='item_id', how='left')
    .loc[:, ["user_id", "mean_user_rating_zvuk", "mean_item_rating", "rating"]]
    .rename(columns={"mean_user_rating_zvuk": "mean_user_rating"})
)

In [9]:
# Both domains use an identically configured pairwise ranker.
ranker_params = dict(
    tree_method='hist',
    booster='gbtree',
    objective='rank:pairwise',
    n_estimators=5,
)
feature_columns = ['mean_user_rating', 'mean_item_rating']

model_smm = xgb.XGBRanker(**ranker_params)
model_zvuk = xgb.XGBRanker(**ranker_params)

# `group` receives the per-user row counts computed in the earlier cell;
# the target is the raw 'rating' column.
model_smm.fit(train_smm[feature_columns], train_smm['rating'], group=groups_smm, verbose=True)
model_zvuk.fit(train_zvuk[feature_columns], train_zvuk['rating'], group=groups_zvuk, verbose=True)

In [None]:
# Load the ALS submissions: each row holds a user_id and that user's list of item ids
# (the list is exploded into one row per item in the next cell).
sub_smm, sub_zvuk = (
    pd.read_parquet(path)
    for path in ("submission_smm_als.parquet", "submission_zvuk_als.parquet")
)

In [11]:
def explode_item_ids(df):
    """Return a copy of ``df`` with one row per element of each 'item_id' list.

    The other columns are repeated for every element and the original index
    labels are kept (so they repeat as well).
    """
    return df.explode('item_id')

# One row per (user, candidate item) for both domains.
sub_smm, sub_zvuk = map(explode_item_ids, (sub_smm, sub_zvuk))

In [12]:
# The training frames are not needed after fitting; drop the references.
del train_smm, train_zvuk

In [13]:
# Attach the same two features used for training to every candidate row.
# Left merges keep the exploded row order, which matters because the per-user
# scores are later paired positionally with the original item lists.
sub_smm = (
    sub_smm
    .merge(user_meta, on='user_id', how='left')
    .merge(item_meta_smm, on='item_id', how='left')
    .loc[:, ["user_id", "mean_user_rating_smm", "mean_item_rating"]]
    .rename(columns={"mean_user_rating_smm": "mean_user_rating"})
)

sub_zvuk = (
    sub_zvuk
    .merge(user_meta, on='user_id', how='left')
    .merge(item_meta_zvuk, on='item_id', how='left')
    .loc[:, ["user_id", "mean_user_rating_zvuk", "mean_item_rating"]]
    .rename(columns={"mean_user_rating_zvuk": "mean_user_rating"})
)

In [14]:
def predict(model, df):
    """Return ``model.predict`` evaluated on every column of ``df`` except 'user_id'.

    The remaining columns are passed in their existing order; a frame without a
    'user_id' column is passed through unchanged.
    """
    is_feature = ~df.columns.isin(['user_id'])
    features = df.loc[:, is_feature]
    return model.predict(features)
  
# Score every candidate row, collecting one array of scores per user (the result is
# a Series indexed by user_id whose values are arrays, in the user's row order).
# The feature columns are selected explicitly after groupby so that apply() never
# operates on the grouping column; this avoids pandas' deprecation warning and
# behaves the same across pandas versions. Column order is preserved.
smm_feature_columns = sub_smm.columns.drop('user_id').tolist()
predictions_smm = (
    sub_smm.groupby('user_id')[smm_feature_columns]
    .apply(lambda x: predict(model_smm, x))
)

zvuk_feature_columns = sub_zvuk.columns.drop('user_id').tolist()
predictions_zvuk = (
    sub_zvuk.groupby('user_id')[zvuk_feature_columns]
    .apply(lambda x: predict(model_zvuk, x))
)


In [15]:
# Turn each per-user Series of score arrays into a two-column table
# (user_id, predictions) with a fresh default index.
df_predictions_zvuk, df_predictions_smm = (
    pd.DataFrame({'user_id': per_user.index, 'predictions': per_user.values})
    for per_user in (predictions_zvuk, predictions_smm)
)

In [None]:
# Re-read the ALS submissions in their original one-row-per-user form; these item
# lists are the ones that get re-ordered by the ranker scores below.
sub_smm_final, sub_zvuk_final = map(
    pd.read_parquet,
    ('submission_smm_als.parquet', 'submission_zvuk_als.parquet'),
)

In [None]:
def _sort_items_by_score(row, score_table):
    """Return ``row['item_id']`` reordered by descending score for ``row['user_id']``.

    ``score_table`` needs the columns 'user_id' and 'predictions'; the first row
    matching the user supplies the scores, which are paired positionally with the
    item ids. Ties keep their original relative order. If the two sequences differ
    in length the extra elements of the longer one are ignored (zip semantics).

    Raises IndexError when the user has no row in ``score_table``.
    NOTE(review): the lookup scans the whole table for every row; a dict or indexed
    Series built once by the caller would be faster for large inputs.
    """
    user_id = row['user_id']
    user_scores = score_table.loc[score_table['user_id'] == user_id, 'predictions'].values
    if len(user_scores) == 0:
        raise IndexError(f"no predictions found for user_id {user_id!r}")

    scored_items = zip(user_scores[0], row['item_id'])
    # Sort on the score only, so item ids are never compared with each other.
    ranked = sorted(scored_items, key=lambda pair: pair[0], reverse=True)
    return [item_id for _, item_id in ranked]


def sort_item_ids_zvuk(row, df=None):
    """Reorder a zvuk row's item list by descending predicted score.

    ``df`` is the score table to look the user up in; when omitted, the notebook
    global ``df_predictions_zvuk`` is used. (It used to be a required but unused
    argument, which made ``DataFrame.apply(sort_item_ids_zvuk, axis=1)`` fail.)
    """
    if df is None:
        df = df_predictions_zvuk
    return _sort_items_by_score(row, df)


def sort_item_ids_smm(row, df=None):
    """Reorder an smm row's item list by descending predicted score.

    ``df`` is the score table to look the user up in; when omitted, the notebook
    global ``df_predictions_smm`` is used.
    """
    if df is None:
        df = df_predictions_smm
    return _sort_items_by_score(row, df)


# Re-rank every user's ALS item list by the ranker scores.
# Note: sub_*_final1 are aliases, so the 'sorted_item_id' column is also added to
# sub_zvuk_final / sub_smm_final themselves.
sub_zvuk_final1 = sub_zvuk_final
# sort_item_ids_zvuk takes the score table as a second positional argument;
# without `args` the row-wise apply fails with a missing-argument TypeError.
sub_zvuk_final1['sorted_item_id'] = sub_zvuk_final.apply(
    sort_item_ids_zvuk, axis=1, args=(df_predictions_zvuk,)
)
result_df_zvuk = sub_zvuk_final1[['user_id', 'sorted_item_id']]

sub_smm_final1 = sub_smm_final
sub_smm_final1['sorted_item_id'] = sub_smm_final.apply(sort_item_ids_smm, axis=1)
result_df_smm = sub_smm_final1[['user_id', 'sorted_item_id']]

In [18]:
# The re-ranked list becomes the 'item_id' column of the final tables.
final_column_names = {"sorted_item_id": "item_id"}
result_df_zvuk = result_df_zvuk.rename(columns=final_column_names)
result_df_smm = result_df_smm.rename(columns=final_column_names)

In [19]:
# Write the final submissions (smm first, then zvuk) without the DataFrame index.
for result_frame, output_path in (
    (result_df_smm, 'submission_smm.parquet'),
    (result_df_zvuk, 'submission_zvuk.parquet'),
):
    result_frame.to_parquet(output_path, index=False)