# Learning to rank with XGB

## Import Libraries

In [91]:
import pandas as pd
import numpy as np
from xgboost import XGBRanker
from datetime import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

## Data Formatting

### Import Data

In [92]:
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_ratings['timestamp'] = df_ratings['timestamp'].map(lambda x: datetime.fromtimestamp(x))

### Split Data
The data will be split considering time where most recent data will be using for testing and the least recent will beb used for training. 

In [93]:
start = min(df_ratings['timestamp'])
end = max(df_ratings['timestamp'])
interval = end - start

In [94]:
df_ratings['rating'] = df_ratings['rating'].apply(lambda x:int(np.ceil(x)))
train = df_ratings[df_ratings['timestamp'] <= (end - interval/3)]
test = df_ratings[df_ratings['timestamp'] >= (start + interval/3)]


In [95]:
train_y = train[train['timestamp'] >= (start + interval/3)]
train_X = train[train['timestamp'] < (start + interval/3)]
train_tgt_user = set(train_X['userId']) & set(train_y['userId'])

In [96]:

test_y = test[test['timestamp'] >= (end - interval/3)]
test_X = test[test['timestamp'] < (end - interval/3)]
test_tgt_user = set(test_X['userId']) & set(test_y['userId'])

#### Create user-level features

In [97]:
def get_feature_by_user(df):
    """Return a dataset with consolidated metrics for each movie they being :
        - Total registers by score
        - Total registers by day of the week"""
    res = list()
    for i, v in tqdm(df.groupby('userId')):
        res.append(
            (
                i,
                len(v['movieId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()

            )
        )
    
    res = pd.DataFrame(
        res,
        columns=[
            'userId', 'revired_products', '5_star_ratings_gave', '4_star_ratings_gave',
            '3_star_ratings_gave', '2_star_ratings_gave', '1_star_ratings_gave',
            'monday_review_count_user', 'tuesday_review_count_user', 'wednesday_review_count_user', 'thursday_review_count_user',
            'friday_review_count_user', 'saturday_review_count_user', 'sunday_review_count_user','evening_reviews_by_user'
        ])
    return res

#### Create movie-level features

In [98]:
def get_feature_by_product(df):
    """Return a dataset with consolidated metrics for each movie they being :
        - Total registers by score
        - Total registers by day of the week"""
    res = list()
    for i, v in tqdm(df.groupby('movieId')):
        res.append(
            (
                i,
                len(v['userId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()
            )
        )
    
    res = pd.DataFrame(
        res,
        columns=[
            'movieId', 'user_count', '1_star_ratings_recieved', '2_star_ratings_recieved',
            '3_star_ratings_recieved', '4_star_ratings_recieved', '5_star_ratings_recieved',
            'monday_review_count_item', 'tuesday_review_count_item', 'wednesday_review_count_item', 'thursday_review_count_item',
            'friday_review_count_item', 'saturday_review_count_item', 'sunday_review_count_item','evening_reviews_by_movie'
        ])
    return res

#### Generate movie and user level features for training

In [99]:
train_X_user = get_feature_by_user(train_X)
train_X_product = get_feature_by_product(train_X)

100%|██████████| 261/261 [00:00<00:00, 539.05it/s]
100%|██████████| 3758/3758 [00:07<00:00, 472.13it/s]


In [100]:
test_X_user = get_feature_by_user(test_X)
test_X_product = get_feature_by_product(test_X)

100%|██████████| 161/161 [00:00<00:00, 523.18it/s]
100%|██████████| 5418/5418 [00:11<00:00, 464.31it/s]


#### Generate Model Input based on generated features

In [101]:
def get_model_input(X_u, X_m, y, tgt_users):

    merged = pd.merge(X_u, y, on=['userId'], how='inner')
    merged = pd.merge(X_m, merged, on=['movieId'], how='outer')
    merged = merged.query('userId in @tgt_users')

    merged.fillna(0, inplace=True)
    features_cols = list(merged.drop(columns=['userId', 'movieId', 'rating', 'timestamp']).columns)

    query_list = merged['userId'].value_counts()

    merged = merged.set_index(['userId', 'movieId'])

    query_list = query_list.sort_index()

    merged.sort_index(inplace=True)

    df_x = merged[features_cols]

    df_y = merged['rating']
    
    return df_x, df_y, query_list

In [102]:
X_train, y_train, query_list_train = get_model_input(train_X_user, train_X_product, train_y, train_tgt_user)
X_test, y_test, query_list_test = get_model_input(test_X_user, test_X_product, test_y, test_tgt_user)

In [103]:
xgb_rkr = XGBRanker(objective='rank:ndcg', n_estimators=100, random_state=0,learning_rate=0.1)

In [104]:
xgb_rkr.fit(
    X_train,
    y_train,
    group=query_list_train,
    eval_metric='ndcg',
    eval_set=[(X_test, y_test)],
    eval_group=[list(query_list_test)],
    verbose =True
)

[0]	validation_0-ndcg:0.72758
[1]	validation_0-ndcg:0.72499
[2]	validation_0-ndcg:0.71783
[3]	validation_0-ndcg:0.71133
[4]	validation_0-ndcg:0.70967
[5]	validation_0-ndcg:0.71229
[6]	validation_0-ndcg:0.71199
[7]	validation_0-ndcg:0.71154
[8]	validation_0-ndcg:0.70980
[9]	validation_0-ndcg:0.70963
[10]	validation_0-ndcg:0.71877
[11]	validation_0-ndcg:0.72255
[12]	validation_0-ndcg:0.72148
[13]	validation_0-ndcg:0.72261
[14]	validation_0-ndcg:0.71671
[15]	validation_0-ndcg:0.71817
[16]	validation_0-ndcg:0.71959
[17]	validation_0-ndcg:0.71686
[18]	validation_0-ndcg:0.72419
[19]	validation_0-ndcg:0.72063
[20]	validation_0-ndcg:0.72228
[21]	validation_0-ndcg:0.72155
[22]	validation_0-ndcg:0.72104
[23]	validation_0-ndcg:0.72179
[24]	validation_0-ndcg:0.71951
[25]	validation_0-ndcg:0.72021
[26]	validation_0-ndcg:0.71810
[27]	validation_0-ndcg:0.71836
[28]	validation_0-ndcg:0.71702
[29]	validation_0-ndcg:0.71761
[30]	validation_0-ndcg:0.71988
[31]	validation_0-ndcg:0.71761
[32]	validation_0-

### Predict Ranking for user 
Given a list of movies for test rank them for their relative  user trained in the model

In [105]:
def predict_at_k(data, model, k):
    user_ids = list()
    product_ids = list()
    ranks = list()
    
    for userId, df in data.groupby('userId'):
        
        pred = model.predict(df.loc[userId])
        productId = np.array(df.reset_index()['movieId'])
        topK_index = np.argsort(pred)[::-1][:k]
        product_ids.extend(list(productId[topK_index]))
        user_ids.extend([userId]*len(topK_index))
        ranks.extend(list(range(1, len(topK_index)+1)))

    results = pd.DataFrame({'userId': user_ids, 'movieId': product_ids, 'rank': ranks})
    
    return results

In [106]:
predicted = predict_at_k(X_test, xgb_rkr, 5)
print(predicted.head(20))

    userId  movieId  rank
0     15.0     5952     1
1     15.0     4720     2
2     15.0      356     3
3     15.0     2858     4
4     15.0     2329     5
5     68.0    54259     1
6     68.0      527     2
7     68.0      141     3
8     68.0    79132     4
9     68.0    78499     5
10   104.0    56367     1
11   104.0    37729     2
12   104.0    54259     3
13   104.0    76251     4
14   104.0     6773     5
15   132.0      593     1
16   132.0    48516     2
17   132.0     3275     3
18   132.0    63082     4
19   132.0     4973     5
