In [None]:
!pip install transformers4rec[pytorch,nvtabular]==0.1.15
!pip install -U nvtabular==1.3.3

In [2]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.
    Parameters
        ----------
        epsilon : float, optional, default 1e-4
            A small amount added to the lower bound or subtracted
            from the upper bound. This value prevents infinite number
            from occurring when applying the inverse error function.
        copy : boolean, optional, default True
            If False, try to avoid a copy and do inplace scaling instead.
            This is not guaranteed to always work inplace; e.g. if the data is
            not a NumPy array, a copy may still be returned.
        n_jobs : int or None, optional, default None
            Number of jobs to run in parallel.
            ``None`` means 1 and ``-1`` means using all processors.
        interp_kind : str or int, optional, default 'linear'
           Specifies the kind of interpolation as a string
            ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
            'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
            refer to a spline interpolation of zeroth, first, second or third
            order; 'previous' and 'next' simply return the previous or next value
            of the point) or as an integer specifying the order of the spline
            interpolator to use.
        interp_copy : bool, optional, default False
            If True, the interpolation function makes internal copies of x and y.
            If False, references to `x` and `y` are used.
        Attributes
        ----------
        interp_func_ : list
            The interpolation function for each feature in the training set.
        """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        x = self.drop_duplicates(x)
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        return erfinv(self.interp_func_[i](x))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [3]:
import os
import gc 
import datetime
import argparse
import pandas as pd
import numpy as np
import glob
import pytz
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
DATA_PATH = '/kaggle/input/news-portal-user-interactions-by-globocom/'

In [7]:
import shutil
shutil.rmtree("/kaggle/working/clicks_preprocessed")

In [8]:
OUTPUT_PATH = '/kaggle/working/clicks_preprocessed'
!mkdir -p $OUTPUT_PATH 

In [9]:
articles_metadata_df = pd.read_csv(os.path.join(DATA_PATH, 'articles_metadata.csv'))

In [10]:
clicks_dfs_list = []
for idx, clicks_file_path in enumerate(sorted(glob.glob(f"{DATA_PATH}clicks/clicks/*"))): 
    clicks_hour_df = pd.read_csv(clicks_file_path)
    clicks_hour_df['hour_index'] = idx
    clicks_dfs_list.append(clicks_hour_df)

In [11]:
clicks_merged_df = pd.concat(clicks_dfs_list, axis=0)
clicks_merged_df.head()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,hour_index
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2,0
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2,0
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2,0
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2,0
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2,0


In [12]:
def extract_local_hour_weekday(timestamp_in_utc, local_tz):
    dt = pytz.utc.localize(datetime.datetime.utcfromtimestamp(timestamp_in_utc)).astimezone(pytz.timezone(local_tz))
    return dt.hour + (dt.minute/60.0), dt.weekday()

In [13]:
def get_cicled_feature_value(value, max_value):
    value_scaled = (value + 0.000001) / max_value
    value_sin = np.sin(2*np.pi*value_scaled)
    
    value_cos = np.cos(2*np.pi*value_scaled)
    return value_sin, value_cos

In [14]:
def group_sessions(clicks_hour_df):
    def to_list(series):
        return list(series)

    clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)
    sessions_by_hour_df = clicks_hour_df.groupby('session_id').agg({'user_id': min,
                                                                    'session_start': min,
                                                                    'session_size': min,
                                                                    'click_article_id': to_list,
                                                                    'click_timestamp': to_list,
                                                                    'click_environment': to_list,
                                                                    'click_deviceGroup': to_list,
                                                                    'click_os': to_list,
                                                                    'click_country': to_list,
                                                                    'click_region': to_list,
                                                                    'click_referrer_type': to_list,
                                                                    'hour_sin': to_list,
                                                                    'hour_cos': to_list,
                                                                    'weekday_sin': to_list,
                                                                    'weekday_cos': to_list,
                                                                    'item_age_hours': to_list,
                                                                    'item_age_hours_norm': to_list,
                                                                    }
                                                                   ).reset_index()
    return sessions_by_hour_df

In [15]:
def get_time_features(timestamp, default_timezone='America/Sao_Paulo'):

    local_hour, local_weekday = extract_local_hour_weekday(int(timestamp)//1000, default_timezone)        
    
    local_hour_sin, local_hour_cos = get_cicled_feature_value(local_hour, 24)
    
    local_weekday_sin, local_weekday_cos = get_cicled_feature_value(local_weekday+1, 7)
    
    return local_hour_sin, local_hour_cos, local_weekday_sin, local_weekday_cos

In [16]:
def preprocess_interactions(clicks_df):    
    clicks_df = clicks_df.merge(articles_metadata_df, 
                                          left_on='click_article_id', right_on='article_id')
    clicks_df[['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos']] = \
            list(clicks_df['click_timestamp'].apply(lambda ts: get_time_features(ts)).values)
    
    clicks_df['item_age_hours'] = (clicks_df['click_timestamp'] - clicks_df['created_at_ts']) / (1000*60*60)
    
    scaler = GaussRankScaler()
    clicks_df['item_age_hours_norm'] = scaler.fit_transform(clicks_df[['item_age_hours']])
    return clicks_df

In [17]:
def prepare_sessions(clicks_hour_df):    
    sessions_df = group_sessions(clicks_hour_df)
    return sessions_df

In [18]:
clicks_merged_df = preprocess_interactions(clicks_merged_df)
clicks_merged_df.head()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,...,category_id,created_at_ts,publisher_id,words_count,hour_sin,hour_cos,weekday_sin,weekday_cos,item_age_hours,item_age_hours_norm
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,...,281,1506800518000,0,280,2.617994e-07,1.0,8.975979e-07,1.0,7.308339,-0.097621
1,20,1506825727279757,1506825727000,2,157541,1506836548634,4,1,17,1,...,281,1506800518000,0,280,0.6494482,0.760406,8.975979e-07,1.0,10.008509,0.177573
2,44,1506826139185781,1506826139000,5,157541,1506857278141,4,1,17,1,...,281,1506800518000,0,280,0.8012537,-0.598325,8.975979e-07,1.0,15.766706,0.480972
3,45,1506826142324782,1506826142000,2,157541,1506827309970,4,1,17,1,...,281,1506800518000,0,280,0.03489976,0.999391,8.975979e-07,1.0,7.442214,-0.079184
4,76,1506826463226813,1506826463000,2,157541,1506828823469,4,3,2,1,...,281,1506800518000,0,280,0.1434929,0.989651,8.975979e-07,1.0,7.86263,-0.024706


In [19]:
#hour_index_min, hour_index_max = clicks_merged_df['hour_index'].min(), clicks_merged_df['hour_index'].max()
hour_index_min, hour_index_max = 336,384
print(hour_index_min, hour_index_max)

336 384


In [21]:
counter = 1
for hour_index in range(hour_index_min, hour_index_max+1):
    clicks_hour_df = clicks_merged_df[clicks_merged_df['hour_index'] == hour_index]
    sessions_df = prepare_sessions(clicks_hour_df)
    
    if len(sessions_df) < 100:
        print("Ignoring this hour file, because has only {} sessions".format(len(sessions_df)))
        continue
    
    print(f"{hour_index}->{counter}", "# sessions: {}".format(len(sessions_df)))
    
    hour_files_path = os.path.join(OUTPUT_PATH, f"{counter:04}")
    os.makedirs(hour_files_path, exist_ok=True)
    

    train_df = sessions_df
    sessions_df.to_parquet(os.path.join(hour_files_path, "train.parquet"))    
    valid_df, test_df = train_test_split(train_df, test_size=0.8, shuffle=True)
    valid_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "valid.parquet"))    
    test_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "test.parquet")) 
    
    counter += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


336->1 # sessions: 851


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


337->2 # sessions: 505


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


338->3 # sessions: 282
339->4 # sessions: 196


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


340->5 # sessions: 180


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


341->6 # sessions: 329


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


342->7 # sessions: 626


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


343->8 # sessions: 1050


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


344->9 # sessions: 1555


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


345->10 # sessions: 1253


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


346->11 # sessions: 1218


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


347->12 # sessions: 1198


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


348->13 # sessions: 1126


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


349->14 # sessions: 1009


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


350->15 # sessions: 1345


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


351->16 # sessions: 2000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


352->17 # sessions: 2199


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


353->18 # sessions: 3052


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


354->19 # sessions: 3731


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


355->20 # sessions: 3303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


356->21 # sessions: 2692


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


357->22 # sessions: 2533


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


358->23 # sessions: 2007


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


359->24 # sessions: 1448


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


360->25 # sessions: 969


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


361->26 # sessions: 510


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


362->27 # sessions: 321


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


363->28 # sessions: 277


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


364->29 # sessions: 436


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


365->30 # sessions: 1027


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


366->31 # sessions: 2441


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


367->32 # sessions: 4550


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


368->33 # sessions: 5536


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


369->34 # sessions: 4784


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


370->35 # sessions: 4315


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


371->36 # sessions: 3997


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


372->37 # sessions: 5204


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


373->38 # sessions: 4648


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


374->39 # sessions: 3941


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


375->40 # sessions: 4003


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


376->41 # sessions: 3363


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


377->42 # sessions: 3108


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


378->43 # sessions: 2499


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


379->44 # sessions: 2305


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


380->45 # sessions: 2417


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


381->46 # sessions: 2717


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


382->47 # sessions: 2292


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


383->48 # sessions: 1397


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clicks_hour_df.sort_values(['session_start', 'click_timestamp'], inplace=True)


384->49 # sessions: 954


In [22]:
train_df.head()

Unnamed: 0,session_id,user_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,hour_sin,hour_cos,weekday_sin,weekday_cos,item_age_hours,item_age_hours_norm
0,1508207781378377,170835,1508207781000,2,"[209122, 289003]","[1508207781268, 1508207811268]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[25, 25]","[2, 2]","[0.15643472361642843, 0.15643472361642843]","[0.9876882996406566, 0.9876882996406566]","[0.9749277124471077, 0.9749277124471077]","[-0.22252180904947244, -0.22252180904947244]","[6.373407777777778, 20.439241111111112]","[-0.24979775050847963, 0.6657481651972144]"
1,1508207782230378,306892,1508207782000,3,"[50644, 211442, 16346]","[1508208270611, 1508208475029, 1508208505029]","[4, 4, 4]","[4, 4, 4]","[20, 20, 20]","[1, 1, 1]","[25, 25, 25]","[2, 2, 2]","[0.19080925236593393, 0.20364200745367583, 0.2...","[0.9816271334939521, 0.9790454191712645, 0.978...","[0.9749277124471077, 0.9749277124471077, 0.974...","[-0.22252180904947244, -0.22252180904947244, -...","[7.208780833333333, 5.765285833333333, 7.40917...","[-0.11215521852174538, -0.36901252742988516, -..."
2,1508207787174379,150849,1508207787000,2,"[209122, 205824]","[1508207787335, 1508207817335]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[25, 25]","[2, 2]","[0.15643472361642843, 0.15643472361642843]","[0.9876882996406566, 0.9876882996406566]","[0.9749277124471077, 0.9749277124471077]","[-0.22252180904947244, -0.22252180904947244]","[6.375093055555555, 6.038981944444444]","[-0.24947708306273586, -0.3129034117627942]"
3,1508207796235380,70656,1508207796000,2,"[50644, 211926]","[1508207872164, 1508207902164]","[4, 4]","[3, 3]","[20, 20]","[10, 10]","[28, 28]","[1, 1]","[0.16074282399886514, 0.16504786406963867]","[0.9869963244778927, 0.9862855583278354]","[0.9749277124471077, 0.9749277124471077]","[-0.22252180904947244, -0.22252180904947244]","[7.098101111111111, 135.4458788888889]","[-0.12870884430748442, 1.3652579204054625]"
4,1508207799183381,115448,1508207799000,4,"[50644, 36162, 211442, 156279]","[1508207871981, 1508207886297, 1508207935368, ...","[4, 4, 4, 4]","[1, 1, 1, 1]","[17, 17, 17, 17]","[1, 1, 1, 1]","[25, 25, 25, 25]","[2, 2, 2, 2]","[0.16074282399886514, 0.16504786406963867, 0.1...","[0.9869963244778927, 0.9862855583278354, 0.986...","[0.9749277124471077, 0.9749277124471077, 0.974...","[-0.22252180904947244, -0.22252180904947244, -...","[7.098050277777777, 8.532026944444445, 5.61538...","[-0.1287187457845534, 0.0488869837504763, -0.4..."


In [23]:
del train_df,test_df,valid_df,clicks_merged_df
gc.collect()

0

In [None]:
import os
import glob

import torch 
import transformers4rec.torch as tr

from transformers4rec.torch.ranking_metric import NDCGAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory 

In [None]:
from merlin_standard_lib import Schema

SCHEMA_PATH = '/input/g1schema/g1_schema.pbtxt'

x_cat_names = ['click_article_id','click_environment','click_deviceGroup','click_os','click_region','click_country']
x_cont_names = ['item_age_hours_norm','hour_sin','hour_cos','weekday_sin']

schema = Schema().from_proto_text(SCHEMA_PATH)
schema = schema.select_by_name(x_cat_names + x_cont_names)

In [23]:
!cat $SCHEMA_PATH

feature {
  name: "click_article_id"
  value_count {
    min: 2
    max: 20
  }
  type: INT
  int_domain {
    name: "click_article_id"
    min: 1
    max: 365000
    is_categorical: true
  }
  annotation {
    tag: "item_id"
    tag: "list"
    tag: "categorical"
    tag: "item"
  }
}

feature {
  name: "click_environment"
  value_count {
    min: 2
    max: 20
  }
  type: INT
  int_domain {
    name: "click_environment"
    min: 1
    max: 5
    is_categorical: true
  }
  annotation {
    tag: "list"
    tag: "categorical"
  }
}

feature {
  name: "click_deviceGroup"
  value_count {
    min: 2
    max: 20
  }
  type: INT
  int_domain {
    name: "click_deviceGroup"
    min: 1
    max: 6
    is_categorical: true
  }
  annotation {
    tag: "list"
    tag: "categorical"
  }
}

feature {
  name: "click_os"
  value_count {
    min: 2
    max: 20
  }
  type: INT
  int_domain {
    name: "click_os"
    min: 1
    max: 21
    is_categorical: true
  }
  annotation {
    tag: "list"
    tag: 

In [24]:
sequence_length, d_model = 20, 192

inputs= tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=sequence_length,
    aggregation="concat",
    continuous_projection=64,
    d_output=d_model,
    masking="clm",
)

In [25]:
transformer_config = tr.XLNetConfig.build(
    d_model=d_model, n_head=8, n_layer=4, total_seq_length=sequence_length
)

body = tr.SequentialBlock(
    inputs, tr.MLPBlock([192]), tr.TransformerBlock(transformer_config, masking=inputs.masking))


head = tr.Head(
    body,
    tr.NextItemPredictionTask(weight_tying=True, hf_format=True, 
                                     metrics=[NDCGAt(top_ks=[10, 20], labels_onehot=True),  
                                              RecallAt(top_ks=[10, 20], labels_onehot=True)]),
)


model = tr.Model(head)

In [26]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
from transformers4rec.torch.utils.examples_utils import wipe_memory

#Set arguments for training 
training_args = T4RecTrainingArguments(
            output_dir="./tmp",
            max_sequence_length=124,
            data_loader_engine='nvtabular',
            num_train_epochs=10, 
            dataloader_drop_last=False,
            per_device_train_batch_size = 384,
            per_device_eval_batch_size = 128,
            gradient_accumulation_steps = 1,
            learning_rate=0.000666,
            report_to = [],
            logging_steps=200,
)
#lr = 0.000666,0.0001426544717

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    schema=schema,
    compute_metrics=True,
)

In [28]:
OUTPUT_DIR = '/kaggle/working/clicks_preprocessed'

In [29]:
%%time
train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"*/train.parquet"))
eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"*/valid.parquet"))
print('*'*20)
print('*'*20 + '\n')
trainer.train_dataset_or_path = train_paths
trainer.reset_lr_scheduler()
trainer.train()
trainer.state.global_step +=1
trainer.eval_dataset_or_path = eval_paths
train_metrics = trainer.evaluate(metric_key_prefix='eval')
print('*'*20)
print('\n' + '*'*20 + '\n')
for key in sorted(train_metrics.keys()):
    print(" %s = %s" % (key, str(train_metrics[key]))) 
wipe_memory()

********************
********************



***** Running training *****
  Num examples = 104064
  Num Epochs = 10
  Instantaneous batch size per device = 384
  Total train batch size (w. parallel, distributed & accumulation) = 384
  Gradient Accumulation steps = 1
  Total optimization steps = 2710


Step,Training Loss
200,6.7919
400,5.7117
600,5.1864
800,4.7521
1000,4.7481
1200,4.7319
1400,4.5975
1600,4.2401
1800,4.4065
2000,4.3894


Saving model checkpoint to ./tmp/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./tmp/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./tmp/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./tmp/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./tmp/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




********************

********************

 eval_/loss = 6.680943489074707
 eval_/next-item/ndcg_at_10 = 0.11256806552410126
 eval_/next-item/ndcg_at_20 = 0.13579794764518738
 eval_/next-item/recall_at_10 = 0.21644708514213562
 eval_/next-item/recall_at_20 = 0.3078519403934479
 eval_runtime = 22.7558
 eval_samples_per_second = 911.239
 eval_steps_per_second = 7.119
CPU times: user 31min 19s, sys: 3min 6s, total: 34min 26s
Wall time: 27min 27s


In [30]:
gc.collect()

23

In [31]:
test_clicks_dfs_list = []
for idx, clicks_file_path in enumerate(sorted(glob.glob(f"/kaggle/working/clicks_preprocessed/*/test.parquet"))): 
    clicks_hour_df = pd.read_parquet(clicks_file_path)
    #clicks_hour_df['hour_index'] = idx
    test_clicks_dfs_list.append(clicks_hour_df)

In [32]:
test_clicks_merged_df = pd.concat(test_clicks_dfs_list, axis=0)

In [33]:
test_clicks_merged_df = test_clicks_merged_df.sample(frac=1)
test_clicks_merged_df.to_parquet('/kaggle/working/test.parquet')

In [34]:
trainer.eval_dataset_or_path = '/kaggle/working/test.parquet'

In [35]:
test_dataloader = trainer.get_eval_dataloader()
for batch in test_dataloader:
    response = model(batch)['predictions']
    print(response)
    break

tensor([[-45.4874, -49.4355, -46.8253,  ..., -48.3618, -47.0582, -48.7985],
        [-24.6358, -25.5376, -24.1052,  ..., -24.5005, -24.6210, -24.4817],
        [-22.4441, -22.6622, -21.3095,  ..., -22.2857, -21.5335, -21.7399],
        ...,
        [-17.8239, -18.3301, -18.4281,  ..., -18.1598, -18.5321, -17.6127],
        [-34.3071, -35.4470, -32.4341,  ..., -34.7184, -33.3574, -33.7431],
        [-22.7512, -23.2058, -22.3849,  ..., -22.6659, -22.1858, -21.5739]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>)


In [36]:
filtered_batch = test_clicks_merged_df[:10]
filtered_batch

Unnamed: 0,session_id,user_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,hour_sin,hour_cos,weekday_sin,weekday_cos,item_age_hours,item_age_hours_norm
1630,1508165819267112,226369,1508165819000,3,"[289003, 277107, 39894]","[1508165819233, 1508167986061, 1508168016061]","[4, 4, 4]","[3, 3, 3]","[20, 20, 20]","[1, 1, 1]","[25, 25, 25]","[2, 2, 2]","[-0.24192214962248648, -0.3947440969232297, -0...","[-0.970295662940959, -0.9187911068051671, -0.9...","[0.7818320421108523, 0.7818320421108523, 0.781...","[0.6234891000881848, 0.6234891000881848, 0.623...","[8.774786944444445, 3.9536280555555554, 4.2525...","[0.07326382451882212, -0.9210975514045818, -0...."
1701,1508145780357562,126888,1508145780000,2,"[218028, 157478]","[1508146296318, 1508146326318]","[4, 4]","[1, 1]","[17, 17]","[1, 1]","[13, 13]","[2, 2]","[0.9222008704297272, 0.9205047511592386]","[-0.3867112030684572, -0.39073136947686743]","[0.7818320421108523, 0.7818320421108523]","[0.6234891000881848, 0.6234891000881848]","[16.228977222222223, 5.517866111111111]","[0.4999756105074418, -0.42242618525846426]"
1968,1508162779263454,125570,1508162779000,3,"[166322, 108858, 354904]","[1508163323283, 1508171094659, 1508171124659]","[4, 4, 4]","[1, 1, 1]","[12, 12, 12]","[1, 1, 1]","[21, 21, 21]","[6, 6, 6]","[-0.06540339046899554, -0.5877854640926065, -0...","[-0.9978589061160702, -0.8090168404931007, -0....","[0.7818320421108523, 0.7818320421108523, 0.781...","[0.6234891000881848, 0.6234891000881848, 0.623...","[6.855356388888889, 43.07407194444445, 28.4232...","[-0.16682563907747355, 1.1752041605448809, 1.0..."
3679,1508170963309366,98989,1508170963000,2,"[352979, 39894]","[1508171031554, 1508171061554]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[8, 8]","[2, 2]","[-0.5842498781069859, -0.5877854640926065]","[-0.8115738290087797, -0.8090168404931007]","[0.7818320421108523, 0.7818320421108523]","[0.6234891000881848, 0.6234891000881848]","[4.791820555555556, 5.098487222222222]","[-0.6091251458128961, -0.5238779686257783]"
3389,1508103055324995,267195,1508103055000,2,"[202355, 202559]","[1508103065295, 1508103095295]","[4, 4]","[1, 1]","[13, 13]","[9, 9]","[28, 28]","[2, 2]","[-0.9222008704297273, -0.9222008704297273]","[0.38671120306845685, 0.38671120306845685]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[3.7489708333333334, 4.949248611111111]","[-1.0280869276405846, -0.5640778066054596]"
3,1508063792245656,200,1508063792000,7,"[237822, 108858, 355162, 355170, 355165, 35516...","[1508064804744, 1508065358987, 1508065400338, ...","[4, 4, 4, 4, 4, 4, 4]","[1, 1, 1, 1, 1, 1, 1]","[17, 17, 17, 17, 17, 17, 17]","[1, 1, 1, 1, 1, 1, 1]","[24, 24, 24, 24, 24, 24, 24]","[2, 1, 1, 1, 1, 1, 1]","[0.7283707905018881, 0.7009090775712963, 0.697...","[-0.6851831810134095, -0.7132506326517731, -0....","[8.975979006501141e-07, 8.975979006501141e-07,...","[0.9999999999995972, 0.9999999999995972, 0.999...","[4.570762222222222, 13.703051944444445, 17.753...","[-0.6777023118494316, 0.38788113774749405, 0.5..."
883,1508142645428716,314596,1508142645000,2,"[209236, 202355]","[1508143068538, 1508143098538]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[25, 25]","[2, 2]","[0.9869963244778928, 0.9862855583278355]","[-0.16074282399886514, -0.16504786406963842]","[0.7818320421108523, 0.7818320421108523]","[0.6234891000881848, 0.6234891000881848]","[17.765705, 14.869316111111111]","[0.5630805404396134, 0.44235858534812633]"
216,1508179197784495,225744,1508179197000,2,"[277107, 352979]","[1508180038637, 1508180068637]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[7, 7]","[2, 2]","[-0.9575714362543881, -0.9588198092232034]","[-0.28819601744248347, -0.2840150936854936]","[0.7818320421108523, 0.7818320421108523]","[0.6234891000881848, 0.6234891000881848]","[7.301565833333333, 7.302121388888889]","[-0.0985728409221699, -0.09848802443792994]"
1008,1508108311914648,135360,1508108311000,2,"[202355, 209236]","[1508108360936, 1508108390936]","[4, 4]","[3, 3]","[2, 2]","[1, 1]","[11, 11]","[2, 2]","[-0.7101851913126367, -0.7101851913126367]","[0.7040149103820412, 0.7040149103820412]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[5.219982222222222, 8.133037777777778]","[-0.4927820552052942, 0.0066895600246576325]"
4796,1508153276257648,316016,1508153276000,2,"[157478, 352979]","[1508155521868, 1508155551868]","[4, 4]","[1, 1]","[17, 17]","[1, 1]","[25, 25]","[2, 2]","[0.4809885393928431, 0.4809885393928431]","[-0.876726881630043, -0.876726881630043]","[0.7818320421108523, 0.7818320421108523]","[0.6234891000881848, 0.6234891000881848]","[8.072185555555556, 0.4919077777777778]","[-0.00010456193813279541, -2.092482804524135]"


In [37]:
session_col = "session_id"
top_k = 5
sessions = filtered_batch[session_col].drop_duplicates().values
predictions = response.cpu().detach().numpy()
top_preds = np.argpartition(predictions, -top_k, axis=1)[:, -top_k:]
for session, next_items in zip(sessions, top_preds):
        print(
            "- Top-%s predictions for session `%s`: %s\n"
            % (top_k, session, " || ".join([str(e) for e in next_items]))
        )

- Top-5 predictions for session `1508165819267112`: 211442 || 283009 || 36162 || 50644 || 277107

- Top-5 predictions for session `1508145780357562`: 206415 || 36162 || 283009 || 158772 || 277107

- Top-5 predictions for session `1508162779263454`: 289003 || 336254 || 283009 || 331116 || 277107

- Top-5 predictions for session `1508170963309366`: 284154 || 337143 || 199474 || 97616 || 211455

- Top-5 predictions for session `1508103055324995`: 36136 || 62465 || 73506 || 119189 || 362914

- Top-5 predictions for session `1508063792245656`: 283009 || 206415 || 50644 || 36162 || 277107

- Top-5 predictions for session `1508142645428716`: 235105 || 289003 || 336254 || 277107 || 283009

- Top-5 predictions for session `1508179197784495`: 183562 || 184133 || 353415 || 342096 || 211455

- Top-5 predictions for session `1508108311914648`: 284154 || 30895 || 97616 || 299131 || 211455

- Top-5 predictions for session `1508153276257648`: 140357 || 312276 || 95982 || 226563 || 235808

