In [None]:
import os

import pandas as pd
from pandas import read_parquet
from tqdm import tqdm

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
# The preview cells below (e.g. `.head()` followed by `.shape`) rely on this setting.
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Root directory for every path used below: inputs are read from
# `{base_path}/orig_data/...` and outputs are written to `{base_path}/raw_data/...`.
base_path = ".."

## Recommendation (rec) data

In [None]:
# Load the recommendation-train parquet shard, then preview its first rows and its shape.
rec_train_session = read_parquet(f"{base_path}/orig_data/recommendation_train/train-00000-of-00001.parquet")
rec_train_session.head()
rec_train_session.shape

In [None]:
# Count how many rows share each `request_idx` value.
rec_train_session['request_idx'].value_counts()

In [None]:
# Look at the result-details entry of the first row to see its structure
# (the flattening loop below iterates over it and reads note_idx / click / request_timestamp).
rec_train_session['rec_result_details_with_idx'].iloc[0]

In [None]:
# Flatten the nested result lists: every element of `rec_result_details_with_idx`
# becomes one output row that carries the row-level ids (user / request / session)
# next to the per-result fields (note, click label, request timestamp).
rec_train_data = []

# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for idx, row in tqdm(rec_train_session.iterrows(), total=len(rec_train_session)):
    for rec_result in row['rec_result_details_with_idx']:
        rec_train_data.append({
            'user_idx': row['user_idx'],
            'request_idx': row['request_idx'],
            'session_idx': row['session_idx'],
            'note_idx': rec_result['note_idx'],
            'click': rec_result['click'],
            'request_timestamp': rec_result['request_timestamp']
        })

# Build the frame once from the collected records, then preview it.
rec_train = pd.DataFrame(rec_train_data)
rec_train.head(1)
rec_train.shape

In [None]:
# Load the recommendation-test parquet shard, then preview its first rows and its shape.
rec_test_session = read_parquet(f"{base_path}/orig_data/recommendation_test/train-00000-of-00001.parquet")
rec_test_session.head()
rec_test_session.shape

In [None]:
# Flatten the nested result lists of the test split: every element of
# `rec_result_details_with_idx` becomes one output row that carries the row-level
# ids (user / request / session) next to the per-result fields.
rec_test_data = []

# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for idx, row in tqdm(rec_test_session.iterrows(), total=len(rec_test_session)):
    for rec_result in row['rec_result_details_with_idx']:
        rec_test_data.append({
            'user_idx': row['user_idx'],
            'request_idx': row['request_idx'],
            'session_idx': row['session_idx'],
            'note_idx': rec_result['note_idx'],
            'click': rec_result['click'],
            'request_timestamp': rec_result['request_timestamp']
        })

# Build the frame once from the collected records, then preview it.
rec_test = pd.DataFrame(rec_test_data)
rec_test.head(1)
rec_test.shape

In [None]:
# Tag each row with the split it came from (1 = original train, 0 = original test).
rec_train['orig_train'] = 1
rec_test['orig_train'] = 0

# Old -> new column names for the merged recommendation table.
rec_column_names = {
    'user_idx': 'user_id',
    'note_idx': 'item_id',
    'request_timestamp': 'timestamp',
    'request_idx': 'request_id',
    'session_idx': 'session_id',
}

# Stack both splits, order rows by user and request time, renumber, then rename.
rec_data = (
    pd.concat([rec_train, rec_test], axis=0)
    .sort_values(by=['user_idx', 'request_timestamp'])
    .reset_index(drop=True)
    .rename(columns=rec_column_names)
)
rec_data.head(1)
rec_data.shape

In [None]:
# Number of rows per `session_id`.
rec_data['session_id'].value_counts()

In [None]:
# Spot-check every row of a single session.
# NOTE(review): 58710 is a hard-coded id, presumably picked from the counts above —
# confirm it still exists if the input data changes.
rec_data[rec_data['session_id']==58710]

In [None]:
# Distribution of the click label before filtering.
rec_data['click'].value_counts()

In [None]:
# Number of rows per `user_id`.
rec_data['user_id'].value_counts()

In [None]:
# Keep only the rows whose click label equals 1 and renumber the index.
is_clicked = rec_data['click'] == 1
rec_data = rec_data.loc[is_clicked].reset_index(drop=True)
rec_data.shape

In [None]:
# `to_pickle` does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(f"{base_path}/raw_data", exist_ok=True)
# Persist the clicked recommendation interactions.
rec_data.to_pickle(f"{base_path}/raw_data/rec_inter.pkl")

## Search (src) data

In [None]:
# Load the search-train parquet shard, then preview its first rows and its shape.
src_train_session = read_parquet(f"{base_path}/orig_data/search_train/train-00000-of-00001.parquet")
src_train_session.head()
src_train_session.shape

In [None]:
# Count how many rows share each `search_idx` value.
src_train_session['search_idx'].value_counts()

In [None]:
# Flatten the nested search results: every element of `search_result_details_with_idx`
# becomes one output row that carries the row-level fields (user, search, session,
# query, query_from_type) next to the per-result fields (note, click, timestamp).
src_train_data = []

# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for idx, row in tqdm(src_train_session.iterrows(), total=len(src_train_session)):
    for src_result in row['search_result_details_with_idx']:
        src_train_data.append({
            'user_idx': row['user_idx'],
            'search_idx': row['search_idx'],
            'session_idx': row['session_idx'],
            'query': row['query'],
            'query_from_type': row['query_from_type'],
            'note_idx': src_result['note_idx'],
            'click': src_result['click'],
            'search_timestamp': src_result['search_timestamp']
        })

# Build the frame once from the collected records, then preview it.
src_train = pd.DataFrame(src_train_data)
src_train.head(1)
src_train.shape

In [None]:
# Load the search-test parquet shard, then preview its first rows and its shape.
src_test_session = read_parquet(f"{base_path}/orig_data/search_test/train-00000-of-00001.parquet")
src_test_session.head()
src_test_session.shape

In [None]:
# Count how many rows share each `search_idx` value in the test split.
src_test_session['search_idx'].value_counts()

In [None]:
# Flatten the nested search results of the test split: every element of
# `search_result_details_with_idx` becomes one output row that carries the row-level
# fields (user, search, session, query, query_from_type) next to the per-result fields.
src_test_data = []

# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for idx, row in tqdm(src_test_session.iterrows(), total=len(src_test_session)):
    for src_result in row['search_result_details_with_idx']:
        src_test_data.append({
            'user_idx': row['user_idx'],
            'search_idx': row['search_idx'],
            'session_idx': row['session_idx'],
            'query': row['query'],
            'query_from_type': row['query_from_type'],
            'note_idx': src_result['note_idx'],
            'click': src_result['click'],
            'search_timestamp': src_result['search_timestamp']
        })

# Build the frame once from the collected records, then preview it.
src_test = pd.DataFrame(src_test_data)
src_test.head(1)
src_test.shape

In [None]:
# Number of flattened result rows per `search_idx`.
src_test['search_idx'].value_counts()

In [None]:
# Spot-check every flattened row of a single search.
# NOTE(review): 56606 is a hard-coded id, presumably picked from the counts above —
# confirm it still exists if the input data changes.
src_test[src_test['search_idx']==56606]

In [None]:
# Tag each row with the split it came from (1 = original train, 0 = original test).
src_train['orig_train'] = 1
src_test['orig_train'] = 0

# Old -> new column names for the merged search table.
src_column_names = {
    'user_idx': 'user_id',
    'note_idx': 'item_id',
    'search_timestamp': 'timestamp',
    'search_idx': 'search_id',
    'session_idx': 'session_id',
}

# Stack both splits, order rows by user and search time, renumber, then rename.
src_data = (
    pd.concat([src_train, src_test], axis=0)
    .sort_values(by=['user_idx', 'search_timestamp'])
    .reset_index(drop=True)
    .rename(columns=src_column_names)
)
src_data.head(1)
src_data.shape

In [None]:
# Number of rows per `search_id` in the merged table.
src_data['search_id'].value_counts()

In [None]:
# Number of rows per distinct query string.
src_data['query'].value_counts()

In [None]:
# Spot-check every row of a single search in the merged table.
# NOTE(review): 56606 is a hard-coded id — confirm it still exists if the input data changes.
src_data[src_data['search_id'] == 56606]

### Process Search Data

In [None]:
# From here on the search id is treated as the id of a search session.
src_inter = src_data.rename(columns={'search_id':'search_session_id'})

# One row per (user, search session, query); the per-result click labels, item ids
# and timestamps are gathered into three parallel lists.
session_keys = ['user_id', 'search_session_id', 'query']
session_src_inter = (
    src_inter
    .groupby(by=session_keys)
    .agg(
        click_list=('click', list),
        item_list=("item_id", list),
        time_list=("timestamp", list),
    )
    .reset_index()
    .sort_values(by=['user_id', 'search_session_id'])
    .reset_index(drop=True)
)
session_src_inter.head()
session_src_inter.shape

In [None]:
# Number of grouped rows per `search_session_id`.
session_src_inter['search_session_id'].value_counts()

In [None]:
def filterSessionPositive(row):
    """Reduce one search session to its clicked results.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series``)
        Must provide ``user_id``, ``query``, ``search_session_id`` and three
        equally long, position-aligned lists: ``click_list``, ``item_list``
        and ``time_list``.

    Returns
    -------
    pd.Series
        Keys ``user_id``, ``query``, ``search_session_id``, ``click_list``,
        ``item_list`` and ``time_list``. The three lists contain only the
        positions whose click label is non-zero. If no position qualifies, the
        first result is kept as a single placeholder with click label ``0``.

    Raises
    ------
    AssertionError
        If the three lists do not have the same length.
    IndexError
        If the lists are empty (there is no first result to keep).
    """
    click_list = row['click_list']
    item_list = row['item_list']
    time_list = row['time_list']

    assert len(click_list) == len(item_list)
    assert len(time_list) == len(item_list)

    # Positions with a non-zero click label, kept in their original order.
    kept = [
        (click, item, time)
        for click, item, time in zip(click_list, item_list, time_list)
        if click != 0
    ]

    if kept:
        new_click_list = [click for click, _, _ in kept]
        new_item_list = [item for _, item, _ in kept]
        new_time_list = [time for _, _, time in kept]
    else:
        # Decide the fallback from "nothing was kept", i.e. the same `!= 0`
        # predicate used above. Testing `sum(click_list) == 0` instead would
        # throw away kept results whenever the labels happen to cancel out.
        new_click_list = [0]
        new_item_list = [item_list[0]]
        new_time_list = [time_list[0]]

    return pd.Series({"user_id":row['user_id'],
                      "query": row['query'],
                      "search_session_id":row['search_session_id'],
                      "click_list":new_click_list,
                      "item_list":new_item_list,
                      "time_list":new_time_list})

In [None]:
# Apply `filterSessionPositive` to every grouped session and collect the resulting
# Series into a new frame (one row per session).
new_session_src_inter_list = []
# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for idx,line in tqdm(session_src_inter.iterrows(), total=len(session_src_inter)):
    new_session_src_inter_list.append(filterSessionPositive(line))
new_session_src_inter = pd.DataFrame(new_session_src_inter_list)

In [None]:
# Preview the filtered per-session table and its shape.
new_session_src_inter.head()
new_session_src_inter.shape

In [None]:
# Per session, count how many of the kept click labels are non-zero.
new_session_src_inter['num_click'] = new_session_src_inter['click_list'].apply(
    lambda clicks: sum(1 for label in clicks if label != 0)
)
new_session_src_inter.head()

In [None]:
# Distribution of the per-session click count (0 marks sessions kept only as a placeholder).
new_session_src_inter['num_click'].value_counts()

In [None]:
# Expand every session's parallel lists back into one row per kept result.
new_src_inter_list = []
# `iterrows()` is a generator without a length, so tqdm needs an explicit `total`
# to show a percentage and an ETA instead of a bare counter.
for _, line in tqdm(new_session_src_inter.iterrows(), total=len(new_session_src_inter)):
    user_id, search_session_id, query = line['user_id'], line['search_session_id'], line['query']
    time_list, click_list, item_list = line['time_list'], line['click_list'], line['item_list']

    # The three lists are position-aligned, so index i addresses one result in each.
    for i in range(len(item_list)):
        new_src_inter_list.append((user_id,search_session_id,query,item_list[i],click_list[i],time_list[i]))

new_src_inter = pd.DataFrame(data=new_src_inter_list,
                             columns=['user_id','search_session_id','query',"item_id","click","timestamp"]
                            )
new_src_inter.head()
new_src_inter.shape

In [None]:
# From here on `src_data` names the filtered per-result search table
# (this rebinds the name to the same object as `new_src_inter`; it is not a copy).
src_data = new_src_inter
src_data.head()
src_data.shape

In [None]:
# `to_pickle` does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(f"{base_path}/raw_data", exist_ok=True)
# Persist the filtered search interactions.
src_data.to_pickle(f"{base_path}/raw_data/src_inter.pkl")

## Item features

In [None]:
# Read every parquet shard in the notes folder and stack them into one frame.
note_data_list = []
# `os.listdir` returns names in arbitrary order; sorting makes the row order of the
# concatenated frame (and of the pickle written from it) reproducible.
for file in sorted(os.listdir(f"{base_path}/orig_data/notes/")):
    if file.endswith(".parquet"):
        note_data_list.append(read_parquet(f"{base_path}/orig_data/notes/"+file))

note_data = pd.concat(note_data_list,axis=0)
note_data.head()
note_data.shape

In [None]:
# Look at the content of the first note to see what the text looks like.
note_data['note_content'].iloc[0]

In [None]:
def get_note_text(row):
    """Build a note's text as ``"<note_title> <note_content>"``.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series``)
        Must provide ``note_title`` and ``note_content``.

    Returns
    -------
    str
        Title and content joined by a single space. A field that is not a
        string (for example ``None`` or ``NaN`` for a missing value) is treated
        as an empty string instead of raising ``TypeError``; the separating
        space is always kept.
    """
    title = row['note_title']
    content = row['note_content']
    # Missing values cannot be concatenated with str, so fall back to "".
    title = title if isinstance(title, str) else ""
    content = content if isinstance(content, str) else ""
    return title + " " + content

# Add a `text` column by calling `get_note_text` on each row, then preview one row.
note_data['text'] = note_data.apply(get_note_text, axis=1)
note_data.head(1)

In [None]:
# Use the same key name (`item_id`) as the interaction tables.
note_data = note_data.rename(columns={'note_idx': 'item_id'})
note_data.head(1)

In [None]:
# Distinct item ids that occur in the recommendation and in the search interactions.
rec_item_set = set(rec_data['item_id'].unique())
src_item_set = set(src_data['item_id'].unique())

# Sizes of: rec items, search items, their union, and their overlap.
len(rec_item_set)
len(src_item_set)
len(rec_item_set | src_item_set)
len(rec_item_set & src_item_set)

In [None]:
# Keep only the notes whose item_id occurs in either interaction table, then renumber.
note_data = note_data[note_data['item_id'].isin(rec_item_set | src_item_set)].reset_index(drop=True)
note_data.shape

In [None]:
# `to_pickle` does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(f"{base_path}/raw_data", exist_ok=True)
# Persist the filtered item (note) features.
note_data.to_pickle(f"{base_path}/raw_data/item_feat.pkl")

## User features

In [None]:
# Load the user-feature parquet shard, then preview one row and the shape.
user_feat = read_parquet(f"{base_path}/orig_data/user_feat/train-00000-of-00001.parquet")
user_feat.head(1)
user_feat.shape

In [None]:
# Number of distinct values in the `dense_feat39` column.
user_feat['dense_feat39'].nunique()

In [None]:
# List all available user-feature columns.
user_feat.columns

In [None]:
# Use the same key name (`user_id`) as the interaction tables.
user_feat = user_feat.rename(columns={'user_idx': 'user_id'})
user_feat.head(1)

In [None]:
# Distinct user ids that occur in the recommendation and in the search interactions.
rec_user_set = set(rec_data['user_id'].unique())
src_user_set = set(src_data['user_id'].unique())

# Sizes of: rec users, search users, their union, and their overlap.
len(rec_user_set)
len(src_user_set)
len(rec_user_set | src_user_set)
len(rec_user_set & src_user_set)

# NOTE: despite the `_set` suffix this is a list holding the union of both user sets.
all_user_set = list(rec_user_set | src_user_set)

In [None]:
# Keep only the users that occur in either interaction table, then renumber.
user_feat = user_feat[user_feat['user_id'].isin(all_user_set)].reset_index(drop=True)
user_feat.shape

In [None]:
# `to_pickle` does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(f"{base_path}/raw_data", exist_ok=True)
# Persist the filtered user features.
user_feat.to_pickle(f"{base_path}/raw_data/user_feat.pkl")