In [1]:
import paths
import random
%cd {paths.base}
from os.path import join

%load_ext autoreload
%autoreload 2
from simtrain import process_dat, explore_models, utils
from simtrain import SETTINGS_POLIMI as SETTINGS
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import scipy
from scipy import stats, sparse
import shutil
from datetime import datetime
import os
os.environ['NUMEXPR_MAX_THREADS'] = SETTINGS.NUMEXPR_MAX_THREADS

/home/thahit/github/Recommender_Sim


In [2]:
print(SETTINGS.simulation_components)
ab_test_id = SETTINGS.simulation_components['ab_test_id']
input_data_path = join(paths.dat, SETTINGS.rootpaths['input'])
full_data_path = join(input_data_path, SETTINGS.filepaths['impressions_data_test']) % ab_test_id
subset_data_path = full_data_path + '_subset'

cell_col = 'acnt.test_cell_nbr'
cells = {'train':1,
        'valid':2,
        'test_seen_users':3,
        'test_unseen_users':4}

{'ab_test_id': 'train', 'rec_model_cell_nbr': [1], 'user_model_cell_nbr': 1, 'visit_model_cell_nbr': 1}


In [3]:
train_dat, stg = process_dat.load_dat(paths.cw_stages['output']['train'])
print('settings',stg)
train_dat.drop(['Unnamed: 0.1', "Unnamed: 0"], axis=1, inplace=True)
train_dat.reward = train_dat.reward.astype(np.int32)
train_dat.head()

settings {'NI': 302, 'NU': 328, 'T': 69.9892349243164, 'NS': 100, 'INF_TIME': 1000}


Unnamed: 0,user_id,time,action,state,rec_id,reward,round_time
0,188,5.418762,119,,0,0,5.416667
1,188,5.418762,74,,0,1,5.416667
2,188,5.418762,132,74.0,0,0,5.416667
3,188,5.418762,195,74.0,0,0,5.416667
4,188,5.418762,249,74.0,0,0,5.416667


In [4]:
validation_dat, _ = process_dat.load_dat(paths.cw_stages['output']['validate'])
test_seen_users_dat, _ = process_dat.load_dat(paths.cw_stages['output']['test-seen'])
test_unseen_users_dat, _ = process_dat.load_dat(paths.cw_stages['output']['test-unseen'])

validation_dat.drop(['Unnamed: 0.1', "Unnamed: 0"], axis=1, inplace=True)
test_seen_users_dat.drop(['Unnamed: 0.1', "Unnamed: 0"], axis=1, inplace=True)
test_unseen_users_dat.drop(['Unnamed: 0.1', "Unnamed: 0"], axis=1, inplace=True)


In [5]:

def process_data(df, state_size, random_user_init = False):
    # Initialize data structures
    user_data = {}
    user_means = {}
    user_vars_log = {}

    # Group by User ID
    for user_id, group in df.groupby('user_id'):
        # Calculate user means and variances (these should be computed according to your needs)
        if random_user_init:
            user_means[user_id] = np.random.randn(state_size).tolist()
            user_vars_log[user_id] = np.random.randn(state_size).tolist()
        else:
            user_means[user_id] = np.zeros((state_size)).tolist()
            user_vars_log[user_id] = np.zeros((state_size)).tolist()

        interactions = {'item_ids':[],
                        'timestamp': [],
                        'interaction_type': []}
        
        group = group.sort_values(by='round_time')
        for timestamp, group_by_time in group.groupby('round_time'):# could  use round_time, or time
            rewards = group_by_time['reward'].tolist()
            if len(rewards)<2:
                continue
            interactions['interaction_type'].append(rewards)
            interactions['timestamp'].append(timestamp)
            interactions['item_ids'].append(group_by_time['action'].tolist())

        # Save data in a dictionary
        user_data[user_id] = {
            'user_means': user_means[user_id],
            'user_vars_log': user_vars_log[user_id],
        }
        user_data[user_id].update(interactions)

    return user_data

def convert_to_dataframe(user_data):
    rows = []
    for user_id, data in user_data.items():
        # Prepare row
        row = {
            'user_id': user_id,
            'user_means': data['user_means'],
            'user_vars_log': data['user_vars_log'],
            'item_ids': data['item_ids'],
            'timestamps': data['timestamp'],
            'interaction_types': data['interaction_type'],
        }
        rows.append(row)
    
    # Create new DataFrame
    df = pd.DataFrame(rows)
    return df

def transform_datasets(df, state_size, random_user_init = False):
    df = process_data(df, state_size, random_user_init)
    df = convert_to_dataframe(df)
    return df


In [6]:
# Transform datasets
state_size = SETTINGS.STATE_SIZE

train_transformed = transform_datasets(train_dat, state_size=state_size)
train_transformed.head()

Unnamed: 0,user_id,user_means,user_vars_log,item_ids,timestamps,interaction_types
0,188,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[[119, 74, 263, 144, 261, 53, 217, 194, 178, 2...","[5.416666666666667, 12.25, 13.645833333333334,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,491,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[[144, 279, 79, 84, 74, 247, 162, 165, 161, 13...","[54.35416666666666, 57.270833333333336, 57.354...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
2,561,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[[106, 125, 158, 27, 269, 264, 110, 50, 19, 16...","[20.58333333333333, 26.64583333333333, 28.5625...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,670,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[[165, 157, 187, 155, 95, 202, 99, 237, 288, 5...","[20.33333333333333, 26.39583333333333, 35.3125...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,749,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[[161, 199, 279, 12, 37, 84, 74, 132, 161, 284...","[5.083333333333333, 6.395833333333333, 11.8125...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [7]:
validate_transformed = transform_datasets(validation_dat, state_size=state_size)
test_seen_transformed = transform_datasets(test_seen_users_dat, state_size=state_size)
test_unseen_transformed =transform_datasets(test_unseen_users_dat, state_size=state_size)

In [8]:
train_transformed.to_csv(paths.cw_stages['output_new']['train'], index=False)
validate_transformed.to_csv(paths.cw_stages['output_new']['validate'], index=False)
test_seen_transformed.to_csv(paths.cw_stages['output_new']['test-seen'], index=False)
test_unseen_transformed.to_csv(paths.cw_stages['output_new']['test-unseen'], index=False)