In [1]:
# import required libraries

import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm.tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Loading datasets
# Download and mention the respective paths

buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')

In [3]:
# Reading datasets

initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32',
                                     'Category': 'str'}) # read file into dataframe by column names

print(initial_buys_df.head())  # first five records
print(initial_buys_df.shape)   # shape of the dataframe

# Reading datasets

initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'str'})

print(initial_clicks_df.head())
print(initial_clicks_df.shape)

   Session ID                 Timestamp      Item ID Category  Quantity
0    420374.0  2014-04-06T18:44:58.314Z  214537888.0    12462         1
1    420374.0  2014-04-06T18:44:58.325Z  214537856.0    10471         1
2    281626.0  2014-04-06T09:40:13.032Z  214535648.0     1883         1
3    420368.0  2014-04-04T06:13:28.848Z  214530576.0     6073         1
4    420368.0  2014-04-04T06:13:28.858Z  214835024.0     2617         1
(1150753, 5)
   Session ID                 Timestamp    Item ID Category
0           1  2014-04-07T10:51:09.277Z  214536502        0
1           1  2014-04-07T10:54:09.868Z  214536500        0
2           1  2014-04-07T10:54:46.998Z  214536506        0
3           1  2014-04-07T10:57:00.306Z  214577561        0
4           2  2014-04-07T13:56:37.614Z  214662742        0
(33003944, 4)


In [4]:
# Make 'Session ID' column as index

initial_buys_df.set_index('Session ID', inplace=True)
print(initial_buys_df.head())
print(initial_buys_df.shape)

initial_clicks_df.set_index('Session ID', inplace=True)
print(initial_clicks_df.head())
print(initial_clicks_df.shape)

                           Timestamp      Item ID Category  Quantity
Session ID                                                          
420374.0    2014-04-06T18:44:58.314Z  214537888.0    12462         1
420374.0    2014-04-06T18:44:58.325Z  214537856.0    10471         1
281626.0    2014-04-06T09:40:13.032Z  214535648.0     1883         1
420368.0    2014-04-04T06:13:28.848Z  214530576.0     6073         1
420368.0    2014-04-04T06:13:28.858Z  214835024.0     2617         1
(1150753, 4)
                           Timestamp    Item ID Category
Session ID                                              
1           2014-04-07T10:51:09.277Z  214536502        0
1           2014-04-07T10:54:09.868Z  214536500        0
1           2014-04-07T10:54:46.998Z  214536506        0
1           2014-04-07T10:57:00.306Z  214577561        0
2           2014-04-07T13:56:37.614Z  214662742        0
(33003944, 3)


In [5]:
# We won't use timestamps in this example, remove 'Timestamp' column from dataframe(df)

initial_buys_df = initial_buys_df.drop('Timestamp', 1)
print(initial_buys_df.head(n=5))
print(initial_buys_df.shape)

initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)
print(initial_clicks_df.head(n=5))
print(initial_clicks_df.shape)

                Item ID Category  Quantity
Session ID                                
420374.0    214537888.0    12462         1
420374.0    214537856.0    10471         1
281626.0    214535648.0     1883         1
420368.0    214530576.0     6073         1
420368.0    214835024.0     2617         1
(1150753, 3)
              Item ID Category
Session ID                    
1           214536502        0
1           214536500        0
1           214536506        0
1           214577561        0
2           214662742        0
(33003944, 2)


In [6]:
# For illustrative purposes, we will only use a subset of the data: top 10000 buying users,

x = Counter(initial_buys_df.index).most_common(10000) # count top 10000 most common session ID's
top_k = dict(x).keys()                                # find respective keys

initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]  # Assign the most common to df
print(initial_buys_df.head())
print(initial_buys_df.shape)

initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]
print(initial_clicks_df.head())
print(initial_clicks_df.shape)

                Item ID Category  Quantity
Session ID                                
420471.0    214717888.0     2092         1
420471.0    214821024.0     1570         1
420471.0    214829280.0      837         1
420471.0    214819552.0      418         1
420471.0    214746384.0      784         1
(106956, 3)
              Item ID Category
Session ID                    
932         214826906        0
932         214826906        0
932         214826906        0
932         214826955        0
932         214826955        0
(209024, 2)


In [7]:
# Create a copy of the index, since we will also apply one-hot encoding on the index

initial_buys_df['_Session ID'] = initial_buys_df.index
print(initial_buys_df.head())
print(initial_buys_df.shape)

                Item ID Category  Quantity  _Session ID
Session ID                                             
420471.0    214717888.0     2092         1     420471.0
420471.0    214821024.0     1570         1     420471.0
420471.0    214829280.0      837         1     420471.0
420471.0    214819552.0      418         1     420471.0
420471.0    214746384.0      784         1     420471.0
(106956, 4)


In [8]:
# One-hot encode all columns for buys 

transformed_buys = pd.get_dummies(initial_buys_df)
print(transformed_buys.shape)
# print(transformed_buys.head())

(106956, 356)


In [9]:
# One-hot encode all columns for clicks 

transformed_clicks = pd.get_dummies(initial_clicks_df)
print(transformed_clicks.shape)
#print(transformed_clicks.head())

(209024, 56)


In [10]:
# Aggregate historical data for Items and Categories for buys

filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
print(filtered_buys.shape)
#print(filtered_buys.head())

(106956, 354)


In [11]:
# Aggregate historical data for Items and Categories for clicks

filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")
print(filtered_clicks.shape)
#print(filtered_clicks.head())

(209024, 56)


In [12]:
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
print(historical_buy_data.shape)
#print(historical_buy_data.head())

(10000, 354)


In [13]:
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)
print(historical_buy_data.shape)
#print(historical_buy_data.head())

(10000, 354)


In [14]:
historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
print(historical_click_data.shape)
#print(historical_click_data.head())

(10000, 56)


In [15]:
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)
print(historical_click_data.shape)
#print(historical_click_data.head())

(10000, 56)


In [16]:
# Merge historical data of every user_id

merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
print(merged1.shape)
#print(merged1.head())

merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)
print(merged2.shape)
#print(merged2.head())

(106956, 710)
(106956, 766)


In [17]:
y = np.array(merged2['Quantity'].as_matrix())
print(y.shape)
print(y[0:10])

(106956,)
[2 2 1 2 1 1 2 2 2 1]


In [18]:
merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID', 'Quantity'], 1, inplace=True)
#print(merged2.columns)
X = np.array(merged2)
print(len(X),len(X[0]))
X = np.nan_to_num(X)
print(len(X),len(X[0]))

106956 762
106956 762


In [19]:
print(len(X),X[0]) # number of records and values in one record

106956 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  

In [20]:
# Create the MF model, you can play around with the hyper-parameters 

model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense',
    log_dir = '/home/kishore/upwork/logs/',
    verbose=1
)

In [21]:
# Split data into train, test

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape)

(85564, 762) (21392, 762) (85564,) (21392,)


In [22]:
#Split testing data in half: Full information vs Cold-start

X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)
print(X_te.shape, X_te_cs.shape, y_te.shape, y_te_cs.shape)

(10696, 762) (10696, 762) (10696,) (10696,)


In [23]:
# Cold Start

cold_start = pd.DataFrame(X_te_cs, columns=merged2.columns)
print(cold_start.shape)
#print(cold_start.head())

(10696, 762)


In [24]:
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set

for column in cold_start.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        cold_start[column] = 0
        
print(cold_start.shape)
#print(cold_start.head(n=10))

(10696, 762)


In [25]:
# fit the model

model.fit(X_tr, y_tr, show_progress=True)

  0%|          | 0/100 [00:00<?, ?epoch/s]

Initialize logs, use: 
tensorboard --logdir=/home/kishore/upwork/logs


100%|██████████| 100/100 [04:37<00:00,  2.72s/epoch]


In [26]:
# Compute the mean squared error for both test sets

predictions = model.predict(X_te)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
print("predictions:",predictions[:10])
print("actual value:",y_te[:10])

MSE: 0.3576960753907961
predictions: [-0.82515335 -0.36797437  0.04699847  0.34387678  2.74649882  2.10551763
  0.07465315  0.86455691  0.81601524  0.9363333 ]
actual value: [0 0 0 0 2 2 1 1 1 1]


In [27]:
# Compute the mean squared error for both test sets

cold_start_predictions = model.predict(X_te_cs)
print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cs, cold_start_predictions)))
print("cold start predictions:",cold_start_predictions[:10])
print("actual value:",y_te_cs[:10])

Cold-start MSE: 0.37049781857308084
cold start predictions: [ 0.10383224  0.05325939  1.07749355 -0.77810884  1.35151124  0.79925859
 -0.23732901  0.3737044   1.4157995   3.11727738]
actual value: [0 0 1 0 2 1 0 1 0 1]


In [28]:
model.destroy()