In [2]:
# import required libraries

import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm.tffm import TFFMRegressor
from tffm.tffm import TFFMClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Loading datasets
# Download and mention the respective paths

buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')

In [4]:
# Reading datasets

initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32',
                                     'Category': 'str'}) # read file into dataframe by column names

print(initial_buys_df.head())  # first five records
print(initial_buys_df.shape)   # shape of the dataframe

# Reading datasets

initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'str'})

print(initial_clicks_df.head())
print(initial_clicks_df.shape)

   Session ID                 Timestamp      Item ID Category  Quantity
0    420374.0  2014-04-06T18:44:58.314Z  214537888.0    12462         1
1    420374.0  2014-04-06T18:44:58.325Z  214537856.0    10471         1
2    281626.0  2014-04-06T09:40:13.032Z  214535648.0     1883         1
3    420368.0  2014-04-04T06:13:28.848Z  214530576.0     6073         1
4    420368.0  2014-04-04T06:13:28.858Z  214835024.0     2617         1
(1150753, 5)
   Session ID                 Timestamp    Item ID Category
0           1  2014-04-07T10:51:09.277Z  214536502        0
1           1  2014-04-07T10:54:09.868Z  214536500        0
2           1  2014-04-07T10:54:46.998Z  214536506        0
3           1  2014-04-07T10:57:00.306Z  214577561        0
4           2  2014-04-07T13:56:37.614Z  214662742        0
(33003944, 4)


In [5]:
# Make 'Session ID' column as index

initial_buys_df.set_index('Session ID', inplace=True)
print(initial_buys_df.head())
print(initial_buys_df.shape)

initial_clicks_df.set_index('Session ID', inplace=True)
print(initial_clicks_df.head())
print(initial_clicks_df.shape)

                           Timestamp      Item ID Category  Quantity
Session ID                                                          
420374.0    2014-04-06T18:44:58.314Z  214537888.0    12462         1
420374.0    2014-04-06T18:44:58.325Z  214537856.0    10471         1
281626.0    2014-04-06T09:40:13.032Z  214535648.0     1883         1
420368.0    2014-04-04T06:13:28.848Z  214530576.0     6073         1
420368.0    2014-04-04T06:13:28.858Z  214835024.0     2617         1
(1150753, 4)
                           Timestamp    Item ID Category
Session ID                                              
1           2014-04-07T10:51:09.277Z  214536502        0
1           2014-04-07T10:54:09.868Z  214536500        0
1           2014-04-07T10:54:46.998Z  214536506        0
1           2014-04-07T10:57:00.306Z  214577561        0
2           2014-04-07T13:56:37.614Z  214662742        0
(33003944, 3)


In [6]:
# We won't use timestamps in this example, remove 'Timestamp' column from dataframe(df)

initial_buys_df = initial_buys_df.drop('Timestamp', 1)
print(initial_buys_df.head(n=5))
print(initial_buys_df.shape)

initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)
print(initial_clicks_df.head(n=5))
print(initial_clicks_df.shape)

                Item ID Category  Quantity
Session ID                                
420374.0    214537888.0    12462         1
420374.0    214537856.0    10471         1
281626.0    214535648.0     1883         1
420368.0    214530576.0     6073         1
420368.0    214835024.0     2617         1
(1150753, 3)
              Item ID Category
Session ID                    
1           214536502        0
1           214536500        0
1           214536506        0
1           214577561        0
2           214662742        0
(33003944, 2)


In [7]:
# For illustrative purposes, we will only use a subset of the data: top 10000 buying users,

x = Counter(initial_buys_df.index).most_common(10000) # count top 10000 most common session ID's
top_k = dict(x).keys()                                # find respective keys

initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]  # Assign the most common to df
print(initial_buys_df.head())
print(initial_buys_df.shape)

initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]
print(initial_clicks_df.head())
print(initial_clicks_df.shape)

                Item ID Category  Quantity
Session ID                                
420471.0    214717888.0     2092         1
420471.0    214821024.0     1570         1
420471.0    214829280.0      837         1
420471.0    214819552.0      418         1
420471.0    214746384.0      784         1
(106956, 3)
              Item ID Category
Session ID                    
932         214826906        0
932         214826906        0
932         214826906        0
932         214826955        0
932         214826955        0
(209024, 2)


In [8]:
# Create a copy of the index, since we will also apply one-hot encoding on the index

initial_buys_df['_Session ID'] = initial_buys_df.index
print(initial_buys_df.head())
print(initial_buys_df.shape)

                Item ID Category  Quantity  _Session ID
Session ID                                             
420471.0    214717888.0     2092         1     420471.0
420471.0    214821024.0     1570         1     420471.0
420471.0    214829280.0      837         1     420471.0
420471.0    214819552.0      418         1     420471.0
420471.0    214746384.0      784         1     420471.0
(106956, 4)


In [9]:
# One-hot encode all columns for buys 

transformed_buys = pd.get_dummies(initial_buys_df)
print(transformed_buys.shape)
#print(transformed_buys.head())

(106956, 356)


In [10]:
# One-hot encode all columns for clicks 

transformed_clicks = pd.get_dummies(initial_clicks_df)
print(transformed_clicks.shape)
#print(transformed_clicks.head())

(209024, 56)


In [11]:
# Aggregate historical data for Items and Categories for buys

filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
print(filtered_buys.shape)
#print(filtered_buys.head())

(106956, 354)


In [12]:
# Aggregate historical data for Items and Categories for clicks

filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")
print(filtered_clicks.shape)
#print(filtered_clicks.head())

(209024, 56)


In [13]:
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
print(historical_buy_data.shape)
#print(historical_buy_data.head())

(10000, 354)


In [14]:
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)
print(historical_buy_data.shape)
#print(historical_buy_data.head())

(10000, 354)


In [15]:
historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
print(historical_click_data.shape)
#print(historical_click_data.head())

(10000, 56)


In [16]:
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)
print(historical_click_data.shape)
#print(historical_click_data.head())

(10000, 56)


In [17]:
# Merge historical data of every user_id

merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
print(merged1.shape)
#print(merged1.head())

merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)
print(merged2.shape)
#print(merged2.head())

(106956, 710)
(106956, 766)


In [18]:
y = np.array(merged2['Quantity'].as_matrix())
print(y.shape)
print(y[0:100])

print(y, y.shape[0])
print(y[0])
for i in range(y.shape[0]):
    if y[i]!=0:
        y[i]=1
    else:
        y[i]=0
print(y[0:100])
print(y, y.shape)

(106956,)
[ 2  2  1  2  1  1  2  2  2  1  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  2  2  1  2  2  2  1  2  1  1  1  1  1  1
  1  1  2  5  4  2  4  2  2  5  1  2  2  1  2  1  2  1  1  1  1  1  1  1  1
  2  1  1  2  1  1  1  1  1  2  1 10  6  1 10  1  6  1 10  1  6  1  0  0  0]
[2 2 1 ..., 2 1 1] 106956
2
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
[1 1 1 ..., 1 1 1] (106956,)


In [19]:
#print(merged2.columns)
X_tr, X_te, y_tr, y_te = train_test_split(merged2, y, test_size=0.2)

print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape)


(85564, 766) (21392, 766) (85564,) (21392,)


In [20]:
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)
print(X_te.shape, X_te_cs.shape, y_te.shape, y_te_cs.shape)

(10696, 766) (10696, 766) (10696,) (10696,)


In [21]:
test_x = pd.DataFrame(X_te, columns = ['Item ID'])
#test_x["y"]= y_te
print(test_x.head())
print(test_x.shape)
print(test_x.index)
#print(len(set(test_x.index)))
#print(set(test_x.index))

#list(df['preTestScore'].groupby(df['regiment']))
#print(test_x["Item ID","Quantity"].groupby(test_x.index))
#xxx=list(test_x["Item ID"].groupby(test_x.index))
#print(xxx[1])

test_x_cs = pd.DataFrame(X_te_cs, columns = ['Item ID'])
print(test_x_cs.head())

                Item ID
Session ID             
2614096     214829888.0
6388687     214845456.0
517818      214837488.0
6498748     214691520.0
2541201     214845104.0
(10696, 1)
Int64Index([ 2614096,  6388687,   517818,  6498748,  2541201,  8549471,
             7517602,  1776852,  1600304,  4882213,
            ...
             4409038,  5258609, 11543372,  8832557,  5148522,  9672578,
             2121008,  8218119,  1755582,  1509564],
           dtype='int64', name='Session ID', length=10696)
                Item ID
Session ID             
17929       214827008.0
161673      214826928.0
10914216    214854848.0
9075227     214678368.0
8356289     214716672.0


In [22]:
X_tr.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID', 'Quantity'], 1, inplace=True)
X_te.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID', 'Quantity'], 1, inplace=True)
X_te_cs.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID', 'Quantity'], 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape)
print(X_te.shape, X_te_cs.shape, y_te.shape, y_te_cs.shape)

(85564, 761) (10696, 761) (85564,) (10696,)
(10696, 761) (10696, 761) (10696,) (10696,)


In [24]:
#X = np.array(merged2)
#X = np.nan_to_num(X)

ax_tr = np.array(X_tr)
ax_te = np.array(X_te)
ax_te_cs = np.array(X_te_cs)

ax_tr = np.nan_to_num(ax_tr)
ax_te = np.nan_to_num(ax_te)
ax_te_cs = np.nan_to_num(ax_te_cs)

print(np.shape(ax_tr),np.shape(ax_te),np.shape(ax_te))

(85564, 761) (10696, 761) (10696, 761)


In [25]:
# Create the MF model, you can play around with the hyper-parameters 

model = TFFMClassifier(
        order=2, 
        rank=7, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=10, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='dense',
        log_dir = '/home/kishore/upwork/logs/',
        verbose=1
        #seed=42
    )

model1 = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense',
    log_dir = '/home/kishore/upwork/logs/',
    verbose=1
)

In [26]:
# Cold Start

cold_start = pd.DataFrame(ax_te_cs, columns=X_tr.columns)
print(cold_start.shape)
#print(cold_start.head())

(10696, 761)


In [27]:
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set

for column in cold_start.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        cold_start[column] = 0
        
print(cold_start.shape)
#print(cold_start.head(n=10))

(10696, 761)


In [28]:
# fit the model

model.fit(ax_tr, y_tr, show_progress=True)

  0%|          | 0/10 [00:00<?, ?epoch/s]

Initialize logs, use: 
tensorboard --logdir=/home/kishore/upwork/logs


100%|██████████| 10/10 [00:32<00:00,  3.27s/epoch]


In [29]:
# Compute the mean squared error for both test sets
from sklearn.metrics import roc_auc_score, accuracy_score

predictions = model.predict(ax_te)
print(predictions.shape)
#print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
print("predictions:",predictions[:10])
print("actual value:",y_te[:10])

(10696,)
accuracy: 1.0
predictions: [0 0 1 0 0 1 0 1 1 0]
actual value: [0 0 1 0 0 1 0 1 1 0]


In [30]:
# Compute the mean squared error for both test sets

cold_start_predictions = model.predict(ax_te_cs)
#print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cs, cold_start_predictions)))
print('Cold-start accuracy: {}'.format(accuracy_score(y_te_cs, cold_start_predictions)))
print("cold start predictions:",cold_start_predictions[:10])
print("actual value:",y_te_cs[:10])

Cold-start accuracy: 1.0
cold start predictions: [1 1 1 1 1 0 1 0 0 1]
actual value: [1 1 1 1 1 0 1 0 0 1]


In [31]:
model.destroy()

In [32]:
test_x["Predicted"] = predictions
print(test_x.head())

test_x_cs["Predicted"] = cold_start_predictions
print(test_x_cs.head())

                Item ID  Predicted
Session ID                        
2614096     214829888.0          0
6388687     214845456.0          0
517818      214837488.0          1
6498748     214691520.0          0
2541201     214845104.0          0
                Item ID  Predicted
Session ID                        
17929       214827008.0          1
161673      214826928.0          1
10914216    214854848.0          1
9075227     214678368.0          1
8356289     214716672.0          1


In [35]:
#print(len(set(test_x.index)))
sess = list(set(test_x.index))
print(sess[0])
print(len(sess))

print(test_x.loc[2614096]["Predicted"])


5505032
6578
0.0


In [40]:
fout = open("solution.dat", "w")
#print(sess[10:20])
print("writing the results into .dat file....")
for i in sess:
    #print(test_x.loc[i]["Predicted"])
    if test_x.loc[i]["Predicted"].any()!= 0:
        #print(i,";",','.join(s for s in str(test_x.loc[i]["Item ID"].tolist()).strip('[]').split(',')))
        #print(','.join(s for s in str(test_x.loc[i]["Item ID"].tolist()).strip('[]').split(',')))
        #print([int(i) for i in test_x.loc[i]["Item ID"].tolist()])
        fout.write(str(i)+";"+','.join(s for s in str(test_x.loc[i]["Item ID"].tolist()).strip('[]').split(','))+'\n')

print("completed..!!")
fout.close()

writing the results into .dat file....
completed..!!


In [39]:
sess_cs = list(set(test_x_cs.index))
print(sess_cs[0])
print(len(sess_cs))

fout = open("solution_cs.dat", "w")
#print(sess[10:20])
print("writing the cold start results into .dat file....")
for i in sess_cs:
    #print(test_x.loc[i]["Predicted"])
    if test_x_cs.loc[i]["Predicted"].any()!= 0:
        #print(i,";",','.join(s for s in str(test_x.loc[i]["Item ID"].tolist()).strip('[]').split(',')))
        #print(','.join(s for s in str(test_x.loc[i]["Item ID"].tolist()).strip('[]').split(',')))
        #print([int(i) for i in test_x.loc[i]["Item ID"].tolist()])
        fout.write(str(i)+";"+','.join(s for s in str(test_x_cs.loc[i]["Item ID"].tolist()).strip('[]').split(','))+'\n')

print("completed..!!")
fout.close()

10977282
6567
writing the cold start results into .dat file....
completed..!!
