In [1]:
import warnings

# Silence every warning for the rest of the kernel session. No category is
# given, so this also hides deprecation warnings raised by the libraries
# used in later cells.
warnings.filterwarnings("ignore")

In [2]:
import tensorflow as tf
import pandas as pd
from collections import Counter
from tffm import TFFMClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
import os
from tensorflow.python.framework import ops

In [3]:
# Keep only the paths here: pd.read_csv accepts a path and opens/closes the
# file itself, so no file handle is left open for the rest of the session.
buys = 'yoochoose-buys.dat'
clicks = 'yoochoose-clicks.dat'

In [4]:
print("Reading datasets...")

# 'Session ID' is an identifier, so it is parsed as an integer rather than
# float32: float32 only represents integers exactly up to 2**24 and would
# make session ids render as e.g. "123.0" when they are written out later.
# int64 matches what pandas infers for the same key in the clicks file.
# NOTE(review): the public YooChoose buys file is usually documented with
# Price as its 4th column; confirm that 'Category' is the intended name here.
initial_buys_df = pd.read_csv(
    buys,
    names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
    dtype={'Session ID': 'int64', 'Timestamp': 'str', 'Item ID': 'int32',
           'Category': 'str'},
)

# 'Category' is read as a string so that get_dummies one-hot encodes it later.
initial_clicks_df = pd.read_csv(
    clicks,
    names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
    dtype={'Item ID': 'int32', 'Category': 'str'},
)

print("Preprocessing data..")


Reading datasets...
Preprocessing data..


In [5]:
# Index both frames by session so later filtering, grouping and merging can
# all work on the index.
initial_buys_df = initial_buys_df.set_index('Session ID')
initial_clicks_df = initial_clicks_df.set_index('Session ID')

In [6]:
# Timestamps are not used as features. `axis` is passed by keyword because
# the positional form was deprecated in pandas 1.3 and removed in 2.0.
initial_buys_df = initial_buys_df.drop('Timestamp', axis=1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', axis=1)

In [7]:
# Restrict both frames to the 10,000 sessions with the most rows in the buys
# data; this bounds the size of the one-hot / history matrices built later.
x = Counter(initial_buys_df.index).most_common(10000)
top_k = [session_id for session_id, _row_count in x]

buys_in_top_k = initial_buys_df.index.isin(top_k)
clicks_in_top_k = initial_clicks_df.index.isin(top_k)
initial_buys_df = initial_buys_df[buys_in_top_k]
initial_clicks_df = initial_clicks_df[clicks_in_top_k]

In [8]:
# Expose the session id (currently the index) as a regular column as well.
initial_buys_df = initial_buys_df.assign(**{'_Session ID': initial_buys_df.index})


In [9]:
# One-hot encode both frames with pandas' defaults. 'Category' was read as a
# string, so it is expanded into indicator columns; 'Item ID' was read as
# int32 and is therefore left as a single numeric column.
transformed_buys, transformed_clicks = (
    pd.get_dummies(frame) for frame in (initial_buys_df, initial_clicks_df)
)

In [10]:
# Keep only the item / category columns of each encoded frame ...
filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")

# ... then sum them per session (the index) to obtain one "history" row per
# session, and prefix the column names so the buy and click histories stay
# distinguishable after they are merged back onto the buy rows.
historical_buy_data = (
    filtered_buys
    .groupby(filtered_buys.index)
    .sum()
    .add_prefix('buy history:')
)
historical_click_data = (
    filtered_clicks
    .groupby(filtered_clicks.index)
    .sum()
    .add_prefix('click history:')
)

In [11]:
# Attach the per-session buy history, then the per-session click history, to
# every encoded buy row. Both merges join on the session index with the
# default join type.
merged1 = transformed_buys.merge(historical_buy_data, left_index=True, right_index=True)
merged2 = merged1.merge(historical_click_data, left_index=True, right_index=True)

In [12]:
# Label source: the purchased quantity of each row. `.values` replaces
# `Series.as_matrix()`, which was removed in pandas 1.0. np.array makes a
# copy, so binarising `y` later does not modify `merged2`.
y = np.array(merged2['Quantity'].values)

In [13]:
# Binarise the labels in place: any non-zero quantity becomes 1, zeros stay 0.
# A boolean-mask assignment gives the same result as the per-element loop
# without iterating over the array in Python.
y[y != 0] = 1

In [14]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric reported below, reproducible across
# kernel restarts.
X_tr, X_te, y_tr, y_te = train_test_split(merged2, y, test_size=0.2, random_state=12345)

In [15]:
# Split the held-out rows in half: a regular test set and a "cold start" test
# set. A fixed random_state keeps the two halves stable between runs.
# Note: this cell rebinds X_te / y_te, so re-running it on its own halves the
# test set again; re-run the previous cell first.
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5, random_state=12345)

In [16]:
# Keep the item id of every test row (still indexed by session) so the
# predictions can be mapped back to items after 'Item ID' is dropped from the
# feature frames.
test_x = X_te.loc[:, ['Item ID']].copy()
test_x_cs = X_te_cs.loc[:, ['Item ID']].copy()

In [17]:
# Columns that must not be fed to the model: raw ids, the summed item-id
# history columns, and 'Quantity' (the label source).
non_feature_columns = ['Item ID', '_Session ID', 'click history:Item ID',
                       'buy history:Item ID', 'Quantity']

# Reassign instead of dropping in place: the frames come out of
# train_test_split, and `axis` must be given by keyword (the positional form
# was removed in pandas 2.0).
X_tr = X_tr.drop(non_feature_columns, axis=1)
X_te = X_te.drop(non_feature_columns, axis=1)
X_te_cs = X_te_cs.drop(non_feature_columns, axis=1)

In [18]:
# Convert the three feature frames to plain NumPy arrays for the model.
ax_tr, ax_te, ax_te_cs = (np.array(frame) for frame in (X_tr, X_te, X_te_cs))

In [19]:
# Replace NaN / infinite entries with finite numbers (np.nan_to_num defaults)
# in all three feature matrices.
ax_tr, ax_te, ax_te_cs = map(np.nan_to_num, (ax_tr, ax_te, ax_te_cs))

In [20]:
# Hyper-parameters for the factorization-machine classifier, collected in one
# place so they are easy to read and tweak.
fm_params = dict(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=100,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',   # the features are passed as dense NumPy arrays
    log_dir='logs/',
    verbose=1,
    seed=12345,
)
model = TFFMClassifier(**fm_params)

In [21]:
# Rebuild a labelled frame from the cold-start feature matrix, reusing the
# training column names, so columns can be selected by name in the next cell.
cold_start = pd.DataFrame(data=ax_te_cs, columns=X_tr.columns)

In [22]:
# Zero out buy/click history columns that are not category-based.
# NOTE(review): the 'buy history:Item ID' and 'click history:Item ID' columns
# are dropped from the feature frames earlier, so this predicate appears to
# match no remaining column; confirm the intended cold-start masking.
for column in cold_start.columns:
    if 'Category' in column:
        continue
    if 'buy' in column or 'click' in column:
        cold_start[column] = 0

In [23]:
# Train the classifier on the training matrix and binary labels.
# show_progress=True requests a progress bar (the recorded output below shows
# one over the 100 epochs).
model.fit(ax_tr, y_tr, show_progress=True)

Initialize logs, use: 
tensorboard --logdir=C:\Users\Test.PPMUMCPU0034\Desktop\Predictive Analytics\Section 10\logs


100%|████████████████████████████████████| 100/100 [08:02<00:00,  4.82s/epoch]


In [24]:
# Regular test set.
predictions = model.predict(ax_te)
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))

# Cold-start test set: predict on the `cold_start` frame prepared above.
# Previously the unmodified `ax_te_cs` matrix was scored, so the masking step
# had no effect on this number. The cast keeps the dtype identical to the
# matrices the model has already seen.
cold_start_features = cold_start.values.astype(ax_te_cs.dtype)
cold_start_predictions = model.predict(cold_start_features)
print('Cold-start accuracy: {}'.format(accuracy_score(y_te_cs, cold_start_predictions)))

# NOTE(review): the recorded run reports an accuracy of exactly 1.0 on both
# sets; verify that none of the remaining feature columns leaks the label.

accuracy: 1.0
Cold-start accuracy: 1.0


In [25]:
# Attach the predicted labels to the (session, item) frames kept aside earlier.
test_x = test_x.assign(Predicted=predictions)
test_x_cs = test_x_cs.assign(Predicted=cold_start_predictions)

In [26]:
# Write one line per session that has at least one positive prediction, in
# the form "<session id>;<item>,<item>,...".
# - `with` closes the file even if a write fails.
# - groupby visits each session once instead of doing two `.loc` lookups per
#   session, and always yields a frame, so single-row sessions need no
#   special handling. Sessions are written in sorted order.
# - items are joined with a bare "," (the old strip/split/join round-trip
#   left ", " separators in the output).
# As before, every test item of a qualifying session is written, not only the
# items that were themselves predicted positive.
print("writing the results into .dat file....")
with open("solution.dat", "w") as fout:
    for session_id, session_rows in test_x.groupby(level=0):
        if session_rows["Predicted"].any():
            item_ids = ",".join(str(item) for item in session_rows["Item ID"].tolist())
            fout.write("{};{}\n".format(session_id, item_ids))

writing the results into .dat file....


In [27]:
# Cold-start counterpart of the solution file: one line per session that has
# at least one positive prediction, in the form
# "<session id>;<item>,<item>,...".
# - `with` closes the file even if a write fails.
# - groupby visits each session once and always yields a frame, so single-row
#   sessions need no special handling. Sessions are written in sorted order.
# - items are joined with a bare "," (the old strip/split/join round-trip
#   left ", " separators in the output).
# As before, every test item of a qualifying session is written, not only the
# items that were themselves predicted positive.
print("writing the cold start results into .dat file....")
with open("solution_cs.dat", "w") as fout:
    for session_id, session_rows in test_x_cs.groupby(level=0):
        if session_rows["Predicted"].any():
            item_ids = ",".join(str(item) for item in session_rows["Item ID"].tolist())
            fout.write("{};{}\n".format(session_id, item_ids))

print("completed..!!")

writing the cold start results into .dat file....
completed..!!
