In [195]:
import numpy as np
import pandas as pd
import tensorflow as tf
from math import sqrt
import holidays
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import mean_absolute_error
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from sklearn.model_selection import RandomizedSearchCV
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import implicit

In [207]:
train_df = pd.read_csv('./train/train.csv', sep=',', nrows=100000)

In [208]:
train_df.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


#### renaming the columns hotel_cluster and is_booking to hotel_id and rating

In [209]:
train_df = train_df.rename(columns={'hotel_cluster': 'item_id', 'is_booking': 'rating'})

In [210]:
train_df = train_df.drop(['orig_destination_distance'],axis=1)
train_df = train_df.dropna()

In [211]:
train_df.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,rating,cnt,hotel_continent,hotel_country,hotel_market,item_id
0,2014-08-11 07:46:59,2,3,66,348,48862,12,0,1,9,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,12,0,1,9,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,12,0,0,9,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,93,0,0,3,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,93,0,0,3,...,0,1,14984,1,0,1,2,50,1457,21


#### we are listing out a new columns for america and europe to see if customers are checking in holidays

In [212]:
# Initially, we define hoilday dates for america
holidays_america = holidays.UnitedStates()
holidays_canada = holidays.Canada()

#### Setting up new columns in the train_df dataframe. 1 indicates that customers have checked on holidays and 0 indicate that customers have not checked on holidays

In [213]:
train_df['america_checkin'] = train_df['srch_ci'].apply(lambda holiday : 1 if holiday in (holidays_canada or holidays_america) else 0)
train_df['america_checkout'] = train_df['srch_co'].apply(lambda holiday : 1 if holiday in (holidays_canada or holidays_america) else 0)

In [214]:
# Now, we define hoilday dates for europe
holidays_uk = holidays.UnitedKingdom()
holidays_germany = holidays.Germany()

In [215]:
train_df['europe_checkin'] = train_df['srch_ci'].apply(lambda holiday : 1 if holiday in (holidays_uk or holidays_germany) else 0)
train_df['europe_checkout'] = train_df['srch_co'].apply(lambda holiday : 1 if holiday in (holidays_uk or holidays_germany) else 0)

In [216]:
train_df.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,...,rating,cnt,hotel_continent,hotel_country,hotel_market,item_id,america_checkin,america_checkout,europe_checkin,europe_checkout
0,2014-08-11 07:46:59,2,3,66,348,48862,12,0,1,9,...,0,3,2,50,628,1,0,0,0,0
1,2014-08-11 08:22:12,2,3,66,348,48862,12,0,1,9,...,1,1,2,50,628,1,0,0,0,0
2,2014-08-11 08:24:33,2,3,66,348,48862,12,0,0,9,...,0,1,2,50,628,1,0,0,0,0
3,2014-08-09 18:05:16,2,3,66,442,35390,93,0,0,3,...,0,1,2,50,1457,80,0,0,0,0
4,2014-08-09 18:08:18,2,3,66,442,35390,93,0,0,3,...,0,1,2,50,1457,21,0,0,0,0


#### As we do not get meaningful insights from datetime fields, we extract month from it and drop them

In [217]:
train_df['month_ci'] = pd.DatetimeIndex(train_df['srch_ci'])
train_df['month_co'] = pd.DatetimeIndex(train_df['srch_co'])
train_df['month_click'] = pd.DatetimeIndex(train_df['date_time'])

In [218]:
train_df = train_df.drop(['srch_ci', 'srch_co', 'date_time'], axis = 1)

In [219]:
train_df.shape

(99929, 27)

### Feature engineering is done where the inputs (both categorical and continuous) are fed to wide and deep neural networks

In [110]:
# All categorical attributes are put in sparse features
categotical_cols = []
for col in train_df.columns:
    if col not in ['hotel_market', 'rating']:
        categotical_cols.append(col)
sparse_features = categotical_cols

In [111]:
dense_features = ['hotel_market']
target = ['rating']

#### In the preprocessing step, we encode the sparse categorical features for embedding using Label Encoding

In [112]:
for feature in sparse_features:
    label = LabelEncoder()
    train_df[feature] = label.fit_transform(train_df[feature])

#### We use normalization like MinMaxScaler methods for dense featues as they are discretized to buckets

In [113]:
mms = MinMaxScaler(feature_range=(0,1))
train_df[dense_features] = mms.fit_transform(train_df[dense_features])

#### We use embedding techniques to turn sparse features into dense vectors. We concatenate dense features to the fully connected layer's input tensors.

In [114]:
fixlen_feature_columns = [SparseFeat(feat, train_df[feat].nunique(), embedding_dim = 4)
                          for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [115]:
train_data, test_data = train_test_split(train_df, test_size = 0.3)
train_data.shape, test_data.shape

((1048856, 24), (449510, 24))

In [116]:
train_data_input = {name : train_data[name].values for name in feature_names}
test_data_input = {name : test_data[name].values for name in feature_names}

#### Now, we apply tensorflow's WDL library and compile the model

In [None]:
model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(128, 128), l2_reg_embedding=1e-5,
            l2_reg_dnn = 0, l2_reg_linear = 1e-5, seed = 1024, dnn_activation = 'relu', task = 'binary')


model.compile("adam", "mse", metrics=['mse'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
history = model.fit(train_data_input, train_data[target].values, batch_size = 256, epochs = 10, verbose = 2, validation_split = 0.2)

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 5s - loss: 0.0839 - mean_squared_error: 0.0838 - val_loss: 0.0685 - val_mean_squared_error: 0.0682
Epoch 2/10
55960/55960 - 3s - loss: 0.0620 - mean_squared_error: 0.0613 - val_loss: 0.0716 - val_mean_squared_error: 0.0705
Epoch 3/10
55960/55960 - 3s - loss: 0.0037 - mean_squared_error: 0.0026 - val_loss: 0.0712 - val_mean_squared_error: 0.0701
Epoch 4/10
55960/55960 - 3s - loss: 0.0015 - mean_squared_error: 4.7879e-04 - val_loss: 0.0714 - val_mean_squared_error: 0.0705
Epoch 5/10
55960/55960 - 3s - loss: 9.4457e-04 - mean_squared_error: 1.5035e-04 - val_loss: 0.0728 - val_mean_squared_error: 0.0721
Epoch 6/10
55960/55960 - 3s - loss: 7.0863e-04 - mean_squared_error: 6.7340e-05 - val_loss: 0.0741 - val_mean_squared_error: 0.0736
Epoch 7/10
55960/55960 - 3s - loss: 5.8648e-04 - mean_squared_error: 5.8733e-05 - val_loss: 0.0747 - val_mean_squared_error: 0.0743
Epoch 8/10
55960/55960 - 3s - loss: 4.8544e-04 - mean_

In [None]:
pred = model.predict(test_data_input, batch_size=256)

In [117]:
def rmse_score(pred):
    return sqrt(mean_squared_error(test_data[target].values, pred))

In [None]:
rmse_val = rmse_score(pred)
print("RMSE = ", rmse_val)

RMSE =  0.28


#### We perfome hyper parameter tuning using Random Search CV 

In [None]:
rand_parameters = {'dnn_hidden_units' : [(1,1),(2,2),(4,4),(8,8),(29,29),(128,128),(256,256)],
                   'l2_reg_linear':[1e-5, 1e-3, 1e-1, 1, 10],
                   'l2_reg_embedding':[1e-7, 1e-5, 1e-3, 1e-1, 1],
                   'dnn_dropout': np.arange(0, 1, 0.2),
                   'l2_reg_dnn':[0, 0.2, 2, 4]
                  }

In [None]:
dnn_hidden_units_rmse = {}
for k in rand_parameters['dnn_hidden_units']:
    model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = k, seed = 1024, task ='binary')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)
    pred = model.predict(test_data_input, batch_size=256)
    dnn_hidden_units_rmse[k]={"RMSE": np.round(sqrt(mean_squared_error(test_data[target].values, pred)),2)}
dnn_hidden_units_rmse

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 5s - loss: 0.1045 - mean_squared_error: 0.1044 - val_loss: 0.0755 - val_mean_squared_error: 0.0754
Epoch 2/10
55960/55960 - 3s - loss: 0.0754 - mean_squared_error: 0.0752 - val_loss: 0.0736 - val_mean_squared_error: 0.0734
Epoch 3/10
55960/55960 - 3s - loss: 0.0738 - mean_squared_error: 0.0735 - val_loss: 0.0729 - val_mean_squared_error: 0.0725
Epoch 4/10
55960/55960 - 3s - loss: 0.0725 - mean_squared_error: 0.0721 - val_loss: 0.0723 - val_mean_squared_error: 0.0718
Epoch 5/10
55960/55960 - 3s - loss: 0.0711 - mean_squared_error: 0.0705 - val_loss: 0.0719 - val_mean_squared_error: 0.0712
Epoch 6/10
55960/55960 - 3s - loss: 0.0698 - mean_squared_error: 0.0690 - val_loss: 0.0717 - val_mean_squared_error: 0.0708
Epoch 7/10
55960/55960 - 3s - loss: 0.0687 - mean_squared_error: 0.0677 - val_loss: 0.0716 - val_mean_squared_error: 0.0705
Epoch 8/10
55960/55960 - 3s - loss: 0.0677 - mean_squared_error: 0.0665 - val_loss

{(1, 1): {'RMSE': 0.27},
 (2, 2): {'RMSE': 0.27},
 (4, 4): {'RMSE': 0.27},
 (8, 8): {'RMSE': 0.27},
 (29, 29): {'RMSE': 0.27},
 (128, 128): {'RMSE': 0.28},
 (256, 256): {'RMSE': 0.28}}

In [None]:
l2_reg_embedding_rmse = {}
for k in rand_parameters['l2_reg_embedding']:
    model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = (2,2), l2_reg_embedding = k, l2_reg_linear = 0.001, seed = 1024, task ='binary')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)
    pred = model.predict(test_data_input, batch_size=256)
    l2_reg_embedding_rmse[k]={"RMSE": np.round(sqrt(mean_squared_error(test_data[target].values, pred)),4)}
l2_reg_embedding_rmse

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 8s - loss: 0.1068 - mean_squared_error: 0.1044 - val_loss: 0.0788 - val_mean_squared_error: 0.0761
Epoch 2/10
55960/55960 - 4s - loss: 0.0787 - mean_squared_error: 0.0766 - val_loss: 0.0759 - val_mean_squared_error: 0.0742
Epoch 3/10
55960/55960 - 4s - loss: 0.0773 - mean_squared_error: 0.0757 - val_loss: 0.0751 - val_mean_squared_error: 0.0737
Epoch 4/10
55960/55960 - 4s - loss: 0.0766 - mean_squared_error: 0.0753 - val_loss: 0.0747 - val_mean_squared_error: 0.0733
Epoch 5/10
55960/55960 - 4s - loss: 0.0762 - mean_squared_error: 0.0749 - val_loss: 0.0744 - val_mean_squared_error: 0.0730
Epoch 6/10
55960/55960 - 4s - loss: 0.0759 - mean_squared_error: 0.0746 - val_loss: 0.0741 - val_mean_squared_error: 0.0727
Epoch 7/10
55960/55960 - 4s - loss: 0.0757 - mean_squared_error: 0.0743 - val_loss: 0.0739 - val_mean_squared_error: 0.0725
Epoch 8/10
55960/55960 - 4s - loss: 0.0756 - mean_squared_error: 0.0740 - val_loss

{1e-07: {'RMSE': 0.27},
 1e-05: {'RMSE': 0.2701},
 0.001: {'RMSE': 0.2701},
 0.1: {'RMSE': 0.2686},
 1: {'RMSE': 0.2701}}

In [None]:
l2_reg_linear_rmse = {}
for k in rand_parameters['l2_reg_linear']:
    model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = (2,2), l2_reg_linear = k, l2_reg_embedding = 0.1, seed = 1024, task ='binary')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)
    pred = model.predict(test_data_input, batch_size=256)
    l2_reg_linear_rmse[k]={"RMSE": np.round(sqrt(mean_squared_error(test_data[target].values, pred)),2)}
l2_reg_embedding_rmse

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 13s - loss: 0.1034 - mean_squared_error: 0.1028 - val_loss: 0.0748 - val_mean_squared_error: 0.0746
Epoch 2/10
55960/55960 - 5s - loss: 0.0753 - mean_squared_error: 0.0750 - val_loss: 0.0736 - val_mean_squared_error: 0.0733
Epoch 3/10
55960/55960 - 4s - loss: 0.0739 - mean_squared_error: 0.0735 - val_loss: 0.0729 - val_mean_squared_error: 0.0725
Epoch 4/10
55960/55960 - 5s - loss: 0.0725 - mean_squared_error: 0.0720 - val_loss: 0.0723 - val_mean_squared_error: 0.0717
Epoch 5/10
55960/55960 - 5s - loss: 0.0711 - mean_squared_error: 0.0703 - val_loss: 0.0720 - val_mean_squared_error: 0.0710
Epoch 6/10
55960/55960 - 4s - loss: 0.0699 - mean_squared_error: 0.0687 - val_loss: 0.0718 - val_mean_squared_error: 0.0705
Epoch 7/10
55960/55960 - 4s - loss: 0.0687 - mean_squared_error: 0.0673 - val_loss: 0.0716 - val_mean_squared_error: 0.0701
Epoch 8/10
55960/55960 - 4s - loss: 0.0679 - mean_squared_error: 0.0660 - val_los

{1e-07: {'RMSE': 0.27},
 1e-05: {'RMSE': 0.2701},
 0.001: {'RMSE': 0.2701},
 0.1: {'RMSE': 0.2686},
 1: {'RMSE': 0.2701}}

In [None]:
l2_reg_dnn_rmse = {}
for k in rand_parameters['l2_reg_dnn']:
    model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = (2,2), l2_reg_dnn = k, l2_reg_embedding = 0.1, l2_reg_linear = 0.1, seed = 1024, task ='binary')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)
    pred = model.predict(test_data_input, batch_size=256)
    l2_reg_dnn_rmse[k]={"RMSE": np.round(sqrt(mean_squared_error(test_data[target].values, pred)),2)}
l2_reg_dnn_rmse

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 13s - loss: 0.1257 - mean_squared_error: 0.1128 - val_loss: 0.0938 - val_mean_squared_error: 0.0796
Epoch 2/10
55960/55960 - 5s - loss: 0.0879 - mean_squared_error: 0.0785 - val_loss: 0.0804 - val_mean_squared_error: 0.0750
Epoch 3/10
55960/55960 - 5s - loss: 0.0797 - mean_squared_error: 0.0764 - val_loss: 0.0762 - val_mean_squared_error: 0.0739
Epoch 4/10
55960/55960 - 5s - loss: 0.0774 - mean_squared_error: 0.0756 - val_loss: 0.0751 - val_mean_squared_error: 0.0734
Epoch 5/10
55960/55960 - 5s - loss: 0.0767 - mean_squared_error: 0.0749 - val_loss: 0.0747 - val_mean_squared_error: 0.0729
Epoch 6/10
55960/55960 - 5s - loss: 0.0765 - mean_squared_error: 0.0744 - val_loss: 0.0748 - val_mean_squared_error: 0.0724
Epoch 7/10
55960/55960 - 5s - loss: 0.0764 - mean_squared_error: 0.0740 - val_loss: 0.0749 - val_mean_squared_error: 0.0720
Epoch 8/10
55960/55960 - 5s - loss: 0.0766 - mean_squared_error: 0.0738 - val_los

{0: {'RMSE': 0.27}, 0.2: {'RMSE': 0.27}, 2: {'RMSE': 0.27}, 4: {'RMSE': 0.27}}

In [None]:
dnn_dropout_rmse = {}
for k in rand_parameters['dnn_dropout']:
    model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = (2,2), dnn_dropout = k, dnn_activation = 'relu', seed = 1024, task ='binary')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)
    pred = model.predict(test_data_input, batch_size=256)
    dnn_dropout_rmse[k]={"RMSE": np.round(sqrt(mean_squared_error(test_data[target].values, pred)),4)}
dnn_dropout_rmse

Train on 55960 samples, validate on 13990 samples
Epoch 1/10
55960/55960 - 14s - loss: 0.1036 - mean_squared_error: 0.1035 - val_loss: 0.0753 - val_mean_squared_error: 0.0752
Epoch 2/10
55960/55960 - 6s - loss: 0.0753 - mean_squared_error: 0.0751 - val_loss: 0.0735 - val_mean_squared_error: 0.0733
Epoch 3/10
55960/55960 - 6s - loss: 0.0738 - mean_squared_error: 0.0736 - val_loss: 0.0728 - val_mean_squared_error: 0.0725
Epoch 4/10
55960/55960 - 6s - loss: 0.0724 - mean_squared_error: 0.0720 - val_loss: 0.0723 - val_mean_squared_error: 0.0718
Epoch 5/10
55960/55960 - 5s - loss: 0.0710 - mean_squared_error: 0.0705 - val_loss: 0.0719 - val_mean_squared_error: 0.0712
Epoch 6/10
55960/55960 - 6s - loss: 0.0698 - mean_squared_error: 0.0690 - val_loss: 0.0716 - val_mean_squared_error: 0.0707
Epoch 7/10
55960/55960 - 6s - loss: 0.0686 - mean_squared_error: 0.0676 - val_loss: 0.0716 - val_mean_squared_error: 0.0705
Epoch 8/10
55960/55960 - 6s - loss: 0.0677 - mean_squared_error: 0.0664 - val_los

{0.0: {'RMSE': 0.266},
 0.2: {'RMSE': 0.2661},
 0.4: {'RMSE': 0.266},
 0.6000000000000001: {'RMSE': 0.2661},
 0.8: {'RMSE': 0.2657}}

#### Applying the best parameters to the Wide and Deep Learning Model

In [118]:
model = WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units = (2,2), dnn_dropout = 0.8, dnn_activation = 'relu', 
            l2_reg_embedding = 0.1, l2_reg_linear = 0.1, l2_reg_dnn = 0, seed = 1024, task ='binary')
model.compile("adam", "mse", metrics=['mse'])
history = model.fit(train_data_input, train_data[target].values, batch_size=256, validation_split=0.2, epochs=10, verbose=2)

Train on 839084 samples, validate on 209772 samples
Epoch 1/10
839084/839084 - 36s - loss: 0.0914 - mean_squared_error: 0.0826 - val_loss: 0.0774 - val_mean_squared_error: 0.0761
Epoch 2/10
839084/839084 - 34s - loss: 0.0757 - mean_squared_error: 0.0753 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 3/10
839084/839084 - 34s - loss: 0.0752 - mean_squared_error: 0.0750 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 4/10
839084/839084 - 34s - loss: 0.0752 - mean_squared_error: 0.0750 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 5/10
839084/839084 - 34s - loss: 0.0752 - mean_squared_error: 0.0750 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 6/10
839084/839084 - 34s - loss: 0.0752 - mean_squared_error: 0.0750 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 7/10
839084/839084 - 33s - loss: 0.0752 - mean_squared_error: 0.0750 - val_loss: 0.0750 - val_mean_squared_error: 0.0748
Epoch 8/10
839084/839084 - 34s - loss: 0.0752 - mean_square

In [119]:
pred = model.predict(test_data_input, batch_size=256)

In [120]:
rmse_val = rmse_score(pred)
print("RMSE = ", rmse_val)

RMSE =  0.2748952325282919


In [164]:
df = test_data[['user_id', 'item_id', 'rating']]

In [165]:
df.head()

Unnamed: 0,user_id,item_id,rating
792166,8709,9,0
1471463,44743,98,0
1459956,43144,55,0
783062,7778,77,1
997079,35597,6,0


In [166]:
df['rating'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [167]:
df

Unnamed: 0,user_id,item_id,rating
792166,8709,9,0.082812
1471463,44743,98,0.081680
1459956,43144,55,0.083414
783062,7778,77,0.083016
997079,35597,6,0.082072
...,...,...,...
969444,31853,48,0.085057
1153569,5849,22,0.082337
182059,26626,56,0.082610
130124,19854,9,0.083331


In [168]:
df.describe()

Unnamed: 0,user_id,item_id,rating
count,449510.0,449510.0,449510.0
mean,24652.704242,50.008652,0.082657
std,14143.84915,28.953618,0.001215
min,1.0,0.0,0.078419
25%,12289.0,25.0,0.081844
50%,24769.0,50.0,0.08274
75%,37104.0,73.0,0.083546
max,48725.0,99.0,0.085685


In [158]:
sparse_item_user = csr_matrix((df['rating'].astype(float),(df['item_id'], df['user_id'])))
sparse_user_item = csr_matrix((df['rating'].astype(float),(df['user_id'], df['item_id'])))


model = implicit.als.AlternatingLeastSquares(factors=20,regularization=0.1,iterations=20)
alpha_val = 15
data_conf = (sparse_user_item * alpha_val).astype('double')
model.fit(data_conf)

  0%|          | 0/20 [00:00<?, ?it/s]

In [127]:
item_ids = [i for i in range(100)]

In [135]:
recommeded_clusters = pd.DataFrame(columns = ['user_id', '1st_recommendation', '2nd_recommendation', '3rd_recommendation', '4th_recommendation', '5th_recommendation'])

In [183]:
recomm_ids = {}
for i, id in enumerate(df['user_id'].unique()):
  recomm_ids[id] = model.rank_items(id, sparse_user_item, item_ids)[0][:5]

  This is separate from the ipykernel package so we can avoid doing imports until


In [186]:
recomm_list = []
for id in recomm_ids:
  recomm_list.append(recomm_ids[id])

In [191]:
recomm_clusters = pd.DataFrame(list(map(np.ravel, recomm_list)))

In [193]:
recomm_clusters = recomm_clusters.rename(columns = {0 : '1st_recommendation', 1 : '2nd_recommendation', 2 : '3rd_recommendation', 3 : '4th_recommendation', 4 : '5th_recommendation',})

In [194]:
recomm_clusters

Unnamed: 0,1st_recommendation,2nd_recommendation,3rd_recommendation,4th_recommendation,5th_recommendation
0,41,96,65,0,26
1,41,25,98,37,70
2,5,99,37,55,10
3,42,91,48,59,16
4,5,16,6,40,76
...,...,...,...,...,...
45091,72,33,28,4,95
45092,65,52,87,66,31
45093,70,98,56,21,55
45094,61,62,57,81,30
