In [1]:
import implicit
import surprise

In [2]:
# !export MKL_NUM_THREADS=1

In [3]:
als = implicit.als.AlternatingLeastSquares(factors=32, use_gpu=False, iterations=50)
svd = surprise.SVD(n_factors=20)



In [4]:
data_path = '/home/subhasis/Dataset/Analytics Vidya Contests/Black Friday/train.csv'

In [5]:
data = pd.read_csv(data_path)

In [6]:
data.shape

(550068, 12)

In [7]:
data.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [8]:
X = data[['User_ID','Product_ID','Purchase']]

In [9]:
# X.loc[:,'User_ID'] = X.loc[:,'User_ID'].astype('category')
# X.loc[:,'Product_ID'] = X.loc[:,'Product_ID'].astype('category')

In [10]:
np.random.seed(0)
random_index = np.array(range(data.shape[0]))
np.random.shuffle(random_index)

In [11]:
random_index

array([ 41001,  16151, 507262, ..., 117952, 435829, 305711])

In [12]:
train_index = random_index[:400000]
test_index = random_index[400000:]

In [13]:
train = X.iloc[train_index,:]
test = X.iloc[test_index,:]

In [14]:
train.loc[:,'Purchase'] = train.loc[:,'Purchase']/1000
test.loc[:,'Purchase'] = test.loc[:,'Purchase']/1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


# METHOD 1 (SVD based)

In [15]:
r = surprise.Reader(rating_scale=(0,24))
D = surprise.dataset.Dataset.load_from_df(train, reader=r)

In [16]:
surprise.model_selection.cross_validate(svd,D,cv=5)

{'test_rmse': array([2.71296779, 2.72183281, 2.71114141, 2.7203522 , 2.69693933]),
 'test_mae': array([1.9528483 , 1.96358108, 1.95045842, 1.9597839 , 1.94810775]),
 'fit_time': (9.421171188354492,
  9.640877962112427,
  9.584179401397705,
  9.792511224746704,
  9.554441452026367),
 'test_time': (0.8469557762145996,
  0.8154988288879395,
  0.7159903049468994,
  0.7586517333984375,
  0.7120351791381836)}

In [17]:
d1=D.build_full_trainset()
svd.fit(d1)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f68c54bbf10>

In [18]:
pred_test = []
for u,i in zip(test.User_ID,test.Product_ID):
    pred = svd.predict(uid=u,iid=i)[3]
    pred = pred
    pred_test.append(pred)

In [19]:
len(test.Purchase)

150068

In [20]:
np.sqrt(np.mean((test.Purchase - np.array(pred_test))**2))

2.6732013615575108

In [29]:
pred_test[:5]

[11.175780044444748,
 6.802796259992419,
 5.122089318551225,
 11.730305244877513,
 8.423767721966986]

In [30]:
test.head()

Unnamed: 0,User_ID,Product_ID,Purchase
339574,1004258,P00190042,10.86
435979,1001137,P00150142,9.819
77715,1005975,P00188042,1.755
354651,1000664,P00111542,15.921
40769,1000271,P00115642,9.82


# METHOD 2 (ALS)

In [21]:
import scipy.sparse as sparse

In [22]:
user_dict = {u:i for u, i in zip(sorted(X.User_ID),range(len(X)))}
product_dict = {p:j for p, j in zip(sorted(X.Product_ID),range(len(X)))}

In [23]:
X_new = X.copy()
X_new.loc[:,'User_ID'] = X_new.loc[:,'User_ID'].map(user_dict)
X_new.loc[:,'Product_ID'] = X_new.loc[:,'Product_ID'].map(product_dict)

In [24]:
train_new = X_new.iloc[train_index,:]
test_new = X_new.iloc[test_index,:]
train_new.loc[:,'Purchase'] = train_new.loc[:,'Purchase']/1000
test_new.loc[:,'Purchase'] = test_new.loc[:,'Purchase']/1000

In [25]:
train_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400000 entries, 41001 to 290446
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   User_ID     400000 non-null  int64  
 1   Product_ID  400000 non-null  int64  
 2   Purchase    400000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 12.2 MB


In [26]:
item_user_sparse = sparse.csr_matrix((train_new['Purchase'],(train_new['Product_ID'],train_new['User_ID'])))
user_item_sparse = sparse.csr_matrix((train_new['Purchase'],(train_new['User_ID'],train_new['Product_ID'])))

In [27]:
als.fit(item_user_sparse)

  0%|          | 0/50 [00:00<?, ?it/s]

In [28]:
als.recommend(user_dict[1000100],user_item_sparse)

[(162972, 1.0758896),
 (153953, 0.9753461),
 (377532, 0.9154431),
 (210052, 0.90681964),
 (205797, 0.86762184),
 (106855, 0.8192686),
 (308418, 0.8099663),
 (71355, 0.7901915),
 (48114, 0.78624856),
 (210547, 0.77419865)]