In [7]:
import numpy as np  
import pandas as pd
import scipy.sparse as sparse

from scipy.sparse.linalg import spsolve
customer_list1 = pd.read_csv("E:\\Business analytics with R\\train.csv") 
customer_list = pd.read_csv("E:\\Business analytics with R\\test.csv")
customer_list1.head(2)  

Unnamed: 0,CustomerID,InvoiceNo,Quantity,InvoiceDate,UnitPrice,Country,StockCode
0,27270,27270,7,1/12/2010 8:26,2.55,PX,85123AY
1,27270,27270,7,1/12/2010 8:26,3.39,PX,71053R


In [5]:
customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330575 entries, 0 to 330574
Data columns (total 7 columns):
CustomerID     330575 non-null int64
InvoiceNo      330575 non-null int64
Quantity       330575 non-null int64
InvoiceDate    330575 non-null object
UnitPrice      330575 non-null float64
Country        330575 non-null object
StockCode      330575 non-null object
dtypes: float64(1), int64(3), object(3)
memory usage: 17.7+ MB


In [8]:
customer_list.head(2)

Unnamed: 0,Country,CustomerID,InvoiceDate,InvoiceNo,Quantity,StockCode,UnitPrice
0,PX,127269,01/12/10 8:28,127269,7,22633V,1.85
1,PX,227268,01/12/10 8:34,227268,38,84879M,1.69


In [9]:
item_lookup = customer_list[['StockCode', 'UnitPrice']].drop_duplicates() # unique item/UnitPrice pairs
item_lookup['StockCode'] = item_lookup.StockCode.astype(str) # Encode as strings

In [60]:
item_lookup.head()

Unnamed: 0,StockCode,UnitPrice
0,22633V,1.85
1,84879M,1.69
2,22748P,2.1
3,22749K,3.75
4,22622G,9.95


In [10]:
customer_list['CustomerID'] = customer_list.CustomerID.astype(int)
customer_list = customer_list[['StockCode', 'Quantity', 'CustomerID']]
grouped_customerlist = customer_list.groupby(['CustomerID', 'StockCode']).sum().reset_index() 
grouped_customerlist.Quantity.loc[grouped_customerlist.Quantity == 0] = 1 # indicate purchased
grouped_purchased = grouped_customerlist.query('Quantity > 0') # show customers whose purchase totals were positive

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
grouped_purchased.head()

Unnamed: 0,CustomerID,StockCode,Quantity
0,1890,21080R,4
1,1890,21094A,13
2,1890,21770E,2
3,1890,22366J,2
4,1890,22413V,7


In [12]:
customers = list(np.sort(customer_list.CustomerID.unique()))
products = list(customer_list.StockCode.unique())
quantity = list(customer_list.Quantity)

rows = customer_list.CustomerID.astype('category', categories = customers).cat.codes 
cols = customer_list.StockCode.astype('category', categories = products).cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
purchases_sparse

<628x3522 sparse matrix of type '<class 'numpy.int32'>'
	with 85260 stored elements in Compressed Sparse Row format>

In [14]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # possible interactions
num_purchases = len(purchases_sparse.nonzero()[0]) #  no items interacted 
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

96.14787125149651

In [15]:
import random

In [16]:
def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() 
    test_set[test_set != 0] = 1 # binary preference matrix
    training_set = ratings.copy() 
    nonzero_inds = training_set.nonzero() # interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # convert user,item index into list
    random.seed(0) 
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) #roundoff
    samples = random.sample(nonzero_pairs, num_samples) # random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # user row indices
    item_inds = [index[1] for index in samples] # item column indices
    training_set[user_inds, item_inds] = 0 #randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array
    return training_set, test_set, list(set(user_inds)) # unique list of user rows 

In [17]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [158]:
def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
    conf = (alpha*training_set) 
    num_user = conf.shape[0]
    num_item = conf.shape[1] 
    rstate = np.random.RandomState(seed)
    X = sparse.csr_matrix(rstate.normal(size = (num_user, rank_size))) # users in a m x rank shape
    Y = sparse.csr_matrix(rstate.normal(size = (num_item, rank_size))) 
    X_eye = sparse.eye(num_user) 
    Y_eye = sparse.eye(num_item)
    lambda_eye = lambda_val * sparse.eye(rank_size) 
    for iter_step in range(iterations): #iterations
        #tuning
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)
        #fixed y
        for u in range(num_user):
            conf_samp = conf[u,:].toarray() 
            pref = conf_samp.copy() 
            pref[pref != 0] = 1 # binarized preference vector 
            CuI = sparse.diags(conf_samp, [0])  #Ci - I term
            yTCuIY = Y.T.dot(CuI).dot(Y) # yT(Cu-I)Y term 
            yTCupu = Y.T.dot(CuI + Y_eye).dot(pref.T) #yTCuPu term
            X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu) #minimization
        #fixed X 
        for i in range(num_item):
            conf_samp = conf[:,i].T.toarray() #  row format
            pref = conf_samp.copy()
            pref[pref != 0] = 1 
            CiI = sparse.diags(conf_samp, [0]) 
            xTCiIX = X.T.dot(CiI).dot(X) 
            xTCiPi = X.T.dot(CiI + X_eye).dot(pref.T) 
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
        return X, Y.T 

In [159]:
user_vecs, item_vecs = implicit_weighted_ALS(product_train, lambda_val = 0.1, alpha = 15, iterations = 1,
                                            rank_size = 20)

In [163]:
user_vecs[0,:].dot(item_vecs).toarray()[0,:5]

array([-0.05326089,  0.06395472, -0.01473672, -0.01574714,  0.06518385])

In [164]:
import implicit

In [165]:
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          factors=20, 
                                                           regularization = 0.1, 
                                                         iterations = 50)

100%|████████████████████████████████████████████████████████████████████████████████| 50.0/50 [00:00<00:00, 78.69it/s]


In [166]:
from sklearn import metrics

In [167]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)   

In [168]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    store_auc = [] 
    popularity_auc = [] 
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for user in altered_users: 
        training_row = training_set[user,:].toarray().reshape(-1) 
        zero_inds = np.where(training_row == 0) #check for interactions
        # predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        # items that were originally zero
        # Selecting for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] #  popularity for our chosen items
        store_auc.append(auc_score(pred, actual)) 
        popularity_auc.append(auc_score(pop, actual))

    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  
   

In [169]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)

(0.72, 0.803)

In [170]:
customers_arr = np.array(customers) 
products_arr = np.array(products) 

In [171]:
def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    cust_ind = np.where(customers_list == customer_id)[0][0] #index row of cust id
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # index of purchased items
    prod_codes = products_list[purchased_ind] #stock codes for our purchased items
    return item_lookup.loc[item_lookup.StockCode.isin(prod_codes)]

In [172]:
customers_arr[:5]

array([1890, 2700, 3600, 3690, 4500], dtype=int64)

In [30]:
get_items_purchased(3600, product_train, customers_arr, products_arr, item_lookup).head()

Unnamed: 0,StockCode,UnitPrice
0,22633V,1.85
19,22726C,3.75
51,22114V,3.95
57,48185J,7.95
63,22111I,4.95


In [31]:
from sklearn.preprocessing import MinMaxScaler

In [137]:
def recommend_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() #rating from train set
    pref_vec = pref_vec.reshape(-1) + 1 #to make every item positive
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) #dot product
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]  
    recommend_vector = pref_vec*rec_vector_scaled #scaling
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] #sorting in best of rec
    rec_list = [] 
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.UnitPrice.loc[item_lookup.StockCode == code].iloc[0]]) 
        rec_list.append([code, item_lookup.UnitPrice.loc[item_lookup.StockCode == code].iloc[0]]) 
    codes = [item[0] for item in rec_list]
    UnitPrice1 = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'StockCode': codes, 'UnitPrice': UnitPrice1}) 
    return final_frame[['StockCode', 'UnitPrice']] 


In [173]:
recommend_items(129069
, product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 30)

Unnamed: 0,StockCode,UnitPrice
0,22355U,1.66
1,21746N,2.51
2,22086O,5.91
3,84077K,0.29
4,20724F,1.66
5,84945I,0.85
6,22791E,1.25
7,23232F,0.42
8,84347V,2.55
9,21967J,0.29


In [148]:
#def predict(customer_list):
Customers = list(np.sort(customer_list.CustomerID.unique()))
final_frame1 = pd.DataFrame({'Customer': Customers})
for c in final_frame1.loc[:,'Customer']:
    print (recommend_items(c,product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 1),c)

  StockCode  UnitPrice
0    22851J       1.66 1890
  StockCode  UnitPrice
0    23205F       0.85 2700
  StockCode  UnitPrice
0    48194E      14.43 3600
  StockCode  UnitPrice
0   85099BJ       1.65 3690
  StockCode  UnitPrice
0    22536B       0.85 4500
  StockCode  UnitPrice
0    22188M       3.95 5490
  StockCode  UnitPrice
0    21733F       2.95 8190
  StockCode  UnitPrice
0    23300F       1.65 9090
  StockCode  UnitPrice
0    20972O       1.25 13680
  StockCode  UnitPrice
0    21135E       1.69 14490
  StockCode  UnitPrice
0    23307D       0.55 14580
  StockCode  UnitPrice
0    23208E       1.65 15390
  StockCode  UnitPrice
0    21122K       1.06 18090
  StockCode  UnitPrice
0    22151L       0.42 18180
  StockCode  UnitPrice
0    22693N       1.25 19980
  StockCode  UnitPrice
0    21175V        2.1 22770
  StockCode  UnitPrice
0    22492E       1.66 23580
  StockCode  UnitPrice
0    16259V       0.21 24480
  StockCode  UnitPrice
0    22178V       2.51 24570
  StockCode  UnitPri

0    84755J       1.27 144549
  StockCode  UnitPrice
0    23355Z       4.15 145359
  StockCode  UnitPrice
0   16161PA       0.42 145449
  StockCode  UnitPrice
0    21977R       0.42 145458
  StockCode  UnitPrice
0    22386V       1.95 146349
  StockCode  UnitPrice
0    22178V       2.51 146358
  StockCode  UnitPrice
0    22616B       0.29 146448
  StockCode  UnitPrice
0    22197J       0.85 147258
  StockCode  UnitPrice
0    22616B       0.29 148059
  StockCode  UnitPrice
0    37413U       0.39 148149
  StockCode  UnitPrice
0    22102P       1.65 149049
  StockCode  UnitPrice
0    22197J       0.85 150948
  StockCode  UnitPrice
0    22178V       2.51 151749
  StockCode  UnitPrice
0    84568Z       0.21 151848
  StockCode  UnitPrice
0   85049EP       1.25 152739
  StockCode  UnitPrice
0    15036Z       0.75 152748
  StockCode  UnitPrice
0   90058BX       0.38 152838
  StockCode  UnitPrice
0    22952P       0.55 153549
  StockCode  UnitPrice
0    22178V       2.51 153648
  StockCode  Uni

0    84946F       2.51 276228
  StockCode  UnitPrice
0    22791E       1.25 276327
  StockCode  UnitPrice
0    23084M       2.08 276417
  StockCode  UnitPrice
0    23334M       0.63 277128
  StockCode  UnitPrice
0    20975Y       1.28 277218
  StockCode  UnitPrice
0    22998M       0.42 278217
  StockCode  UnitPrice
0    21790Y       0.85 280818
  StockCode  UnitPrice
0    21212D       0.55 281907
  StockCode  UnitPrice
0    21787C       1.66 284607
  StockCode  UnitPrice
0    17003M       0.42 285417
  StockCode  UnitPrice
0    21891R       1.25 285507
  StockCode  UnitPrice
0    85212R       1.66 291897
  StockCode  UnitPrice
0    22066A       1.45 293697
  StockCode  UnitPrice
0    20713F       1.95 294507
  StockCode  UnitPrice
0    22951B       0.55 294597
  StockCode  UnitPrice
0    23340L       1.65 295497
  StockCode  UnitPrice
0   85099FC       1.95 298197
  StockCode  UnitPrice
0    22386V       1.95 301797
  StockCode  UnitPrice
0    16225Z       3.36 301887
  StockCode  Uni

0    23194R       2.25 397296
  StockCode  UnitPrice
0   17012CF       2.51 398106
  StockCode  UnitPrice
0    85152E        2.1 398196
  StockCode  UnitPrice
0    23431D       0.72 399096
  StockCode  UnitPrice
0    22197J       0.85 399996
  StockCode  UnitPrice
0    21985C       0.29 401796
  StockCode  UnitPrice
0   84970SL       2.13 401886
  StockCode  UnitPrice
0    82600H       4.21 402786
  StockCode  UnitPrice
0    71459J       0.85 403596
  StockCode  UnitPrice
0    22419N       0.85 403686
  StockCode  UnitPrice
0    20733F       0.85 405396
  StockCode  UnitPrice
0    22534C       0.85 406386
  StockCode  UnitPrice
0    84879M       1.69 407286
  StockCode  UnitPrice
0    21731V       3.36 408186
  StockCode  UnitPrice
0    22492E       1.66 411786
  StockCode  UnitPrice
0    20725N       1.65 413676
  StockCode  UnitPrice
0    37342P       0.85 415386
  StockCode  UnitPrice
0    21112W       1.25 415476
  StockCode  UnitPrice
0    23068E       2.08 416286
  StockCode  Uni

0    23084M       2.08 567225
  StockCode  UnitPrice
0    21213V       0.55 569925
  StockCode  UnitPrice
0    23084M       2.08 570825
  StockCode  UnitPrice
0    22616B       0.29 574425
  StockCode  UnitPrice
0    22296X       1.65 575415
  StockCode  UnitPrice
0    22910D       5.91 576225
  StockCode  UnitPrice
0    22079R       1.65 579015
