In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [3]:
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict

In [4]:
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

In [5]:
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  -
        Model - Trained model
    '''
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

In [32]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
#     scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False)))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
								 .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
#     known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
#     scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + str(i))
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + str(i))
            counter+=1
    return return_score_list

In [7]:
!ls retailrocket-recommender-system-dataset

category_tree.csv  item_properties_part1.csv
events.csv	   item_properties_part2.csv


In [8]:
category_tree = pd.read_csv ("./retailrocket-recommender-system-dataset/category_tree.csv")
category_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [9]:
events = pd.read_csv("./retailrocket-recommender-system-dataset/events.csv")
print(np.unique(events['event'].values,return_counts = True))
events.head()

(array(['addtocart', 'transaction', 'view'], dtype=object), array([  69332,   22457, 2664312]))


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [10]:
# properties = pd.read_csv('./retailrocket-recommender-system-dataset/item_properties_part1.csv')
# properties.head()

In [11]:
def preprocessing(df):
    a = []
    del df['transactionid']
    df.drop_duplicates(inplace=True)
    df = df.iloc[:50000]
    
    del df['timestamp']
    for i in tqdm(range(len(df))):
        if(df.iloc[i][1]=='view'):
            a.append(1)
        elif (df.iloc[i][1]=='addtocart'):
            a.append(2)
        else:
            a.append(3)
    df['rating'] = a
    del df['event']
    df.sort_values(['visitorid','itemid'])
    
    a = []
    prev_rating = 0
    prev = 0
    for i in tqdm(range(len(df))):
        if(i==0):
            prev = list(df.iloc[0])
            prev_rating = df.iloc[0][2]
            continue
        if prev[:1] == list(df.iloc[i][:1]):
            prev_rating+= df.iloc[i][2]
        else:
            prev[2] = prev_rating
            a.append(prev)
            prev_rating = df.iloc[i][2]
            prev = list(df.iloc[i])
    prev[2] = prev_rating
    a.append(prev)
    df1 = pd.DataFrame(data = np.array(a),columns = ['user_col','item_col','rating'])
    return df1

In [12]:
# event = events.copy()
events = preprocessing(events)
events.head()

100%|██████████| 50000/50000 [00:09<00:00, 5071.31it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
100%|██████████| 50000/50000 [00:28<00:00, 1740.32it/s]


Unnamed: 0,user_col,item_col,rating
0,257597,355908,1
1,992329,248676,1
2,111016,318965,1
3,483717,253185,1
4,951259,367447,1


In [13]:
np.unique(events.rating.values,return_counts = True)

(array([1, 2, 3, 4, 6, 7, 9]),
 array([48157,  1257,   402,     6,    16,     1,     1]))

In [14]:
# interactions = create_interaction_matrix(events,'visitorid','itemid','rating')

In [16]:
interactions = events.copy()
interactions = interactions.reset_index().fillna(0).set_index('user_col')

In [17]:
interactions.describe()

Unnamed: 0,index,item_col,rating
count,49840.0,49840.0,49840.0
mean,24919.5,235163.964406,1.0436
std,14387.713045,134233.135334,0.258263
min,0.0,6.0,1.0
25%,12459.75,118388.25,1.0
50%,24919.5,238059.0,1.0
75%,37379.25,351499.5,1.0
max,49839.0,466867.0,9.0


In [18]:
usr_dict = create_user_dict(interactions)
usr_dict

{257597: 0,
 992329: 44069,
 111016: 2,
 483717: 16763,
 951259: 4,
 972639: 12508,
 810725: 12442,
 794181: 7,
 824915: 15669,
 339335: 20458,
 176446: 10,
 929206: 11,
 15795: 8044,
 598426: 8063,
 223343: 16745,
 57036: 35083,
 1377281: 16,
 287857: 20629,
 1370216: 18,
 158090: 12453,
 1398644: 20562,
 653756: 4564,
 1213673: 20531,
 864246: 23,
 125625: 12525,
 608100: 25,
 781127: 16608,
 1076270: 20603,
 453474: 20596,
 1153198: 16688,
 273888: 30,
 849453: 142,
 487887: 32,
 629333: 2788,
 1130165: 34,
 361387: 44006,
 112175: 36,
 860082: 7969,
 784669: 38,
 1061147: 39,
 485456: 40,
 1342963: 15900,
 969887: 42,
 759369: 12550,
 1282360: 16784,
 233317: 20610,
 392042: 16681,
 591038: 47,
 692195: 48,
 432882: 49,
 808133: 51,
 180680: 52,
 1151716: 53,
 597724: 54,
 179437: 55,
 794013: 12484,
 975530: 7985,
 741702: 58,
 1219180: 59,
 1143908: 41477,
 1078178: 61,
 503970: 28131,
 1193904: 4445,
 376913: 8018,
 1262470: 8041,
 238317: 1137,
 800456: 16609,
 1219627: 69,
 52

In [20]:
item_dict = create_item_dict(events,'item_col','item_col')
item_dict

{355908: 355908,
 248676: 248676,
 318965: 318965,
 253185: 253185,
 367447: 367447,
 22556: 22556,
 443030: 443030,
 439202: 439202,
 428805: 428805,
 82389: 82389,
 10572: 10572,
 410676: 410676,
 44872: 44872,
 156489: 156489,
 402625: 402625,
 334662: 334662,
 251467: 251467,
 5206: 5206,
 176721: 176721,
 135256: 135256,
 132316: 132316,
 343861: 343861,
 36642: 36642,
 17655: 17655,
 187722: 187722,
 21989: 21989,
 262799: 262799,
 250696: 250696,
 388242: 388242,
 205392: 205392,
 123990: 123990,
 345560: 345560,
 128394: 128394,
 45337: 45337,
 43485: 43485,
 430845: 430845,
 22926: 22926,
 181743: 181743,
 280893: 280893,
 216707: 216707,
 306886: 306886,
 159780: 159780,
 386527: 386527,
 106564: 106564,
 75994: 75994,
 280375: 280375,
 367500: 367500,
 112792: 112792,
 102061: 102061,
 16813: 16813,
 128499: 128499,
 456784: 456784,
 427725: 427725,
 134264: 134264,
 335975: 335975,
 51354: 51354,
 55555: 55555,
 350819: 350819,
 422768: 422768,
 417464: 417464,
 323147: 323

In [21]:
model = runMF(interactions)

In [33]:
sample_recommendation_user(model,interactions,287857,usr_dict,item_dict)

Known Likes:
1- 287857
2- 287857

 Recommended Items:
1- 0.21256288886070251
2- 0.1672142893075943
3- -0.30499938130378723


[0.21256288886070251, 0.1672142893075943, -0.30499938130378723]