In [1]:
import pandas as pd
import numpy as np
import turicreate as tc

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_excel('Online Retail.xlsx')

In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [5]:
len(data.CustomerID.unique())

4373

In [6]:
data.CustomerID = data.CustomerID.astype(str)
data.StockCode = data.StockCode.astype(str)

In [7]:
# dictionary of products
prod_dict = {}
for code, descr in zip(data.StockCode, data.Description):
    prod_dict[code] = descr

In [8]:
prod_dict['85123A']

'CREAM HANGING HEART T-LIGHT HOLDER'

In [9]:
data = data.loc[:, ['StockCode', 'CustomerID']]

In [10]:
data.head()

Unnamed: 0,StockCode,CustomerID
0,85123A,17850.0
1,71053,17850.0
2,84406B,17850.0
3,84029G,17850.0
4,84029E,17850.0


In [11]:
data = data.groupby(['CustomerID','StockCode']).agg({'StockCode':'count'})

In [12]:
data = data.rename(columns={'StockCode': 'purchase_count'})

In [13]:
#multiply purchase count by the number in original data

In [14]:
data = data.reset_index()
data

Unnamed: 0,CustomerID,StockCode,purchase_count
0,12346.0,23166,2
1,12347.0,16008,1
2,12347.0,17021,1
3,12347.0,20665,1
4,12347.0,20719,4
...,...,...,...
271420,,gift_0001_20,10
271421,,gift_0001_30,8
271422,,gift_0001_40,3
271423,,gift_0001_50,4


### Split the dataset

In [15]:
def split_data(data):
    train, test = train_test_split(data, test_size = 0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data
train_data, test_data = split_data(data)


In [16]:
# constant variables to define field names
user_id = 'CustomerID'
item_id = 'StockCode'
users_to_recommend = list(data.CustomerID)
n_rec = 10
n_display = 30

In [17]:
def model(train_data, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    
    model = tc.item_similarity_recommender.create(train_data, user_id = user_id, item_id = item_id, 
                                               target = target, similarity_type = 'cosine')
    
    recom = model.recommend(users = users_to_recommend, k = n_rec)
    recom.print_rows(n_display)
    return model

In [18]:
target = 'purchase_count'

cos = model(train_data,
            user_id, 
            item_id, target, 
            users_to_recommend, 
            n_rec, n_display)


+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|  12346.0   |   23551   | 11.02356505393982  |  1   |
|  12346.0   |   90019B  | 8.314493417739868  |  2   |
|  12346.0   |   23394   | 8.093233108520508  |  3   |
|  12346.0   |   22220   | 7.0830771923065186 |  4   |
|  12346.0   |   84743C  | 7.0300140380859375 |  5   |
|  12346.0   |   22182   | 6.971914768218994  |  6   |
|  12346.0   |   37461   | 6.7555259466171265 |  7   |
|  12346.0   |   90034   | 6.7555259466171265 |  8   |
|  12346.0   |   72802c  | 6.7555259466171265 |  9   |
|  12346.0   |   23604   | 6.7555259466171265 |  10  |
|  12347.0   |   21064   | 3.6291411474347113 |  1   |
|  12347.0   |   90092   | 3.487178881466389  |  2   |
|  12347.0   |   90065B  | 3.487178881466389  |  3   |
|  12347.0   |   35015   | 3.487178881466389  |  4   |
|  12347.0   |   90183c  | 3.487178881466389  |  5   |
|  12347.0

### model evaluation

In [19]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target=target, similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|  12346.0   |   22594   | 8.152214765548706  |  1   |
|  12346.0   |   84880   | 7.8882445096969604 |  2   |
|  12346.0   |   22095   |  7.78916072845459  |  3   |
|  12346.0   |   84754   | 7.465125918388367  |  4   |
|  12346.0   |   37501   | 6.441288352012634  |  5   |
|  12346.0   |   84743C  | 6.186850905418396  |  6   |
|  12346.0   |   84227   | 6.186850905418396  |  7   |
|  12346.0   |   16053   | 5.9452842473983765 |  8   |
|  12346.0   |   10134   | 5.9452842473983765 |  9   |
|  12346.0   |   10123G  | 5.9452842473983765 |  10  |
|  12347.0   |   22409   | 4.0637743791330205 |  1   |
|  12347.0   |   23402   | 3.9533987045288086 |  2   |
|  12347.0   |   20863   | 3.7330388814500233 |  3   |
|  12347.0   |   20850   | 3.7330388814500233 |  4   |
|  12347.0   |   20849   | 3.7330388814500233 |  5   |
|  12347.0

### Create output

In [20]:
def create_output(model, users_to_recommend, n_rec, print_csv = True):
    recommendation = model.recommend(users = users_to_recommend, k = n_rec)
    df_rec = recommendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['CustomerID', 'recommendedProducts']].drop_duplicates().sort_values('CustomerID').set_index('CustomerID')
    if print_csv:
        df_output.to_csv('option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [21]:
df_output = create_output(cos, users_to_recommend, n_rec, print_csv = True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(4373, 1)


Unnamed: 0_level_0,recommendedProducts
CustomerID,Unnamed: 1_level_1
12346.0,23551|90019B|23394|22220|84743C|22182|37461|90...
12347.0,21064|90092|90065B|35015|90183c|20849|51014c|1...
12348.0,21064|22953|16053|90065B|35015|90183c|20849|84...
12349.0,90092|90065B|90183c|35600A|84251F|72802c|35015...
12350.0,90092|35600A|16053|90065B|35015|90183c|20849|8...


### List of recommended items for Customer 12346

In [25]:
[prod_dict[x] for x in list(df_output.loc['12346.0'])[0].split('|')]

['PACK OF 12 PAISLEY PARK TISSUES ',
 'GOLD M.O.P ORBIT BRACELET',
 'POSTE FRANCE CUSHION COVER',
 'CAKE STAND LOVEBIRD 2 TIER WHITE',
 nan,
 'CAKE STAND VICTORIAN FILIGREE SMALL',
 nan,
 'WHITE SILVER NECKLACE SHELL GLASS',
 'VANILLA SCENT CANDLE JEWELLED BOX',
 'SET 10 MINI SANTA & SNOWMAN  17087']