In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('ecommerce_data.csv',encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
df.shape

(541909, 8)

In [5]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [6]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [7]:
df.nunique()


InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

In [8]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


Negative values in quantity and unit price does not make sense, so dropping negative values

In [9]:
df = df[df['Quantity']>0]
df = df[df['UnitPrice']>0]

In [10]:
df.dropna(inplace=True)

In [11]:
df.shape

(397884, 8)

In [12]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

# Building a customer item matrix

1. Creating a matrix that contains the customer ID as index and each item as a column
2. Using Pivot fn to use Customer ID as index and Stockcode as column
3. Using 'Quantity' as the values to dsiplay and aggregate function to sum up these values

In [13]:
customer_item_matrix = df.pivot_table(index='CustomerID',columns = 'StockCode',values='Quantity',aggfunc='sum')
customer_item_matrix.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,,,,,,,,,,,...,,,,,,,,,,
12347.0,,,,,,,,,,,...,,,,,,,,,,
12348.0,,,,,,,,,,,...,,,,,,,,,,9.0
12349.0,,,,,,,,,,,...,,,,,,,,,,1.0
12350.0,,,,,,,,,,,...,,,,,,,,,,1.0


We dont need the exact numbers of quantities, so changing it to 1s and 0s

In [14]:
customer_item_matrix =customer_item_matrix.applymap(lambda x: 1 if x>0 else 0)
customer_item_matrix.head()

  customer_item_matrix =customer_item_matrix.applymap(lambda x: 1 if x>0 else 0)


StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [15]:
customer_item_matrix.shape

(4338, 3665)

### Item based collaborative filtering

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity is the angle distance between two data points. It is a similarity measure

In [17]:
item_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))
item_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3655,3656,3657,3658,3659,3660,3661,3662,3663,3664
0,1.0,0.0,0.094868,0.091287,0.0,0.0,0.090351,0.062932,0.098907,0.095346,...,0.0,0.0,0.0,0.0,0.0,0.029361,0.0,0.067591,0.0,0.078217
1,0.0,1.0,0.0,0.0,0.0,0.0,0.032774,0.045655,0.047836,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016345,0.0,0.0
2,0.094868,0.0,1.0,0.11547,0.0,0.0,0.057143,0.059702,0.041703,0.060302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071247,0.0,0.010993
3,0.091287,0.0,0.11547,1.0,0.0,0.0,0.164957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.447214,0.063888,0.044499,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
item_matrix.shape

(3665, 3665)

In [19]:
item_matrix.columns = customer_item_matrix.T.index
item_matrix['StockCode']= customer_item_matrix.T.index
item_matrix =item_matrix.set_index('StockCode')
item_matrix.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.0,0.094868,0.091287,0.0,0.0,0.090351,0.062932,0.098907,0.095346,...,0.0,0.0,0.0,0.0,0.0,0.029361,0.0,0.067591,0.0,0.078217
10080,0.0,1.0,0.0,0.0,0.0,0.0,0.032774,0.045655,0.047836,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016345,0.0,0.0
10120,0.094868,0.0,1.0,0.11547,0.0,0.0,0.057143,0.059702,0.041703,0.060302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071247,0.0,0.010993
10123C,0.091287,0.0,0.11547,1.0,0.0,0.0,0.164957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,1.0,0.447214,0.063888,0.044499,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# finding most similar items
item_matrix.loc['10080'].sort_values(ascending=False)

StockCode
10080     1.000000
23694     0.191346
22039     0.187317
47504H    0.166924
21650     0.165567
            ...   
22760     0.000000
22763     0.000000
22764     0.000000
22765     0.000000
POST      0.000000
Name: 10080, Length: 3665, dtype: float64

In [21]:
top_5 = list(item_matrix.loc['22760'].sort_values(ascending=False).iloc[:5].index)
top_5

['22760', '90103', '22797', '23094', '22829']

In [22]:
# row info for a specific item

df.loc[df['StockCode'] == '23094'][:1]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
184939,552721,23094,LE GRAND TRAY CHIC SET,6,5/11/2011 9:55,12.5,14691.0,United Kingdom


In [23]:
df.loc[df['StockCode'].isin(top_5),['StockCode','Description']].drop_duplicates().set_index('StockCode').loc[top_5]

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
22760,"TRAY, BREAKFAST IN BED"
90103,PURPLE FRANGIPANI NECKLACE
22797,CHEST OF DRAWERS GINGHAM HEART
23094,LE GRAND TRAY CHIC SET
22829,SWEETHEART WIRE WALL TIDY


In [24]:
def get_recomm_items(prod):
    new = df[['StockCode','Description']].drop_duplicates().set_index('StockCode')
    code = new[new['Description']== prod].index
    top_5 = list(item_matrix.loc[code[0]].sort_values(ascending=False).iloc[:6].index)
    top_5 = df.loc[df['StockCode'].isin(top_5),['StockCode','Description']].set_index('StockCode').drop_duplicates().loc[top_5]
    return top_5

In [25]:
recom = get_recomm_items('PETIT TRAY CHIC')
new = recom[1:]
pr = new['Description'].tolist()
try:
    for index,val in enumerate(pr,start=1):
        print(index, pr[index])
except:
    pass

1 IVORY WICKER HEART LARGE
2 HEART IVORY TRELLIS SMALL
3 SMALL WHITE HEART OF WICKER
4 METAL 4 HOOK HANGER FRENCH CHATEAU
