## What we're going to do
1. use `pandas` to read csv files as `dataframe` format, pick the columns we want
2. count how many times a product is brought by a specific customer
3. create a pivot dataframe where index is `procudct_name`, column is `customer_id` and the value is count from step 2
4. do KNN clustering to form a base recommendation
5. do `TruncatSVD` and `np.corrcoef` to form a validated recommendation

## 1. Read Data

In [1]:
import pandas as pd

folder = './new_data/'
csvname = 'customer_purchase_data.csv'

filename = folder+csvname
whole_data = pd.read_csv(filename, encoding='utf-8')
whole_data.head(20)

Unnamed: 0.1,Unnamed: 0,customer_unique_id,product_category_name_english,product_main_category,price,flag,cnt
0,0,0000366f3b9a7992bf8c76cfdf3221e2,bed_bath_table,家居生活,129.9,origin,1
1,1,0000b849f77a49e4a4ce2b2a4ca5be3f,health_beauty,保健,18.9,origin,1
2,2,0000b849f77a49e4a4ce2b2a4ca5be3f,books_imported,書籍,125.0,new,1
3,3,0000f46a3911fa3c0805444483337064,stationery,文具,69.0,origin,1
4,4,0000f46a3911fa3c0805444483337064,health_beauty,保健,166.99,new,1
5,5,0000f46a3911fa3c0805444483337064,books_general_interest,書籍,44.9,new,1
6,6,0000f6ccb0745a6a4b88665a16c9f078,telephony,家電,25.99,origin,1
7,7,0000f6ccb0745a6a4b88665a16c9f078,kitchen_dining_laundry_garden_furniture,家居生活,99.99,new,1
8,8,0004aac84e0df4da2b147fca70cf8255,market_place,商業用途,49.9,new,1
9,9,0004aac84e0df4da2b147fca70cf8255,telephony,家電,180.0,origin,1


In [2]:
customer_product_cnt = whole_data[['customer_unique_id', 'product_category_name_english','cnt']]
customer_product_cnt.head()

Unnamed: 0,customer_unique_id,product_category_name_english,cnt
0,0000366f3b9a7992bf8c76cfdf3221e2,bed_bath_table,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,health_beauty,1
2,0000b849f77a49e4a4ce2b2a4ca5be3f,books_imported,1
3,0000f46a3911fa3c0805444483337064,stationery,1
4,0000f46a3911fa3c0805444483337064,health_beauty,1


## 2. Count the numbers

In [3]:
customer_product_sum = customer_product_cnt.groupby(['customer_unique_id', 'product_category_name_english'],as_index=False).count()
print('There are {numbers} of rows in customer_product_cnt'.format(numbers=customer_product_cnt.shape[0]))
print('There are {numbers} of rows in customer_product_sum'.format(numbers=customer_product_sum.shape[0]))

There are 236835 of rows in customer_product_cnt
There are 220621 of rows in customer_product_sum


In [4]:
customer_product_sum.head(20)

Unnamed: 0,customer_unique_id,product_category_name_english,cnt
0,0000366f3b9a7992bf8c76cfdf3221e2,bed_bath_table,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,books_imported,1
2,0000b849f77a49e4a4ce2b2a4ca5be3f,health_beauty,1
3,0000f46a3911fa3c0805444483337064,books_general_interest,1
4,0000f46a3911fa3c0805444483337064,health_beauty,1
5,0000f46a3911fa3c0805444483337064,stationery,1
6,0000f6ccb0745a6a4b88665a16c9f078,kitchen_dining_laundry_garden_furniture,1
7,0000f6ccb0745a6a4b88665a16c9f078,telephony,1
8,0004aac84e0df4da2b147fca70cf8255,market_place,1
9,0004aac84e0df4da2b147fca70cf8255,signaling_and_security,1


compare 16th row of `customer_product_sum` and 17th of `whole_data`, we can see the difference 

## 3. New a pivot table

In [5]:
customer_product_pivot = customer_product_sum.pivot(index = 'product_category_name_english',columns='customer_unique_id', values='cnt').fillna(0)
customer_product_pivot.head()

customer_unique_id,0000366f3b9a7992bf8c76cfdf3221e2,0000b849f77a49e4a4ce2b2a4ca5be3f,0000f46a3911fa3c0805444483337064,0000f6ccb0745a6a4b88665a16c9f078,0004aac84e0df4da2b147fca70cf8255,0004bd2a26a76fe21f786e4fbd80607f,00050ab1314c0e55a6ca13cf7181fecf,00053a61a98854899e70ed204dd4bafe,0005e1862207bf6ccc02e4228effd9a0,0005ef4cd20d2893f0d9fbd94d3c0d97,...,fff699c184bcc967d62fa2c6171765f7,fff7219c86179ca6441b8f37823ba3d3,fff96bc586f78b1f070da28c4977e810,fffa431dd3fcdefea4b1777d114144f2,fffb09418989a0dbff854a28163e47c6,fffcf5a5ff07b0908bd4e2dbc735a684,fffea47cd6d3cc0a88bd621562a9d061,ffff371b4d645b6ecea244b27531430a,ffff5962728ec6157033ef9805bacc48,ffffd2657e2aad2907e67c3e9daecbeb
product_category_name_english,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
agro_industry_and_commerce,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
air_conditioning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arts_and_craftmanship,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
audio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Do clustering

In [6]:
import sklearn.neighbors
print(sorted(sklearn.neighbors.VALID_METRICS['brute']))

['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'cosine', 'dice', 'euclidean', 'hamming', 'haversine', 'jaccard', 'kulsinski', 'l1', 'l2', 'mahalanobis', 'manhattan', 'matching', 'minkowski', 'precomputed', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski', 'yule']


In [7]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

customer_product_matrix = csr_matrix(customer_product_pivot.values) # size is same as pivot table
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
#model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'ball_tree', leaf_size=8, n_neighbors=100)
model_knn.fit(customer_product_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [8]:
import numpy as np

# select a number from (0 ~ number of columns), it repreasents that 'suppose an item is bought, ......'
query_index = np.random.choice(customer_product_pivot.shape[0])

distances, indices = model_knn.kneighbors(customer_product_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors =6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(customer_product_pivot.index[query_index]))
        
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, customer_product_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for furniture_bedroom:

1: telephony, with distance of 0.8720085628977003:
2: construction_tools_lights, with distance of 0.9221856761658741:
3: construction_tools_construction, with distance of 0.9312828102842942:
4: small_appliances, with distance of 0.9322155913681228:
5: air_conditioning, with distance of 0.9334715100964475:


## 5. Try matrix decomposition and calculate correlation coefficient

In [9]:
customer_product_pivot2 = customer_product_sum.pivot(index = 'customer_unique_id',columns='product_category_name_english', values='cnt').fillna(0)
customer_product_pivot2.head()

product_category_name_english,agro_industry_and_commerce,air_conditioning,art,arts_and_craftmanship,audio,auto,baby,bed_bath_table,books_general_interest,books_imported,...,security_and_services,signaling_and_security,small_appliances,small_appliances_home_oven_and_coffee,sports_leisure,stationery,tablets_printing_image,telephony,toys,watches_gifts
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000366f3b9a7992bf8c76cfdf3221e2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000b849f77a49e4a4ce2b2a4ca5be3f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000f46a3911fa3c0805444483337064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0000f6ccb0745a6a4b88665a16c9f078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
0004aac84e0df4da2b147fca70cf8255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
from sklearn.decomposition import TruncatedSVD

X = customer_product_pivot2.values.T
print('Shape of our data: ', X.shape)
SVD = TruncatedSVD(n_components=16,random_state=17)
matrix = SVD.fit_transform(X)
print('Shape after decomposition: ', matrix.shape)

Shape of our data:  (71, 92382)
Shape after decomposition:  (71, 16)


In [14]:
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
corr = np.corrcoef(matrix)

product_item_title =  customer_product_pivot2.columns
product_item_title_list = list(product_item_title)

customer_buy = 'air_conditioning'

coffey_hands = product_item_title_list.index(customer_buy)
corr_coffey_hands  = corr[coffey_hands]
print('Recommendations for {0}:\n'.format(customer_buy))
list(product_item_title[(corr_coffey_hands<1.0) & (corr_coffey_hands>0.9)])

Recommendations for air_conditioning:



['air_conditioning',
 'construction_tools_construction',
 'construction_tools_lights',
 'construction_tools_safety',
 'costruction_tools_tools',
 'fixed_telephony',
 'home_appliances',
 'small_appliances',
 'small_appliances_home_oven_and_coffee']