<img src="http://www.codeheroku.com/static/blog/images/pid14_rs_diff.png">

使用scikit-learn library進行基於內容的推薦系統。

Suppose, you are given the following two texts:

Text A: London Paris London

Text B: Paris Paris London

How would you find the similarity between Text A and Text B?

1. Text A: Contains the word “London” 2 times and the word “Paris” 1 time.
2. Text B: Contains the word “London” 1 time and the word “Paris” 2 times.
<img src="http://www.codeheroku.com/static/blog/images/pid14_find_cos_theta.png">

In [1]:
text = ["London Paris London","Paris Paris London"]

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)

print(cv.get_feature_names())
print(count_matrix.toarray())

['london', 'paris']
[[2 1]
 [1 2]]


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(count_matrix)
print(similarity_scores)

[[1.  0.8]
 [0.8 1. ]]


=====================================================

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None 

In [133]:
# Step 1:Read CSV File
data = pd.read_csv('./data_file/product.csv')
data.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [134]:
df = data.copy()

In [135]:
data = data[~data.COMMODITY_DESC.isin([' ','COUPON/MISC ITEMS','(CORP USE ONLY)','FUEL','NO COMMODITY DESCRIPTION'])]

In [136]:
df = data[:10000]
df = df.reset_index().drop('index', axis=1)
df['index'] = pd.Series(range(1,10001) )
df['PRODUCT_ID'] = 'P' + (df['PRODUCT_ID'].astype(str))

In [137]:
df_p = df.copy()

In [138]:
df_p[['DEPARTMENT','BRAND','COMMODITY_DESC','SUB_COMMODITY_DESC']]

Unnamed: 0,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC
0,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED
1,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH
2,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE
3,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES
4,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS
...,...,...,...,...
9995,GROCERY,National,TEAS,TEA BAGS & BULK TEA
9996,GROCERY,Private,SOUP,CONDENSED SOUP
9997,GROCERY,National,MISC WINE,BEVERAGE WINES
9998,MEAT-PCKGD,National,LUNCHMEAT,POULTRY


In [139]:
df.head(20)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT,index
0,P25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1
1,P26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,2
2,P26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,3
3,P26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,4
4,P26426,69,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS,2.5 OZ,5
5,P26540,69,GROCERY,Private,COOKIES/CONES,TRAY PACK/CHOC CHIP COOKIES,16 OZ,6
6,P26601,69,DRUG GM,Private,VITAMINS,VITAMIN - MINERALS,300CT(1),7
7,P26636,69,PASTRY,Private,BREAKFAST SWEETS,SW GDS: SW ROLLS/DAN,,8
8,P26691,16,GROCERY,Private,PNT BTR/JELLY/JAMS,HONEY,12 OZ,9
9,P26738,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,TRADITIONAL,56 OZ,10


In [140]:
# Step 2: Select Features
# 選擇產品特徵
features = ['DEPARTMENT','BRAND','COMMODITY_DESC','SUB_COMMODITY_DESC']

In [141]:
# Step 3: Create a column in DF which combines all selected features
def combine_features(row):
    return row['DEPARTMENT']+" "+row['BRAND']+" "+row['COMMODITY_DESC']+" "+row['SUB_COMMODITY_DESC']

for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string

df["combined_features"] = df.apply(combine_features,axis=1) #applying combined_features() method over each rows of dataframe and storing the combined string in "combined_features" column


In [146]:
df.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT,index,combined_features
0,P25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1,GROCERY National FRZN ICE ICE - CRUSHED/CUBED
1,P26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,2,PASTRY Private BREAD BREAD:ITALIAN/FRENCH
2,P26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,3,GROCERY Private FRUIT - SHELF STABLE APPLE SAUCE
3,P26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,4,GROCERY Private COOKIES/CONES SPECIALTY COOKIES
4,P26426,69,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS,2.5 OZ,5,GROCERY Private SPICES & EXTRACTS SPICES & SEA...


In [142]:
# Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])

In [116]:
print(cv.get_feature_names()[:100])
print(count_matrix.toarray())
print(len(count_matrix.toarray()))

['10', '100', '12', '15', '15lbs', '15pk', '16', '18', '18in', '20pk', '24pk', '42', '42p', '4pk', '50', '5lt', '6pk', '750ml', '8pc', '90', '99', 'abrasives', 'acc', 'access', 'accessiores', 'accessories', 'accss', 'acids', 'acne', 'action', 'activewear', 'activity', 'add', 'added', 'additi', 'additives', 'adhesives', 'adidas', 'adult', 'adults', 'aerosol', 'aerosols', 'aftersun', 'age', 'aid', 'aids', 'air', 'albums', 'alcoholic', 'ale', 'ales', 'alkaline', 'all', 'allergy', 'allieds', 'almay', 'alpo', 'alternative', 'alternatives', 'aluminum', 'american', 'ammonia', 'analgesics', 'and', 'angel', 'animal', 'anjou', 'annuals', 'ant', 'antacids', 'anti', 'antibiotic', 'antiperspirants', 'apparel', 'appetizers', 'apple', 'apples', 'appliances', 'applicators', 'aquarium', 'area', 'aseptic', 'asian', 'asparagus', 'aspic', 'audio', 'australian', 'authentic', 'auto', 'automatic', 'automobile', 'automotive', 'aviation', 'avocado', 'ba', 'baby', 'babyfood', 'back', 'bacon', 'bag']
[[0 0 0 ...

In [117]:
# Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)

In [118]:
print(cosine_sim)

[[1.         0.         0.12598816 ... 0.27216553 0.1490712  0.12598816]
 [0.         1.         0.13363062 ... 0.         0.         0.        ]
 [0.12598816 0.13363062 1.         ... 0.15430335 0.         0.        ]
 ...
 [0.27216553 0.         0.15430335 ... 1.         0.18257419 0.15430335]
 [0.1490712  0.         0.         ... 0.18257419 1.         0.16903085]
 [0.12598816 0.         0.         ... 0.15430335 0.16903085 1.        ]]


In [119]:
def get_title_from_index(index):
    return df[df.index == index]["PRODUCT_ID"].values[0]
def get_index_from_title(title):
    return df[df.PRODUCT_ID == title]["index"].values[0]

In [127]:
# Step 6: Get index of this product from its title
product_user_likes = "P26093"
product_index = get_index_from_title(product_user_likes)
sim_products = list(enumerate(cosine_sim[product_index])) 

In [128]:
sim_products[:5]

[(0, 0.1259881576697424),
 (1, 0.13363062095621217),
 (2, 0.9999999999999997),
 (3, 0.26726124191242434),
 (4, 0.26726124191242434)]

In [129]:
# Step 7: Get a list of similar product in descending order of similarity score
sorted_sim_products = sorted(sim_products,key=lambda x:x[1],reverse=True)[1:]


In [130]:
# 只抓前10個
i=0
recommend_list = []
print("Top 10 similar products to "+product_user_likes+" are:\n")
for product in sorted_sim_products:
    print(get_title_from_index(product[0]))
    recommend_list.append(get_title_from_index(product[0]))
    i=i+1
    if i>10:
        break

Top 10 similar products to P26093 are:

P545043
P611698
P811904
P853643
P27503
P45218
P45607
P59994
P62438
P62812
P65291


In [131]:
recommend_pd = pd.DataFrame(recommend_list,columns =['PRODUCT_ID'])

In [132]:
recommend_pd.merge(df_p, how='left', on ='PRODUCT_ID')

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT,index
0,P545043,16,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,4400
1,P611698,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,4730
2,P811904,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,5754
3,P853643,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,9802
4,P27503,69,GROCERY,Private,FRUIT - SHELF STABLE,CRANBERRY SAUCE,16 OZ,25
5,P45218,69,GROCERY,Private,FRUIT - SHELF STABLE,PEARS,15.25 OZ,610
6,P45607,69,GROCERY,Private,FRUIT - SHELF STABLE,PINEAPPLE,8 OZ,633
7,P59994,69,GROCERY,Private,FRUIT - SHELF STABLE,PEACHES,8.25 OZ,994
8,P62438,69,GROCERY,Private,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,1051
9,P62812,69,GROCERY,Private,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,1061
