In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import warnings

import pythainlp
from pythainlp import word_tokenize
from pythainlp.corpus import get_corpus

## Read Data

In [2]:
ds = pd.read_csv('Database/new_all_hp_newdel.csv')
ds.head()

Unnamed: 0,hp_article,hp_prod_desc_th40,hp_mch_id,hp_brand
0,285558,ลูกบิดทั่วไป VECO 3871SS-ML-ETหัวกลม SS,DH010101,VECO
1,285559,ลูกบิดห้องน้ำ VECO3871SS-ML-BKหัวกลม SS,DH010103,VECO
2,261589,เขาควายทั่วไป SOLEX 1615 SN/R SS,DH010401,SOLEX
3,1013621,กล่องเก็บของ 10.5L 5122 JCJ IVA M ขาว,HO020102,JCJ
4,1037657,ม่านEyelet 19072SOR 240X250 AUBURN PSY,FD010202,PASAYA


In [3]:
ds = ds[['hp_article','hp_prod_desc_th40', 'hp_mch_id']]

In [4]:
ds = ds.dropna()

In [5]:
ds.rename(columns = {'hp_article':'id'}, inplace = True) 
ds.rename(columns = {'hp_prod_desc_th40':'description'}, inplace = True)
ds.rename(columns = {'hp_mch_id':'mch_id'}, inplace = True)
ds

Unnamed: 0,id,description,mch_id
0,285558,ลูกบิดทั่วไป VECO 3871SS-ML-ETหัวกลม SS,DH010101
1,285559,ลูกบิดห้องน้ำ VECO3871SS-ML-BKหัวกลม SS,DH010103
2,261589,เขาควายทั่วไป SOLEX 1615 SN/R SS,DH010401
3,1013621,กล่องเก็บของ 10.5L 5122 JCJ IVA M ขาว,HO020102
4,1037657,ม่านEyelet 19072SOR 240X250 AUBURN PSY,FD010202
...,...,...,...
97304,1167326,แผ่นกันลื่น ELLA 27.5X27.5 น้ำตาลอ่อน ME,FD020202
97305,1167329,แปรงทำความสะอาดขวด/แก้ว MOKU,HO010409
97306,230805,ถังดับเพลิงผงเคมีแห้งIMPERIAL 2A2B 2.2LB,HW050201
97307,1167328,ตู้แช่2ระบบ HAI HCF-478C -15.2 Q ขาว,MA020603


In [6]:
new_ds = ds
new_ds['new_des'] = ds.apply(lambda x: word_tokenize(x[1],engine="newmm",keep_whitespace=False), axis=1)

In [7]:
new_ds.head(100)

Unnamed: 0,id,description,mch_id,new_des
0,285558,ลูกบิดทั่วไป VECO 3871SS-ML-ETหัวกลม SS,DH010101,"[ลูกบิด, ทั่วไป, VECO, 3871, SS-ML-ET, หัว, กล..."
1,285559,ลูกบิดห้องน้ำ VECO3871SS-ML-BKหัวกลม SS,DH010103,"[ลูกบิด, ห้องน้ำ, VECO, 3871, SS-ML-BK, หัว, ก..."
2,261589,เขาควายทั่วไป SOLEX 1615 SN/R SS,DH010401,"[เขา, ควาย, ทั่วไป, SOLEX, 1615, SN, /, R, SS]"
3,1013621,กล่องเก็บของ 10.5L 5122 JCJ IVA M ขาว,HO020102,"[กล่อง, เก็บ, ของ, 10.5, L, 5122, JCJ, IVA, M,..."
4,1037657,ม่านEyelet 19072SOR 240X250 AUBURN PSY,FD010202,"[ม่าน, Eyelet, 19072, SOR, 240, X, 250, AUBURN..."
...,...,...,...,...
95,1145499,หมวกนิรภัย มอก. GAGE HDPE 2332 YE,HW050104,"[หมวกนิรภัย, มอก., GAGE, HDPE, 2332, YE]"
96,1170470,สีน้ำทาภายใน AIR CARE BASE B SG 1GL,PA010102,"[สีน้ำ, ทา, ภายใน, AIR, CARE, BASE, B, SG, 1, GL]"
97,1170471,ตู้เย็น2D SAM RB33T3070AP/ST 12Q เปลี่ยน,MA020204,"[ตู้เย็น, 2, D, SAM, RB, 33, T, 3070, AP, /, S..."
98,1170473,หน้าบาน ตย.2D SAM BeSpoke ด้านล่าง เทา,MA020903,"[หน้าบาน, ตย., 2, D, SAM, BeSpoke, ด้านล่าง, เทา]"


# Train Test Split

In [8]:
train_df, test_df = train_test_split(new_ds, 
                    test_size=0.20,
                    random_state=123)

print('# interactions on Train set: %d' % len(train_df))
print('# interactions on Test set: %d' % len(test_df))

# interactions on Train set: 77847
# interactions on Test set: 19462


In [9]:
def process_text(text):
    #pre rules
    res = text
    return res

# Count Vectorizer

In [10]:
cv = CountVectorizer(tokenizer=process_text, lowercase=False)
cv_matrix = cv.fit_transform(new_ds['new_des'])

In [11]:
cv_matrix

<97309x37451 sparse matrix of type '<class 'numpy.int64'>'
	with 864994 stored elements in Compressed Sparse Row format>

### Find similarity using Linear Distance

In [12]:
cosine_similarities = cosine_similarity(cv_matrix, cv_matrix)

In [13]:
cosine_similarities

array([[1.        , 0.75      , 0.23570226, ..., 0.        , 0.        ,
        0.        ],
       [0.75      , 1.        , 0.11785113, ..., 0.        , 0.        ,
        0.        ],
       [0.23570226, 0.11785113, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.1490712 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.1490712 , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

### Show result

#### Helper function to use Content-based recommendation System

In [14]:
# function that takes in product as input and returns the top 10 recommended product
def recommendations(nid, cosine_similarities = cosine_similarities):
    
    recommended_products = []
    recommended_descriptions = []
    
    # gettin the index of the movie that matches the title
    try:
        idx = new_ds[new_ds.id == nid].index[0]
    except:
        print("Error : id is not found")
        return

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    print()
    # getting the indexes of the 10 most similar product
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_products.append(list(new_ds.index)[i])
        recommended_descriptions.append(list(new_ds.description)[i])
        
    print("Recommending 10 products similar to " 
          + new_ds[new_ds.id == nid].description + "...")
    print("-------")
    j = 0
    for rec in recommended_descriptions:
        print("Recommended: " + rec + " (" + str(score_series.iloc[j,]) +")")
        j = j + 1        

### Recommend

In [15]:
recommendations(1640)


52963    Recommending 10 products similar to ตลับเมตร  ...
Name: description, dtype: object
-------
Recommended: ตลับเมตร  STANLEY POWERLOCK 5M (1.0000000000000002)
Recommended: ตลับเมตร  STANLEY POWERLOCK 8M (0.8333333333333336)
Recommended: ตลับเมตร  KDS  3M (0.8333333333333336)
Recommended: ตลับเมตร หุ้มยาง STANLEY TYLON 3M (0.7302967433402215)
Recommended: ตลับเมตร พลาสติก STANLEY GLOBAL TAPE 3M (0.7216878364870323)
Recommended: ตลับเมตร STANLEY GLOBAL TAPE 5M TECH3 (0.7216878364870323)
Recommended: ตลับเมตร STANLEY GLOBALTAPE 5M BARCELONA (0.6804138174397717)
Recommended: ตลับเมตร หุ้มยาง STANLEY TYLON 8M (0.6172133998483676)
Recommended: ตลับเมตร พลาสติก STANLEY GLOBAL TAPE 5M (0.5773502691896258)
Recommended: ตลับเมตรKOMELON SELFLOCK 3Mล็อคอัตโนมัติ (0.5773502691896258)


In [16]:
recommendations(1013621)


3    Recommending 10 products similar to กล่องเก็บข...
Name: description, dtype: object
-------
Recommended: กล่องเก็บของ 20.5L 5123 JCJ IVA L ขาว (0.9999999999999999)
Recommended: กล่องเก็บของ 5L 5121 JCJ IVA S ขาว (0.7302967433402215)
Recommended: กล่องเก็บของ #3020M M STACKO ขาว (0.7)
Recommended: กล่องเก็บของ 16.5L STACKO FAMILI ขาว (0.5720775535473555)
Recommended: กล่องเก็บของ 70.5L STACKO FAMILI ขาว (0.5590169943749475)
Recommended: กล่องเก็บของ 35.5L 2555 STACKO  ขาว (0.5590169943749475)
Recommended: กล่องเก็บของ 66L 2556 STACKO  ขาว (0.5590169943749475)
Recommended: กล่องเก็บของ 30L 1403 G-WARE ขาว (0.5590169943749475)
Recommended: กล่องเก็บของ 100L 2007 API ขาว (0.5590169943749475)
Recommended: กล่องเก็บของ 40L 1402 G-WARE ขาว (0.5590169943749475)


In [17]:
recommendations(1013674)


202    Recommending 10 products similar to แก้วน้ำทรง...
Name: description, dtype: object
-------
Recommended: แก้วน้ำทรงสูง 12oz GURALLA DIMOND (0.9999999999999997)
Recommended: แก้วน้ำทรงสูง 10oz GURALLA ARAS (0.8571428571428569)
Recommended: แก้วน้ำทรงสูง 13oz GURALLA ADORA (0.8571428571428569)
Recommended: แก้วน้ำ 10oz GURALLA ARAS (0.7142857142857141)
Recommended: แก้วน้ำทรงสูงPP 17oz BIGONE FLOWER แพ็ค3 (0.50709255283711)
Recommended: แก้วก้าน 19oz GURALLA ANGELINA (0.47809144373375745)
Recommended: แก้วก้าน 10oz GURALLA RENA (0.4629100498862757)
Recommended: แก้วก้าน 7oz GURALLA RENA (0.4629100498862757)
Recommended: แก้วก้าน 9oz GURALLA LAL (0.4629100498862757)
Recommended: แก้วเบียร์ 12oz แพ็ค 4 ใบ OCEAN PILSNER (0.4629100498862757)


## Evaluation

In [18]:
interac_df = pd.read_csv('Database/orderitems_orders.csv').sort_values("lastupdate")
interac_df.head()

Unnamed: 0,partnumber,user_id,status,createdate,lastupdate
999999,1081232,9000000712031,P,2020-12-03 10:39:24,2020-12-03 11:16:26
999998,1155625,9000000712031,P,2020-12-03 10:39:24,2020-12-03 11:16:26
999997,200000,9000000213338,C,2020-12-03 11:11:45,2020-12-03 11:16:39
999996,1117607,9000000213338,C,2020-12-03 11:11:45,2020-12-03 11:16:41
999995,200000,9000000047908,C,2020-12-03 11:16:23,2020-12-03 11:18:02


In [19]:
interac_df = interac_df[interac_df["status"] == "P"]

In [20]:
users = interac_df["user_id"].unique()

In [21]:
def recommendations(nid, k=10, cosine_similarities = cosine_similarities):
    
    recommended_products = []
    recommended_descriptions = []
    
    # gettin the index of the movie that matches the title
    try:
        idx = new_ds[new_ds.id == nid].index[0]
    except:
        print("Error : id is not found")
        return

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar product
    top_10_indexes = list(score_series.iloc[1: k+1].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_products.append(list(new_ds.index)[i])
        recommended_descriptions.append(list(new_ds.id)[i])
    return recommended_descriptions 

In [22]:
def lags_feature(df, steps=5):
    res_df, columns = [], []
    
    for step in range(1, steps + 1):
        res_df.append(df.shift(step))
        for col in df.columns:
            columns.append(f"{col}(t-{step})")

    res_df = pd.concat(res_df, axis=1)
    res_df.columns = columns

#     res_df.fillna(method='bfill', inplace=True)
#     res_df.fillna(method='ffill', inplace=True)
    
    return res_df

In [23]:
recommendations(1640)

[1652,
 138846,
 18856,
 218906,
 1029086,
 1101205,
 1046513,
 218907,
 1029087,
 218476]

In [24]:
import numpy as np
from tqdm import tqdm

precision_k = []
k = 20

for user in tqdm(users[:10]):
    user_df = interac_df.loc[interac_df["user_id"] == user]
    for i in range(user_df.shape[0]):
        row = user_df.iloc[i]
        expect = user_df.iloc[i: i+k]["partnumber"].values
        if ~np.isnan(row["partnumber"]):
            context = row["partnumber"]
            pred = recommendations(context, k)
            n_match = len(set(pred).intersection(set(expect)))
            precision_k.append(n_match / expect.shape[0])

100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


In [None]:
import numpy as np
from tqdm import tqdm

accuracy = []
k = 20
data = {"partnumber": [], "recommends": []}

for user in tqdm(users):
    user_df = interac_df.loc[interac_df["user_id"] == user]
    for i in range(user_df.shape[0]):
        row = user_df.iloc[i]
        expect = user_df.iloc[i: i+k]["partnumber"].values
        if ~np.isnan(row["partnumber"]):
            context = row["partnuxmber"]
            pred = recommendations(context, k)
            n_match = len(set(pred).intersection(set(expect))) > 0
            accuracy.append(n_match)
            data["partnumber"].append(context)
            data["recommends"].append(pred)

  2%|▏         | 2734/156147 [1:08:42<130:57:59,  3.07s/it] 

In [None]:
pd.DataFrame(data)

In [None]:
np.mean(accuracy)

In [None]:
recommend_df.to_csv("Export/Rec_Content_based.csv", index = False, header=True)