In [1]:
%load_ext autoreload
%autoreload 2
from project_transformer import Data_Wrangling

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from underthesea import word_tokenize, pos_tag, sent_tokenize
import warnings
from gensim import corpora, models, similarities
import gensim
import re

In [3]:
import matplotlib.pyplot as plt 
import seaborn as sns
import glob
import os 
from tqdm.auto import tqdm
import re
import pickle

In [4]:
import scipy.sparse as ss

# Function:

In [5]:
# EMOJI
with open('../DATA/files/emojicon.txt', 'r', encoding="utf8") as file:
    emoji_lst = file.read().split('\n')
    emoji_dict = {}
    for line in emoji_lst:
        key, value = line.split('\t')
        emoji_dict[key] = str(value)

# TEEN CODE
with open('../DATA/files/teencode.txt', 'r', encoding="utf8") as file:
    teen_lst = file.read().split('\n')
    teen_dict = {}
    for line in teen_lst:
        key, value = line.split('\t')
        teen_dict[key] = str(value)

# ENG VIET
with open('../DATA/files/english-vnmese.txt', 'r', encoding="utf8") as file:
    eng_lst = file.read().split('\n')
    eng_dict = {}
    for line in eng_lst:
        key, value = line.split('\t')
        eng_dict[key] = str(value)

# WRONG WORD
with open('../DATA/files/wrong-word.txt', 'r', encoding="utf8") as file:
    wrong_lst = file.read().split('\n')

# STOP WORD
with open('../DATA/files/vietnamese-stopwords.txt', 'r', encoding="utf8") as file:
    stop_lst = file.read().split('\n')

## Build CLEANSER

In [6]:
data_cleanser = Data_Wrangling(emoji_dict=emoji_dict,
                               teen_dict=teen_dict,
                               wrong_lst=wrong_lst,
                               eng_vn_dict=eng_dict,
                               stop_words=stop_lst)

In [7]:
type(data_cleanser)

project_transformer.Data_Wrangling

### Gensim

In [8]:
def recommender_gensim(view_prd: str,
                       cleaner,
                       dict_prd: corpora.Dictionary,
                       tfidf_model: gensim.models.tfidfmodel.TfidfModel,
                       similarity: gensim.similarities.docsim.SparseMatrixSimilarity,
                       prd_df: pd.DataFrame):
    # Convert search words into Sparse Vectors
    clean_txt = cleaner.process_text(text=view_prd)
    clean_txt = cleaner.process_postag_thesea(clean_txt)

    clean_ls = clean_txt.split()
    kw_vector = dict_prd.doc2bow(clean_ls)
    print("View product's vector:")
    print(kw_vector)

    # similarity calculation
    sim = similarity[tfidf_model[kw_vector]]

    # print result
    list_id = []
    list_score = []
    for i in range(len(sim)):
        list_id.append(i)
        list_score.append(sim[i])

    # Create df
    df_res = pd.DataFrame({'item_id': list_id, 'score': list_score})

    # Get 5 highest score
    five_h_score = df_res.sort_values(by='score', ascending=False).head(6)
    print("Five highest scores:")
    print(five_h_score)
    print('IDS to list:')
    id_to_ls = five_h_score['item_id'].to_list()
    print(id_to_ls)

    # Find prd
    prd_find = prd_df[prd_df.index.isin(id_to_ls)]
    prd_results = prd_find[['product_id', 'product_name', 'clean_prd_name', 'sub_category', 'price', 'rating']]
    final = pd.concat([prd_results, five_h_score], axis=1).sort_values(by='score', ascending=False)
    final = final[final['score']!=1]
    return final

### Similarity:

In [9]:
def recommend_cosine(text_,
                     df,
                     cleaner,
                     tf_idf_model,
                     tf_idf_arr,
                     num=5):
    import warnings
    warnings.filterwarnings("ignore")
    # Convert search words into Sparse Vectors
    clean_txt = cleaner.process_text(text=text_)
    clean_txt = cleaner.process_postag_thesea(clean_txt)

    # Get Sparse Matrix from TFIDF
    txt_sparse_matrix = tf_idf_model.transform([clean_txt])

    # Calcualte the cosine score
    cosine_text_ = cosine_similarity(X=txt_sparse_matrix,
                                     Y=tf_idf_arr)

    top5_prd = cosine_text_.flatten().argsort()[-num-1:-1]
    top5_score = [(cosine_text_.flatten()[i]) for i in top5_prd]

    # Extract Product
    ls_item = []
    for id, sim in zip(top5_prd, top5_score):
        sim_item = {'item_id': id, 'score': sim}
        ls_item.append(sim_item)
    print(f"San Pham user nhap: {text_}\n\n")
    for rec in ls_item:
        # print(rec[1])
        print(
            f"Recommnended:\tItem ID: {rec['item_id']}, {df.loc[df['item_id'] == rec['item_id'], 'product_name'].values[0]}, (score: {rec['score']})\n")

    # Create DataFrame
    df_res = pd.DataFrame(ls_item)
    filter_ls = df_res['item_id'].to_list()
    prd_find = df[df['item_id'].isin(filter_ls)]
    prd_results = pd.merge(left=prd_find,
                           right=df_res,
                           on=['item_id'],
                           how='inner')
    
    final = prd_results.sort_values(by='score', ascending=False)
    #final = final[final['score']!=1]
    feat = ['product_id', 'product_name', 'clean_prd_name', 'sub_category', 'price', 'rating','item_id','score']
    return final[feat]

In [10]:
def combine_both(gensim_df, cosim_df):
    combine_df = pd.merge(cosim_df,
                     gensim_df,
                     on=['product_id','product_name','clean_prd_name','sub_category','price','rating'],
                     suffixes=('_gensim', '_simi'), 
                     how='outer').drop(columns=['item_id_gensim','item_id_simi'])
    combine_df[combine_df['rating']>3]
    return combine_df

# Load Product DF:

In [11]:
use_cols = ['item_id','product_id','sub_category','price','rating','clean_desc','product_name','clean_prd_name']
clean_prd_df = pd.read_csv('../DATA/final_details.csv',usecols=use_cols)
clean_prd_df.head()

Unnamed: 0,item_id,product_id,product_name,sub_category,price,rating,clean_desc,clean_prd_name
0,0,190,"Áo ba lỗ thun gân ,form body tôn dáng",Áo Ba Lỗ,86250.0,4.9,áo lỗ chiều đường_phố nhiệt_đới tal fit xuất_x...,áo lỗ thun gân form body
1,1,191,"Áo Ba Lỗ Nam Trắng Chất Cotton Siêu Mát, Siêu Đẹp",Áo Ba Lỗ,26800.0,4.9,áo lỗ xuất_xứ việt_nam tổ_chức trách_nhiệm sản...,áo lỗ nam trắng chất_cotton siêu_mát đẹp
2,2,192,"Áo Ba Lỗ Nam Tyasuo chất vải co dãn mát, không...",Áo Ba Lỗ,39500.0,4.8,áo lỗ thương_hiệu tyasuo chiều áo không_thể đư...,áo lỗ nam tyasuo chất vải co_dãn mát không_xù mềm
3,3,193,ÁO BA LỖ HÀNG VIỆT NAM 100% COTTON,Áo Ba Lỗ,16500.0,4.8,áo lỗ chất_liệu hàng gửi hà_nội áo lỗ nam mặc ...,áo lỗ hàng việt_nam
4,4,194,Áo Thun Nam Thể Thao Ba Lỗ Mẫu Mới Siêu Đẹp (B...,Áo Ba Lỗ,45000.0,4.8,áo lỗ chiều áo không_thể hàng gửi hà_nội thông...,áo thun nam thể_thao lỗ mẫu mới đẹp


# Gensim

## Load Model and Product Dataframe:

In [78]:
# Load the TF-IDF model from the file
loaded_tfidf_model = models.TfidfModel.load('../DATA/Gensim/tfidf_model')

# Load the dictionary from the file
loaded_dictionary = corpora.Dictionary.load('../DATA/Gensim/corpus_dictionary')

# Load the similarity matrx 
loaded_similarity_matrix  = similarities.SparseMatrixSimilarity.load("../DATA/Gensim/similarity_matrix.index")

## Test Fuction:

In [79]:
#clean_prd_df

In [15]:
text_prd = clean_prd_df.loc[100,'product_name']
text_prd = 'kinh nam rất xịn'
text_prd

'kinh nam rất xịn'

In [145]:
results_df = recommender_gensim(view_prd=text_prd,
                                cleaner=data_cleanser,
                                dict_prd=loaded_dictionary,
                                tfidf_model=loaded_tfidf_model,
                                similarity=loaded_similarity_matrix,
                                prd_df=clean_prd_df)

View product's vector:
[(7, 1), (526, 1), (1994, 1), (6595, 1)]
Five highest scores:
       item_id     score
28351    28351  0.518623
30872    30872  0.443889
19026    19026  0.378886
31381    31381  0.366443
33358    33358  0.366443
11035    11035  0.351456
IDS to list:
[28351, 30872, 19026, 31381, 33358, 11035]


In [146]:
results_df

Unnamed: 0,product_id,product_name,clean_prd_name,sub_category,price,rating,item_id,score
28351,113402,kinh nam siêu xin,kinh nam xin,Khác,145000.0,0.0,28351,0.518623
30872,12486,kinh mát nam cao cấp gọng dẻo đẹp nhất,kinh mát_nam cao_cấp gọng dẻo đẹp nhất,Kính Mắt Nam,209000.0,4.5,30872,0.443889
19026,241905,Kinh nam Poscher đổi màu chui tròn,kinh nam poscher đổi màu chui tròn,Đồ Hóa Trang,180000.0,0.0,19026,0.378886
31381,12995,Kính nam xịn,kính nam xịn,Kính Mắt Nam,250000.0,5.0,31381,0.366443
33358,122972,KÍNH NAM XỊN,kính nam xịn,Kính Mắt Nam,550000.0,0.0,33358,0.366443
11035,103621,Cà Vạt Có Khóa Kéo Dễ Kéo Phong Cách Bắc Kinh ...,cà_vạt có khóa kéo dễ kéo phong_cách bắc kinh ...,Cà vạt & Nơ cổ,133000.0,0.0,11035,0.351456


# Similarity

## Load Model and Product Dataframe:

In [12]:
with open('../DATA/tfidf_vectorizer.pkl', 'rb') as f:
    sim_tfidf_model = pickle.load(f)
type(sim_tfidf_model)

sklearn.feature_extraction.text.TfidfVectorizer

In [13]:
sim_tfidf_matrix = ss.load_npz('../DATA/tfidf_matrix.npz')
type(sim_tfidf_matrix)

scipy.sparse._csr.csr_matrix

## Test Fuction:

In [85]:
#clean_prd_df

In [147]:
#text_prd = clean_prd_df.loc[100,'product_name']
text_prd

'kinh nam rất xịn'

In [16]:
sim_res_df = recommend_cosine(text_=text_prd,
                              cleaner=data_cleanser,
                             df=clean_prd_df,
                             tf_idf_model=sim_tfidf_model,
                             tf_idf_arr=sim_tfidf_matrix)

San Pham user nhap: kinh nam rất xịn


Recommnended:	Item ID: 11035, Cà Vạt Có Khóa Kéo Dễ Kéo Phong Cách Bắc Kinh Dành Cho Nam Nữ, (score: 0.43099371373718015)

Recommnended:	Item ID: 19026, Kinh nam Poscher đổi màu chui tròn, (score: 0.4424089801609495)

Recommnended:	Item ID: 33358, KÍNH NAM XỊN, (score: 0.46969046860575886)

Recommnended:	Item ID: 31381, Kính nam xịn, (score: 0.46969046860575886)

Recommnended:	Item ID: 30872, kinh mát nam cao cấp gọng dẻo đẹp nhất, (score: 0.5188618900050489)



In [149]:
sim_res_df

Unnamed: 0,product_id,product_name,clean_prd_name,sub_category,price,rating,item_id,score
2,12486,kinh mát nam cao cấp gọng dẻo đẹp nhất,kinh mát_nam cao_cấp gọng dẻo đẹp nhất,Kính Mắt Nam,209000.0,4.5,30872,0.518862
3,12995,Kính nam xịn,kính nam xịn,Kính Mắt Nam,250000.0,5.0,31381,0.46969
4,122972,KÍNH NAM XỊN,kính nam xịn,Kính Mắt Nam,550000.0,0.0,33358,0.46969
1,241905,Kinh nam Poscher đổi màu chui tròn,kinh nam poscher đổi màu chui tròn,Đồ Hóa Trang,180000.0,0.0,19026,0.442409
0,103621,Cà Vạt Có Khóa Kéo Dễ Kéo Phong Cách Bắc Kinh ...,cà_vạt có khóa kéo dễ kéo phong_cách bắc kinh ...,Cà vạt & Nơ cổ,133000.0,0.0,11035,0.430994


In [150]:
results_df

Unnamed: 0,product_id,product_name,clean_prd_name,sub_category,price,rating,item_id,score
28351,113402,kinh nam siêu xin,kinh nam xin,Khác,145000.0,0.0,28351,0.518623
30872,12486,kinh mát nam cao cấp gọng dẻo đẹp nhất,kinh mát_nam cao_cấp gọng dẻo đẹp nhất,Kính Mắt Nam,209000.0,4.5,30872,0.443889
19026,241905,Kinh nam Poscher đổi màu chui tròn,kinh nam poscher đổi màu chui tròn,Đồ Hóa Trang,180000.0,0.0,19026,0.378886
31381,12995,Kính nam xịn,kính nam xịn,Kính Mắt Nam,250000.0,5.0,31381,0.366443
33358,122972,KÍNH NAM XỊN,kính nam xịn,Kính Mắt Nam,550000.0,0.0,33358,0.366443
11035,103621,Cà Vạt Có Khóa Kéo Dễ Kéo Phong Cách Bắc Kinh ...,cà_vạt có khóa kéo dễ kéo phong_cách bắc kinh ...,Cà vạt & Nơ cổ,133000.0,0.0,11035,0.351456


In [151]:
text_prd

'kinh nam rất xịn'

# Combine BOTH

In [155]:
combine_both(gensim_df=results_df,cosim_df=sim_res_df)

Unnamed: 0,product_id,product_name,clean_prd_name,sub_category,price,rating,score_gensim,score_simi
0,12486,kinh mát nam cao cấp gọng dẻo đẹp nhất,kinh mát_nam cao_cấp gọng dẻo đẹp nhất,Kính Mắt Nam,209000.0,4.5,0.518862,0.443889
1,12995,Kính nam xịn,kính nam xịn,Kính Mắt Nam,250000.0,5.0,0.46969,0.366443
2,122972,KÍNH NAM XỊN,kính nam xịn,Kính Mắt Nam,550000.0,0.0,0.46969,0.366443
3,241905,Kinh nam Poscher đổi màu chui tròn,kinh nam poscher đổi màu chui tròn,Đồ Hóa Trang,180000.0,0.0,0.442409,0.378886
4,103621,Cà Vạt Có Khóa Kéo Dễ Kéo Phong Cách Bắc Kinh ...,cà_vạt có khóa kéo dễ kéo phong_cách bắc kinh ...,Cà vạt & Nơ cổ,133000.0,0.0,0.430994,0.351456
5,113402,kinh nam siêu xin,kinh nam xin,Khác,145000.0,0.0,,0.518623


In [154]:
# combine_df = pd.merge(sim_res_df,
#                      results_df,
#                      on=['product_id','product_name','clean_prd_name','sub_category','price','rating'],
#                      suffixes=('_gensim', '_simi'), 
#                      how='outer').drop(columns=['item_id_gensim','item_id_simi'])
# combine_df[combine_df['rating']>3]