In [1]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import joblib 
nltk.download('wordnet')

file_name = {
    'business':'yelp_academic_dataset_business.json',
    'check_in':'yelp_academic_dataset_checkin.json',
    'review':'yelp_academic_dataset_review.json',
    'tip':'yelp_academic_dataset_tip.json',
    'user':'yelp_academic_dataset_user.json'
} #แปลงชื่อไฟล์เป็น dictionary จะได้เรียกใช้ง่ายๆ

lemmatizer = WordNetLemmatizer() # นำเข้าฟังก์ชันสำหรับทำ lemmatization
def remove_special_characters(text):
    # ใช้ regular expression เพื่อลบสัญลักษณะที่ไม่ใช่ตัวอักษร
    cleaned_text = re.sub(r'[^a-zA-Z0-9ก-๙\s]', '', text)
    cleaned_text = re.sub(r'[&\'(),]', '', cleaned_text)
    return cleaned_text

def lemmatize_tokenize(text):# สำหรับทำ lemmatize และ tokenize
    words = remove_special_characters(text)
    words = word_tokenize(words)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

def time_format(time) : # ทำให้ เวลามี format เป็น HH:mm
    temp = time.split(':')
    if len(temp[0]) == 1 :
        temp[0] = '0'+temp[0]
    if len(temp[1]) == 1 :
        temp[1] = temp[1]+'0'
    return temp[0]+':'+temp[1]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User01\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preparation

In [2]:
business = pd.read_json('dataset/'+file_name['business'],encoding='utf8', lines=True) # อ่านไฟล์ + encode
business = business.dropna(subset=['categories', 'hours']) #drop 'categories', 'hours' ที่เป็นค่าว่าง 
business = business[business['is_open']!=0].reset_index(drop=True) # drop is_open == 0 (ร้านปิดไปแล้ว)
business =  business.drop(['postal_code','attributes','is_open'],axis=1) # drop columns ที่ไม่ใช้
business['categories'] = business['categories'].str.lower() #ปรับเป้นพิมพ์เล็ก
business = business[business['categories'].str.contains('restaurants') | business['categories'].str.contains('food')].reset_index(drop=True) # คัดให้เหลือแค่เกี่ยวกับอาหาร

day =  ['Monday','Tuesday', 'Wednesday', 'Thursday',  'Friday','Saturday', 'Sunday']
day_dic = {
    'Monday':[], 
    'Tuesday':[], 
    'Wednesday':[], 
    'Thursday':[], 
    'Friday':[], 
    'Saturday':[], 
    'Sunday':[]
} # แยกคอลัมภ์ เวลาเปิดเป็นแบบรายวัน

for i in range(0,len(business)) :
    temp = business['hours'].iloc[i]
    for k in list(day_dic.keys()) :
        if k in list(temp.keys()) :
            temp_2 = temp[k].split('-')
            day_dic[k].append(time_format(temp_2[0])+'-'+time_format(temp_2[1]))
        else :
            day_dic[k].append('-')
for k in day_dic :
    business[k] = day_dic[k]
business = business.drop(['hours'],axis=1)
# business.to_excel('processing_data.xlsx',index=False)

# Pre-processing
ด้านล่างจะเป็นส่วนของการทำ tfidf ของส่วนที่ user จะ query มา

In [78]:
city_tf_idf = TfidfVectorizer(
    tokenizer=lemmatize_tokenize
)
city_features = city_tf_idf.fit_transform(business['city']).toarray()

state_tf_idf = TfidfVectorizer(
    tokenizer=lemmatize_tokenize
)
state_features = state_tf_idf.fit_transform(business['state']).toarray()

name_tf_idf = TfidfVectorizer(
    tokenizer=lemmatize_tokenize
)
name_features = name_tf_idf.fit_transform(business['name']).toarray()

categories_tf_idf = TfidfVectorizer(
    tokenizer=lemmatize_tokenize
)
categories_features = categories_tf_idf.fit_transform(business['categories']).toarray()

# สำหรับทำ user preferrence
# category_list = list(categories_tf_idf.get_feature_names_out())
# user_preference = pd.DataFrame(columns = ['user']+category_list,data = [ ['test']+list(np.zeros(len(category_list)))])




In [4]:
user_preference

Unnamed: 0,user,acai,accessory,accountant,active,activity,acupuncture,adoption,adult,advertising,...,wine,winery,wing,woman,wrap,yelp,yoga,yogurt,your,zoo
0,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
city_tf_idf = joblib.load('models/city_tf_idf.joblib')
state_tf_idf = joblib.load('models/state_tf_idf.joblib')
name_tf_idf = joblib.load('models/name_tf_idf.joblib')
categories_tf_idf = joblib.load('models/categories_tf_idf.joblib')

city_features = city_tf_idf.transform(business['city']).toarray()
state_features = state_tf_idf.transform(business['state']).toarray()
name_features = name_tf_idf.transform(business['name']).toarray()
categories_features = categories_tf_idf.transform(business['categories']).toarray()

In [99]:
test = joblib.load('models/city_tf_idf.joblib')
test

In [79]:
text ='''name : <name>
address : <address>
city : <city>
state : <state>
categories : <categories>
review count : <review_count>
stars : <stars>
latitude : <latitude>, longitude: <longitude>
open : <time>
-----------------------------------------------
'''
columns_print =['name','address','city','state','review_count','latitude','longitude','stars','categories']
data_sent = '' 
while True :
    query_test = input("Query : ") #query
    if query_test == 'stop' :
        break
    #หา cosine similarity
    name_result = cosine_similarity(name_tf_idf.transform([query_test]),name_features)
    city_result = cosine_similarity(city_tf_idf.transform([query_test]),city_features)
    state_result = cosine_similarity(state_tf_idf.transform([query_test]),state_features)
    time = compute_time(query_test)
    categories_result = cosine_similarity(categories_tf_idf.transform([query_test]),categories_features)

    result_cosine = name_result[0]+city_result[0]+state_result[0]+categories_result[0]# นำผลลัพธ์ค่า cosine ในแต่ละอันมาบวกกัน
    
    result_business = business.copy()
    result_business['cosine_similarity'] = result_cosine # นำค่า cosine มาใส่ column cosine_similarity ที่เพิ่ม
    data_sent = result_business[result_business['cosine_similarity']>=0.2].sort_values(['cosine_similarity'],ascending=False).reset_index(drop=True).copy() #นำมาเรียงและสนใจเฉพาะที่ค่าความใกล้มากกว่าเท่ากับ 1
    for i in range(0,10) :
        temp_text = text
        for column in columns_print :
            temp_columns = '<'+column+'>'
            temp_text = temp_text.replace(temp_columns,str(data_sent[column].iloc[i]))
        time = ""
        for d in day :
            if result_business[d].iloc[i] != '-' :
                time += '\n'+d+' : '+ result_business[d].iloc[i]
        
        temp_text = temp_text.replace('<time>',time)       
        print(temp_text)#แสดงผลลัพธ์
    
#     prefer = lemmatize_tokenize(query_test)
#     for c in prefer :
#         if c in category_list :
#             user_preference[c].iloc[0] +=1 # บึนทึก user preferrence

# result_prefer = user_preference.copy().T.reset_index()
# result_prefer.columns = ['catagory','count']
# result_prefer = result_prefer.sort_values(['count'],ascending=False).reset_index(drop=True)
# result_prefer = result_prefer[result_prefer['count']!=0].iloc[0:10]

# plt.bar(result_prefer['catagory'],result_prefer['count']) # แสดงกราฟที่บันทึกไว้
# plt.show()

Query : I am a solo traveler looking for seafood spot with a waterfront view. I can afford expensive dishes.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_preference[c].iloc[0] +=1 # บึนทึก user preferrence


name : C & A Seafood
address : 1429 S Norman C Francis Pkwy
city : New Orleans
state : LA
categories : restaurants, seafood
review count : 65
stars : 3.5
latitude : 29.9564929899, longitude: -90.1059199196
open : 
Monday : 07:00-20:00
Tuesday : 07:00-20:00
Wednesday : 07:00-20:00
Thursday : 07:00-20:00
Friday : 07:00-21:00
Saturday : 07:00-21:00
Sunday : 07:00-21:00
-----------------------------------------------

name : I Got Crabs N Some
address : 150 S Delsea Dr
city : Glassboro
state : NJ
categories : seafood, restaurants
review count : 7
stars : 3.5
latitude : 39.6900948339, longitude: -75.1011052553
open : 
Wednesday : 14:00-22:00
Thursday : 16:00-22:00
Friday : 12:00-22:00
Saturday : 12:00-22:00
Sunday : 12:00-18:00
-----------------------------------------------

name : Chris' Pizza Village Pleasant View
address : 244 Village Sq Pleasent, Ste 100
city : View
state : TN
categories : chicken wings, italian, salad, pizza, restaurants
review count : 31
stars : 4.0
latitude : 36.387

KeyboardInterrupt: Interrupted by user

# ด้านล่างใช้สำหรับบันทึกโมเดล TF-IDF

In [118]:
# import pickle
# pickle.dump(city_tf_idf.vocabulary_,open("city_tf_idf.pkl","wb"))
# pickle.dump(state_tf_idf.vocabulary_,open("state_tf_idf.pkl","wb"))
# pickle.dump(name_tf_idf.vocabulary_,open("name_tf_idf.pkl","wb"))
# pickle.dump(categories_tf_idf.vocabulary_,open("categories_tf_idf.pkl","wb"))

In [119]:
# city_tf_idf = TfidfVectorizer(
#     tokenizer=lemmatize_tokenize,
#     vocabulary=pickle.load(open("city_tf_idf.pkl", "rb"))
# )
# state_tf_idf = TfidfVectorizer(
#     tokenizer=lemmatize_tokenize,
#     vocabulary=pickle.load(open("state_tf_idf.pkl", "rb"))
# )
# name_tf_idf = TfidfVectorizer(
#     tokenizer=lemmatize_tokenize,
#     vocabulary=pickle.load(open("name_tf_idf.pkl", "rb"))
# )
# categories_tf_idf = TfidfVectorizer(
#     tokenizer=lemmatize_tokenize,
#     vocabulary=pickle.load(open("categories_tf_idf.pkl", "rb"))
# )