In [None]:
# importing inportant libraries
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import gensim.utils
import gensim.downloader as api
from gensim.utils import simple_preprocess
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from flask import Flask, render_template, request

**讀取資料 Load File**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv(r'/content/drive/MyDrive/all_recipe.csv')
data

Mounted at /content/drive


Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"['4 skinless, boneless chicken breast halves A...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,['2 (10.75 ounce) cans condensed cream of mush...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"['1/2 cup packed brown sugar ADVERTISEMENT', '...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"['1 cup butter, softened ADVERTISEMENT', '1 cu...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,['8 ounces whole wheat rotini pasta ADVERTISEM...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
114205,Apple Chimichangas,"['1/4 cup butter', '1/3 cup sugar', '1 teaspoo...",Watch how to make this recipe.\nHeat a heavy m...
114206,Coconut-Kaffir Leaf Poached Halibut with Saute...,"['2 cups coconut milk', '1 short stalk lemon g...",Preheat the oven to 250 degrees F\nIn a small ...
114207,Chicken Braised with 20 Cloves of Garlic,"['2 large heads garlic', '1 cut-up chicken, ab...",Bring a small saucepan of water to a boil. Sep...
114208,Cream Horns,"['1 sheet frozen puff pastry, thawed', '1 egg'...",Grease 8 cream horn metal cones. Cut the puff ...


In [None]:
# 去除雜字 "ADVERTISEMENT"
# Remove dummy words "ADVERTISEMENT"
recipe = data['title'].astype(str) + ' ' + data['ingredients'].astype(str) + ' ' + data['instructions'].astype(str)
recipe = [re.sub(r'\b\w*ADVERTISEMENT\w*\b', '', sentence) for sentence in recipe]
recipe = [''.join(words) for words in recipe]
recipe = pd.Series(recipe)
recipe

0         Slow Cooker Chicken and Dumplings ['4 skinless...
1         Awesome Slow Cooker Pot Roast ['2 (10.75 ounce...
2         Brown Sugar Meatloaf ['1/2 cup packed brown su...
3         Best Chocolate Chip Cookies ['1 cup butter, so...
4         Homemade Mac and Cheese Casserole ['8 ounces w...
                                ...                        
114205    Apple Chimichangas ['1/4 cup butter', '1/3 cup...
114206    Coconut-Kaffir Leaf Poached Halibut with Saute...
114207    Chicken Braised with 20 Cloves of Garlic ['2 l...
114208    Cream Horns ['1 sheet frozen puff pastry, thaw...
114209    Chocolate Cake with Armagnac Ice Cream ['8 oun...
Length: 114210, dtype: object

# 試試TF-IDF!

In [None]:
# 用TF-IDF把食譜轉換成代碼，並存至字詞資料庫
# Convert recipes into code using TF-IDF and save them to the word database
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=0.05, max_df=0.9, ngram_range = (1,3))
tfidf.fit(recipe)

In [None]:
# 對食譜進行 TF-IDF 轉換
# Perform TF-IDF transformation on recipes.
recipe_data = tfidf.fit_transform(recipe)

In [None]:
# 隨機提取問題資料庫中的問題，並將其放入列表中
# Allow user to input query
question = input("What is in your mind? \n")

# 對問題進行 TF-IDF 轉換
# Perform TF-IDF transformation on user's query.
X = tfidf.transform([question])

# 計算餘弦相似度
# Calculate cosine similarity
cos_X = cosine_similarity(X, recipe_data)

# 將相似度矩陣降序排序並取前三個最大值的索引
# Get the top 3 results (recipes) that have high cosine similarity
top_indices = np.argsort(cos_X[0])[-3:][::-1]

print(question)
print('---------------------------------------------------------------------------')

# 打印相似度最高的三筆資料
# Print the results
for i, index in enumerate(top_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {cos_X[0][index]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')

What is in your mind? 
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.5009983990147417:
*Title:
Soothing Hot Ginger Tea  

*Ingredients:
'1 (12 fl oz) can ginger ale (such as Canada Dry®) ', '1 black tea bag (such as Lipton®) ', '' 

*Instructions:
 Pour ginger ale into a microwave-safe mug. Heat in the microwave for 1 to 2 minutes.
Steep tea bag in the hot ginger ale for 3 to 5 minutes.
 



Recommendation 2 , The similarity is 0.49697131070897377:
*Title:
Hot Apple-Ginger Toddy  

*Ingredients:
'6 oz. ginger-infused apple cider', 'Thin slice of fresh ginger or one piece of crystallized, candied ginger', '1 tsp. honey', "2 oz. bourbon (Maker's Mark or Jim Beam are preferred)", 'Slice of lem

在TF-IDF上加SVD降維 / TFIDF + SVD

In [None]:
# 用 SVD 把資料降維度至10
# Use SVD to reduce dimension to 10
svd = TruncatedSVD(10)
R = svd.fit_transform(recipe_data) # 用TF-IDF轉換過的recipe 降維 / Apply SVD on TFIDF-transformed recipe
Q = svd.transform(X) # 用TF-IDF轉換過的qestion data 降維 / Apply SVD on TFIDF-transformed user's query

# 計算餘弦相似度
# Calculate Cosine Similarity
cos_X = cosine_similarity(Q, R)

# 將相似度矩陣降序排序並取前三個最大值的索引
# Get the top 3 results (recipes) that have high cosine similarity
top_indices = np.argsort(cos_X[0])[-3:][::-1]

print(question)
print('---------------------------------------------------------------------------')

# 打印相似度最高的三筆資料
# Print the results
for i, index in enumerate(top_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {cos_X[0][index]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')

I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.9460398688897547:
*Title:
Cocktail Wieners I  

*Ingredients:
'1 (16 ounce) can cranberry sauce ', '12 fluid ounces chili sauce ', '3 pounds beef cocktail wieners ', '' 

*Instructions:
 In a 4-quart saucepan over medium heat, combine cranberry sauce and chili sauce. Break the cranberry sauce into smaller pieces with wooden spoon to speed up the melting process. Stir and heat until the cranberry sauce is melted. Add the cocktail wieners and cook until the wieners are heated. Use toothpicks for serving.
 



Recommendation 2 , The similarity is 0.9429141505896389:
*Title:
Memphis Style Dry Ribs: Corky's Ribs  

*Ingredients:
'One 2- to 2 3/4-pound slab raw St. Louis cut pork spare ribs', '1 1/2 cups water (3 parts)', "1/2 cup favorite BBQ sauce (1 part), plus 

##### TF-IDF + SVD降維 的結果比較差! 所以TF-IDF的模型決定不用SVD / TFIDF + SVD shows a worse result! Thus, don't appply SVD on TFIDF here

# 試試word2vec! / Try Word2Vec

In [None]:
toks = []
for s in recipe:
  toks.append(simple_preprocess(s))

# vector_size: 這個參數指定了每個單詞向量的維度。在這個例子中，每個單詞將被表示為一個200維的向量
# window: 這個參數指定了在訓練過程中考慮的上下文窗口的大小。具體地說，它表示了在訓練過程中，每個單詞的上下文窗口可以包含的單詞數量
# min_count: 這個參數指定了訓練過程中考慮的最小詞頻。具體地說，如果一個單詞在文本數據中出現的次數少於 min_count，那麼它將被忽略不計。
# workers=4 表示使用 4 個線程來訓練模型，這樣可以利用多核處理器提高訓練效率
model = Word2Vec(toks, vector_size=400, window=5, min_count=0, workers=4)

# 提取詞向量：獲得訓練好的詞向量模型後，從模型中提取詞向量。這裡將 model.wv 中的詞向量提取出來，並保存在 word_vectors 中。
word_vectors = model.wv

# 看看sweet 被分類到甚麼同意字
# See what words are considered to be in the same vector with the word "sweet"
word_vectors.most_similar_cosmul(['sweet'])

[('idaho', 0.7747125029563904),
 ('russet', 0.7643802165985107),
 ('fingerling', 0.7537190914154053),
 ('waxy', 0.7322282195091248),
 ('lovin', 0.7312313318252563),
 ('brien', 0.7310512065887451),
 ('shoestring', 0.7225973606109619),
 ('hasselback', 0.7218263745307922),
 ('scalloped', 0.7189490795135498),
 ('flanagan', 0.7174539566040039)]

## Note:
* 在計算TF-IDF（Term Frequency-Inverse Document Frequency）的餘弦相似度時，不需要考慮向量的大小:
是因為TF-IDF的值是以每個文檔中每個詞的相對重要性為基礎計算的。當計算TF-IDF向量時，向量的大小（即向量的長度）已經被標準化，因此餘弦相似度計算僅僅考慮向量之間的角度。

* 而在Word2Vec中，每個詞的詞向量表示其在向量空間中的位置，而這些向量的大小（即向量的長度）通常也是重要的。
由於Word2Vec中的詞向量通常沒有被標準化，因此其大小（即向量的長度）可能會影響餘弦相似度的計算。為了確保比較的公平性，通常在計算Word2Vec向量的餘弦相似度時，會對詞向量進行正規化，以使它們具有單位長度。

In [None]:
# question = input("What is in your mind?")
# 將question轉成單詞列表
# Convert the question into a list of words.
question_tokens = simple_preprocess(question)
print(question_tokens)

# 獲得question的向量表示
# Obtain the vector representation of the question
question_vec = word_vectors[question_tokens]
print('question_vec_length:',len(question_vec))

# 計算question_vec的平均詞向量，以利之後比較cosine similarity
# Calculate the average word vector of the question_vec for better comparison of cosine similarity later.
question_avg_vec = np.mean([word_vectors[token] for token in question_tokens if token in word_vectors], axis=0)
# 檢查是否有 NaN 值，如果有的話，則將其替換為 0
# Replace those null values
question_avg_vec = np.nan_to_num(question_avg_vec)
# 正規化向量（可選）
# Normalize the vector (optional).
question_avg_vec /= np.linalg.norm(question_avg_vec)

['have', 'tofu', 'and', 'soy', 'sauce', 'and', 'ginger', 'can', 'you', 'recommend', 'savory', 'dish', 'that', 'can', 'be', 'cooked', 'in', 'medium', 'minutes']
question_vec_length: 19


In [None]:
# 對於question資料庫中的每一個問題，找到最相近的三個食譜
print(question)
print('---------------------------------------------------------------------------')

# recipe = data['title'].astype(str) + ' ' + data['ingredients'].astype(str) + ' ' + data['instructions'].astype(str)

recipe_similarity = []
for content in recipe:
    content_tokens = simple_preprocess(content)
    #content_vec = word_vectors[content_tokens]
    # 計算平均詞向量
    recipe_avg_vec = np.mean([word_vectors[token] for token in content_tokens if token in word_vectors], axis=0)
    # 檢查是否有 NaN 值，如果有的話，則將其替換為 0
    recipe_avg_vec = np.nan_to_num(recipe_avg_vec)
    # 正規化向量（可選）
    recipe_avg_vec /= np.linalg.norm(recipe_avg_vec)
    recipe_similarity.append(np.dot(question_avg_vec, recipe_avg_vec))

# 抓相似度前三的食譜index以及similarity
# Get the top 3 results (recipes) that have high similarity
top3_indices = np.argsort(recipe_similarity)[::-1][:3]

for i, index in enumerate(top3_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {recipe_similarity[index]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')

I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.5385329127311707:
*Title:
African Ground Nut Stew  

*Ingredients:
'1 onion, diced', '2 to 3 pounds boneless chicken cut into chunks (I prefer thigh meat)', '1/2 jar natural peanut butter (no sugar or stabilizers added)', '1 can coconut milk (often in Asian grocery section)', '2 quart chicken stock, canned or homemade', '1 large bunch collard or other greens, chopped fairly finely and after removing center ribs (frozen, drained greens can be used as a substitute)', 'Sambal oelek, to taste (hot chili paste from Asian grocery section)', 'Cooked rice, as an accompaniment' 

*Instructions:
 Saute onions until translucent in a large saucepan. Add chicken pieces and saute until golden but not necessarily cooked through. Add coconut milk and stock and bring to a sim

In [None]:
# 進行 SVD 降維至10
# Apply SVD on Word2Vec
svd = TruncatedSVD(10)
svd.fit(word_vectors.vectors)
word_vectors_svd = svd.transform(word_vectors.vectors)

# 計算 question_vec 的平均詞向量，以利之後比較 cosine similarity
# Calculate the similarity score
question_avg_vec = np.mean([word_vectors_svd[word_vectors.index_to_key.index(token)] for token in question_tokens if token in word_vectors], axis=0)
# 檢查是否有 NaN 值，如果有的話，則將其替換為 0
question_avg_vec = np.nan_to_num(question_avg_vec)
# 正規化向量（可選）
question_avg_vec /= np.linalg.norm(question_avg_vec)

# 對於 question 資料庫中的每一個問題，找到最相近的三個食譜
print(question)
print('---------------------------------------------------------------------------')

recipe_similarity = []
for content in recipe:
    content_tokens = simple_preprocess(content)
    # 計算平均詞向量
    recipe_avg_vec = np.mean([word_vectors_svd[word_vectors.index_to_key.index(token)] for token in content_tokens if token in word_vectors], axis=0)
    # 檢查是否有 NaN 值，如果有的話，則將其替換為 0
    recipe_avg_vec = np.nan_to_num(recipe_avg_vec)
    # 正規化向量（可選）
    recipe_avg_vec /= np.linalg.norm(recipe_avg_vec)
    recipe_similarity.append(np.dot(question_avg_vec, recipe_avg_vec))

# 抓相似度前三的食譜index以及similarity
# Get the top 3 results (recipes) that have high similarity
top3_indices = np.argsort(recipe_similarity)[::-1][:3]

for i, index in enumerate(top3_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {recipe_similarity[index]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')

I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.8685928583145142:
*Title:
African Ground Nut Stew  

*Ingredients:
'1 onion, diced', '2 to 3 pounds boneless chicken cut into chunks (I prefer thigh meat)', '1/2 jar natural peanut butter (no sugar or stabilizers added)', '1 can coconut milk (often in Asian grocery section)', '2 quart chicken stock, canned or homemade', '1 large bunch collard or other greens, chopped fairly finely and after removing center ribs (frozen, drained greens can be used as a substitute)', 'Sambal oelek, to taste (hot chili paste from Asian grocery section)', 'Cooked rice, as an accompaniment' 

*Instructions:
 Saute onions until translucent in a large saucepan. Add chicken pieces and saute until golden but not necessarily cooked through. Add coconut milk and stock and bring to a sim

##### word2vec的推薦結果肉眼可見的比TF-IDF差，但word2vec + SVD降維 的結果更差! 所以決定不用word2vec + SVD推薦了 / The result of Word2Vec seems terrible, and don't even mention Word2Vec + SVD! Thus, don't use Word2Vec apprach here

#試試 Count Vectorizer / Try Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# 轉vectorize
# Transform to vectorize
vectorizer = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words='english')
recipe_bow = vectorizer.fit_transform(recipe)
question_bow = vectorizer.transform([question])

# 算 Jaccard Similarity
# Calculte Jaccard Similarity
recipe_intersections = recipe_bow.multiply(question_bow).sum(axis=1)
recipe_unions = recipe_bow.sum(axis=1) + question_bow.sum(axis=1) - recipe_intersections
jaccard_similarity = recipe_intersections / recipe_unions

# 抓前三相似
# Get the top 3 results (recipes) that have high similarity
top_indices = jaccard_similarity.argsort(axis=0)[-3:][::-1]
top_indices = np.ravel(top_indices)

print(question)
print('---------------------------------------------------------------------------')

for i, index in enumerate(top_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {jaccard_similarity[index][0]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')


I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is [[0.2962963]]:
*Title:
Soy Butter Sauce  

*Ingredients:
'1 tablespoon oyster sauce', '1 tablespoon soy sauce', '1 pound butter' 

*Instructions:
 Heat the oyster sauce and soy sauce and bring to a boil, then whisk in butter. 



Recommendation 2 , The similarity is [[0.23577236]]:
*Title:
Grilled Pork Chops with Bourbon-Mustard Glaze   

*Ingredients:
'1/3 cup bottled chili sauce', '1/4 cup bourbon', '1 1/2 tablespoons Dijon mustard', '1 1/2 tablespoons reduced-sodium soy sauce', '4 thin-cut pork rib chops (each about 1/4 to 1/3 inch thick)' 

*Instructions:
 Prepare barbecue (medium-high heat). Combine chili sauce, bourbon, mustard and soy sauce in heavy medium saucepan. Simmer over medium heat until sauce is reduced enough to coat spoon, whisking occasionall

# 結合 TF-IDF 和 CountVectorizer，輸出更好的推薦: / Combine TF-IDF and CountVectorizer to a mixed approach

In [None]:
cos_X = np.ravel(cos_X)
cos_X

array([0.05226535, 0.        , 0.066961  , ..., 0.1122359 , 0.0077773 ,
       0.01781115])

In [None]:
jaccard_similarity = np.ravel(jaccard_similarity)
jaccard_similarity

array([0.02702703, 0.        , 0.00952381, ..., 0.05687204, 0.00684932,
       0.015625  ])

In [None]:
similarity = cos_X + jaccard_similarity
similarity

array([0.07929238, 0.        , 0.07648481, ..., 0.16910794, 0.01462662,
       0.03343615])

In [None]:
# 抓前三相似
# Get the top 3 results (recipes) that have high similarity
top_indices = similarity.argsort(axis=0)[-3:]

print(top_indices)
print(question)
print('---------------------------------------------------------------------------')

for i, index in enumerate(top_indices, 1):
    recipe_info = re.split('\[|\]', recipe[index])
    print(f"Recommendation {i} , The similarity is {similarity[index]}:")
    print('*Title:')
    print(recipe_info[0],'\n')
    print('*Ingredients:')
    print(recipe_info[1],'\n')
    print('*Instructions:')
    print(recipe_info[2],'\n')
    print('\n')

[ 20566 106391  19739]
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)?
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.649810640831991:
*Title:
Sesame Ginger Sauce  

*Ingredients:
'2 tablespoons soy sauce ', '1 tablespoon Dijon mustard ', '1/4 teaspoon sesame oil ', '1/4 teaspoon grated fresh ginger root ', '2 1/2 teaspoons water ', '' 

*Instructions:
 In a small bowl, whisk together soy sauce, mustard, sesame oil, ginger root, and water.
 



Recommendation 2 , The similarity is 0.6499124871795621:
*Title:
Hot Apple-Ginger Toddy  

*Ingredients:
'6 oz. ginger-infused apple cider', 'Thin slice of fresh ginger or one piece of crystallized, candied ginger', '1 tsp. honey', "2 oz. bourbon (Maker's Mark or Jim Beam are preferred)", 'Slice of lemon' 

*Instructions:
 First - Ginger-infuse the cider: Peel and chop the ginger. Bring apple cider to a boil in 

# 試試先把料理分類再比對tf-idf的cosine similarity，並產出最後的model / Try categorizing dishes first, then compare the cosine similarity of TF-IDF, and generate the final model.

In [None]:
# 把recipe 逐詞拆開以利比對
# Categorize the recipe based on key words
recipe1 = recipe.apply(lambda x: x.split())

indian_cuisine = ['Indian']
mexican_cuisine = ['Mexican']
french_cuisine = ['French']
italian_cuisine = ['Italian']
japanese_cuisine = ['Japanese']
korean_cuisine = ['Korean']
spanish_cuisine = ['Spanish']
thai_cuisine = ['Thai']
american_cuisine = ['American']
chinese_cuisine = ['Chinese']


# 建立空的子recipe
# Create empty list
recipe_indian = []
recipe_mexican = []
recipe_french = []
recipe_italian = []
recipe_japanese = []
recipe_korean = []
recipe_spanish = []
recipe_thai = []
recipe_american = []
recipe_chinese = []

# classify cuisine
for index,word_list in enumerate(recipe1):
    # 轉小寫
    # Transform the words to lowercase
    lowercase_word_list = [word.lower() for word in word_list]

    # 分類recipe
    # Categorize
    if any(keyword.lower() in lowercase_word_list for keyword in indian_cuisine):
        recipe_indian.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in mexican_cuisine):
        recipe_mexican.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in french_cuisine):
        recipe_french.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in italian_cuisine):
        recipe_italian.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in japanese_cuisine):
        recipe_japanese.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in korean_cuisine):
        recipe_korean.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in spanish_cuisine):
        recipe_spanish.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in thai_cuisine):
        recipe_thai.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in american_cuisine):
        recipe_american.append(recipe[index])
    elif any(keyword.lower() in lowercase_word_list for keyword in chinese_cuisine):
        recipe_chinese.append(recipe[index])

recipe_indian = pd.Series(recipe_indian)
recipe_mexican = pd.Series(recipe_mexican)
recipe_french = pd.Series(recipe_french)
recipe_italian = pd.Series(recipe_italian)
recipe_japanese = pd.Series(recipe_japanese)
recipe_korean = pd.Series(recipe_korean)
recipe_spanish = pd.Series(recipe_spanish)
recipe_thai = pd.Series(recipe_thai)
recipe_american = pd.Series(recipe_american)
recipe_chinese = pd.Series(recipe_chinese)

In [None]:
recipe_chinese

0      Chinese Chicken Fried Rice II ['1 egg ', '1 ta...
1      Mama's Asian Chicken and Rice ['1/3 cup warm w...
2      Amber's Sesame Chicken ['1 cup all-purpose flo...
3      Asian Chicken Salad ['2 tablespoons brown suga...
4      Chinese Pepper Steak ['1 pound beef top sirloi...
                             ...                        
762    Seared Five-Spice Duck Breast with Snow Peas a...
763    Sweet and Sour Curry Spring Rolls ['1 pound sh...
764    Tea-Smoked Duck Legs with Mushroom and Orzo Ra...
765    Beef in Oyster Sauce ['12 ounces beef fillet (...
766    Coconut-Kaffir Leaf Poached Halibut with Saute...
Length: 767, dtype: object

In [None]:
# 保存到Google Drive以利後續使用
# Stored the data in drive for further use
recipe_indian.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_indian.csv', index=True)
recipe_mexican.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_mexican.csv', index=True)
recipe_french.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_french.csv', index=True)
recipe_italian.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_italian.csv', index=True)
recipe_japanese.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_japanese.csv', index=True)
recipe_korean.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_korean.csv', index=True)
recipe_spanish.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_spanish.csv', index=True)
recipe_thai.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_thai.csv', index=True)
recipe_american.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_american.csv', index=True)
recipe_chinese.to_csv('/content/drive/My Drive/Recipe Recommendation/recipe_chinese.csv', index=True)


In [None]:
import nltk
from nltk.corpus import stopwords

# 下载停用词集合 "english"
# download the stopwords
nltk.download('stopwords')

# 获取英文停用词列表
# get the stopwords
english_stopwords = stopwords.words('english')

# 添加额外的停用词
# extend the stopwords (as our experiments show that the word "make" are frequent but not included in the stopwords)
english_stopwords.append('make')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
english_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# 事先對各子資料集進行tfidf轉換
# Perform TF-IDF transformation on each sub dataset in advance.
tfidf_ind = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_ind = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_ind.fit(recipe_indian)
recipe_ind = tfidf_ind.fit_transform(recipe_indian)
vectorizer_ind.fit(recipe_indian)
recipe_IND = vectorizer_ind.fit_transform(recipe_indian)
# 保存模型到文件
# store the models in my drive
joblib.dump(tfidf_ind, '/content/drive/My Drive/Recipe Recommendation/tfidf_ind.pkl')
joblib.dump(vectorizer_ind, '/content/drive/My Drive/Recipe Recommendation/vectorizer_ind.pkl')
joblib.dump(recipe_ind, '/content/drive/My Drive/Recipe Recommendation/recipe_ind.pkl')
joblib.dump(recipe_IND, '/content/drive/My Drive/Recipe Recommendation/recipe_IND.pkl')


# 2
tfidf_mex = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_mex = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_mex.fit(recipe_mexican)
vectorizer_mex.fit(recipe_mexican)
recipe_mex = tfidf_mex.fit_transform(recipe_mexican)
recipe_MEX = vectorizer_mex.fit_transform(recipe_mexican)
joblib.dump(tfidf_mex, '/content/drive/My Drive/Recipe Recommendation/tfidf_mex.pkl')
joblib.dump(vectorizer_mex, '/content/drive/My Drive/Recipe Recommendation/vectorizer_mex.pkl')
joblib.dump(recipe_mex, '/content/drive/My Drive/Recipe Recommendation/recipe_mex.pkl')
joblib.dump(recipe_MEX, '/content/drive/My Drive/Recipe Recommendation/recipe_MEX.pkl')


# 3
tfidf_fre = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_fre = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_fre.fit(recipe_french)
vectorizer_fre.fit(recipe_french)
recipe_fre = tfidf_fre.fit_transform(recipe_french)
recipe_FRE = vectorizer_fre.fit_transform(recipe_french)
joblib.dump(tfidf_fre, '/content/drive/My Drive/Recipe Recommendation/tfidf_fre.pkl')
joblib.dump(vectorizer_fre, '/content/drive/My Drive/Recipe Recommendation/vectorizer_fre.pkl')
joblib.dump(recipe_fre, '/content/drive/My Drive/Recipe Recommendation/recipe_fre.pkl')
joblib.dump(recipe_FRE, '/content/drive/My Drive/Recipe Recommendation/recipe_FRE.pkl')

# 4
tfidf_ita = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_ita = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_ita.fit(recipe_italian)
vectorizer_ita.fit(recipe_italian)
recipe_ita = tfidf_ita.fit_transform(recipe_italian)
recipe_ITA = vectorizer_ita.fit_transform(recipe_italian)
joblib.dump(tfidf_ita, '/content/drive/My Drive/Recipe Recommendation/tfidf_ita.pkl')
joblib.dump(vectorizer_ita, '/content/drive/My Drive/Recipe Recommendation/vectorizer_ita.pkl')
joblib.dump(recipe_ita, '/content/drive/My Drive/Recipe Recommendation/recipe_ita.pkl')
joblib.dump(recipe_ITA, '/content/drive/My Drive/Recipe Recommendation/recipe_ITA.pkl')


# 5
tfidf_jap = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_jap = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_jap.fit(recipe_japanese)
vectorizer_jap.fit(recipe_japanese)
recipe_jap = tfidf_jap.fit_transform(recipe_japanese)
recipe_JAP = vectorizer_jap.fit_transform(recipe_japanese)
joblib.dump(tfidf_jap, '/content/drive/My Drive/Recipe Recommendation/tfidf_jap.pkl')
joblib.dump(vectorizer_jap, '/content/drive/My Drive/Recipe Recommendation/vectorizer_jap.pkl')
joblib.dump(recipe_jap, '/content/drive/My Drive/Recipe Recommendation/recipe_jap.pkl')
joblib.dump(recipe_JAP, '/content/drive/My Drive/Recipe Recommendation/recipe_JAP.pkl')


# 6
tfidf_kor = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_kor = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_kor.fit(recipe_korean)
vectorizer_kor.fit(recipe_korean)
recipe_kor = tfidf_kor.fit_transform(recipe_korean)
recipe_KOR = vectorizer_kor.fit_transform(recipe_korean)
joblib.dump(tfidf_kor, '/content/drive/My Drive/Recipe Recommendation/tfidf_kor.pkl')
joblib.dump(vectorizer_kor, '/content/drive/My Drive/Recipe Recommendation/vectorizer_kor.pkl')
joblib.dump(recipe_kor, '/content/drive/My Drive/Recipe Recommendation/recipe_kor.pkl')
joblib.dump(recipe_KOR, '/content/drive/My Drive/Recipe Recommendation/recipe_KOR.pkl')


# 7
tfidf_spa = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_spa = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_spa.fit(recipe_spanish)
vectorizer_spa.fit(recipe_spanish)
recipe_spa = tfidf_spa.fit_transform(recipe_spanish)
recipe_SPA = vectorizer_spa.fit_transform(recipe_spanish)
joblib.dump(tfidf_spa, '/content/drive/My Drive/Recipe Recommendation/tfidf_spa.pkl')
joblib.dump(vectorizer_spa, '/content/drive/My Drive/Recipe Recommendation/vectorizer_spa.pkl')
joblib.dump(recipe_spa, '/content/drive/My Drive/Recipe Recommendation/recipe_spa.pkl')
joblib.dump(recipe_SPA, '/content/drive/My Drive/Recipe Recommendation/recipe_SPA.pkl')


# 8
tfidf_tha = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_tha = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_tha.fit(recipe_thai)
vectorizer_tha.fit(recipe_thai)
recipe_tha = tfidf_tha.fit_transform(recipe_thai)
recipe_THA = vectorizer_tha.fit_transform(recipe_thai)
joblib.dump(tfidf_tha, '/content/drive/My Drive/Recipe Recommendation/tfidf_tha.pkl')
joblib.dump(vectorizer_tha, '/content/drive/My Drive/Recipe Recommendation/vectorizer_tha.pkl')
joblib.dump(recipe_tha, '/content/drive/My Drive/Recipe Recommendation/recipe_tha.pkl')
joblib.dump(recipe_THA, '/content/drive/My Drive/Recipe Recommendation/recipe_THA.pkl')


# 9
tfidf_ame = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_ame = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_ame.fit(recipe_american)
vectorizer_ame.fit(recipe_american)
recipe_ame = tfidf_ame.fit_transform(recipe_american)
recipe_AME = vectorizer_ame.fit_transform(recipe_american)
joblib.dump(tfidf_ame, '/content/drive/My Drive/Recipe Recommendation/tfidf_ame.pkl')
joblib.dump(vectorizer_ame, '/content/drive/My Drive/Recipe Recommendation/vectorizer_ame.pkl')
joblib.dump(recipe_ame, '/content/drive/My Drive/Recipe Recommendation/recipe_ame.pkl')
joblib.dump(recipe_AME, '/content/drive/My Drive/Recipe Recommendation/recipe_AME.pkl')


# 10
tfidf_chi = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_chi = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_chi.fit(recipe_chinese)
vectorizer_chi.fit(recipe_chinese)
recipe_chi = tfidf_chi.fit_transform(recipe_chinese)
recipe_CHI = vectorizer_chi.fit_transform(recipe_chinese)
joblib.dump(tfidf_chi, '/content/drive/My Drive/Recipe Recommendation/tfidf_chi.pkl')
joblib.dump(vectorizer_chi, '/content/drive/My Drive/Recipe Recommendation/vectorizer_chi.pkl')
joblib.dump(recipe_chi, '/content/drive/My Drive/Recipe Recommendation/recipe_chi.pkl')
joblib.dump(recipe_CHI, '/content/drive/My Drive/Recipe Recommendation/recipe_CHI.pkl')

# 11
tfidf_all = TfidfVectorizer(lowercase=True, stop_words=english_stopwords, min_df=0.05, max_df=0.9, ngram_range = (1,3))
vectorizer_all = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b', stop_words=english_stopwords)
tfidf_all.fit(recipe)
vectorizer_all.fit(recipe)
recipe_all = tfidf_all.fit_transform(recipe)
recipe_ALL = vectorizer_all.fit_transform(recipe)
joblib.dump(tfidf_all, '/content/drive/My Drive/Recipe Recommendation/tfidf_all.pkl')
joblib.dump(vectorizer_all, '/content/drive/My Drive/Recipe Recommendation/vectorizer_all.pkl')
joblib.dump(recipe_all, '/content/drive/My Drive/Recipe Recommendation/recipe_all.pkl')
joblib.dump(recipe_ALL, '/content/drive/My Drive/Recipe Recommendation/recipe_ALL.pkl')

['/content/drive/My Drive/Recipe Recommendation/recipe_ALL.pkl']

In [None]:
# load the models from my drive
tfidf_indian = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_ind.pkl')
vectorizer_indian = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_ind.pkl')
recipe_ind = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_ind.pkl')
recipe_IND = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_IND.pkl')


tfidf_mexican = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_mex.pkl')
vectorizer_mexican = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_mex.pkl')
recipe_mex = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_mex.pkl')
recipe_MEX = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_MEX.pkl')


tfidf_french = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_fre.pkl')
vectorizer_french = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_fre.pkl')
recipe_fre = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_fre.pkl')
recipe_FRE = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_FRE.pkl')


tfidf_italian = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_ita.pkl')
vectorizer_italian = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_ita.pkl')
recipe_ita = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_ita.pkl')
recipe_ITA = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_ITA.pkl')


tfidf_japanese = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_jap.pkl')
vectorizer_japanese = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_jap.pkl')
recipe_jap = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_jap.pkl')
recipe_JAP = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_JAP.pkl')


tfidf_korean = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_kor.pkl')
vectorizer_korean = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_kor.pkl')
recipe_kor = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_kor.pkl')
recipe_KOR = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_KOR.pkl')


tfidf_spanish = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_spa.pkl')
vectorizer_spanish = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_spa.pkl')
recipe_spa = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_spa.pkl')
recipe_SPA = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_SPA.pkl')


tfidf_thai = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_tha.pkl')
vectorizer_thai = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_tha.pkl')
recipe_tha = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_tha.pkl')
recipe_THA = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_THA.pkl')


tfidf_american = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_ame.pkl')
vectorizer_american = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_ame.pkl')
recipe_ame = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_ame.pkl')
recipe_AME = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_AME.pkl')


tfidf_chinese = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_chi.pkl')
vectorizer_chinese = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_chi.pkl')
recipe_chi = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_chi.pkl')
recipe_CHI = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_CHI.pkl')


tfidf = joblib.load('/content/drive/My Drive/Recipe Recommendation/tfidf_all.pkl')
vectorizer = joblib.load('/content/drive/My Drive/Recipe Recommendation/vectorizer_all.pkl')
recipe_all = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_all.pkl')
recipe_ALL = joblib.load('/content/drive/My Drive/Recipe Recommendation/recipe_ALL.pkl')


In [None]:
# 讀取之前存好的子recipe
# load the recipes from my drive
recipe_indian = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_indian.csv')
recipe_mexican = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_mexican.csv')
recipe_french = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_french.csv')
recipe_italian = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_italian.csv')
recipe_japanese = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_japanese.csv')
recipe_korean = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_korean.csv')
recipe_spanish = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_spanish.csv')
recipe_thai = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_thai.csv')
recipe_american = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_american.csv')
recipe_chinese = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_chinese.csv')

In [None]:
recipe_italian['0']

0        Homemade Mac and Cheese Casserole ['8 ounces w...
1        World's Best Lasagna ['1 pound sweet Italian s...
2        Zesty Slow Cooker Chicken Barbecue ['6 frozen ...
3        Boilermaker Tailgate Chili ['2 pounds ground b...
4        Baked Ziti I ['1 pound dry ziti pasta ', '1 on...
                               ...                        
10527    Life Burger ['3 tablespoons honey or clear raw...
10528    Lemon Asparagus Risotto ['1 pound/500 g aspara...
10529    Fresh Mint Chip Gelato ['1 cup sugar', '2 tabl...
10530    Tuscan Tomato and Bread Soup - Pappa al Pomodo...
10531    Lamb and Eggplant Pastitsio ['1 large onion, c...
Name: 0, Length: 10532, dtype: object

In [None]:
# 創建一個dictionary方便核對question
# create a dictionary for comparison
cuisines = {
    "indian_cuisine": {
        "list": indian_cuisine,
        "tfidf": tfidf_indian,
        "vect": vectorizer_indian,
        "transform": recipe_ind,
        "transform1": recipe_IND,
        "dataset": recipe_indian
    },
    "mexican_cuisine": {
        "list": mexican_cuisine,
        "tfidf": tfidf_mexican,
        "vect": vectorizer_mexican,
        "transform": recipe_mex,
        "transform1": recipe_MEX,
        "dataset": recipe_mexican
    },
    "french_cuisine": {
        "list": french_cuisine,
        "tfidf": tfidf_french,
        "vect": vectorizer_french,
        "transform": recipe_fre,
        "transform1": recipe_FRE,
        "dataset": recipe_french
    },
    "italian_cuisine": {
        "list": italian_cuisine,
        "tfidf": tfidf_italian,
        "vect": vectorizer_italian,
        "transform": recipe_ita,
        "transform1": recipe_ITA,
        "dataset": recipe_italian
    },
    "japanese_cuisine": {
        "list": japanese_cuisine,
        "tfidf": tfidf_japanese,
        "vect": vectorizer_japanese,
        "transform": recipe_jap,
        "transform1": recipe_JAP,
        "dataset": recipe_japanese
    },
    "korean_cuisine": {
        "list": korean_cuisine,
        "tfidf": tfidf_korean,
        "vect": vectorizer_korean,
        "transform": recipe_kor,
        "transform1": recipe_KOR,
        "dataset": recipe_korean
    },
    "spanish_cuisine": {
        "list": spanish_cuisine,
        "tfidf": tfidf_spanish,
        "vect": vectorizer_spanish,
        "transform": recipe_spa,
        "transform1": recipe_SPA,
        "dataset": recipe_spanish
    },
    "thai_cuisine": {
        "list": thai_cuisine,
        "tfidf": tfidf_thai,
        "vect": vectorizer_thai,
        "transform": recipe_tha,
        "transform1": recipe_THA,
        "dataset": recipe_thai
    },
    "american_cuisine": {
        "list": american_cuisine,
        "tfidf": tfidf_american,
        "vect": vectorizer_american,
        "transform": recipe_ame,
        "transform1": recipe_AME,
        "dataset": recipe_american
    },
    "chinese_cuisine": {
        "list": chinese_cuisine,
        "tfidf": tfidf_chinese,
        "vect": vectorizer_chinese,
        "transform": recipe_chi,
        "transform1": recipe_CHI,
        "dataset": recipe_chinese
    },
}

In [None]:
question = input("What is in your mind?\n")
question_lower = question.lower()

# Iterate through cuisines and their corresponding datasets
for typ, cuisine in cuisines.items():
    for key_word in cuisine["list"]:
        if key_word.lower() in question_lower:
            X = cuisine["tfidf"].transform([question])
            Y = cuisine["vect"].transform([question])
            cosine_sim = np.ravel(cosine_similarity(X, cuisine["transform"]))
            recipe_intersections = cuisine["transform1"].multiply(Y).sum(axis=1)
            recipe_unions = cuisine["transform1"].sum(axis=1) + Y.sum(axis=1) - recipe_intersections
            jaccard_similarity = recipe_intersections / recipe_unions
            jaccard_sim = np.ravel(jaccard_similarity)
            sim = cosine_sim + jaccard_sim
            top_indices = np.argsort(sim)[-3:]
            print(question)
            print('---------------------------------------------------------------------------')
            for i, index in enumerate(top_indices, 1):
                recipe_info = re.split('\[|\]', cuisine["dataset"][index])
                print(f"Recommendation {i} , The similarity is {sim[index]}:")
                print('*Title:')
                print(recipe_info[0],'\n')
                print('*Ingredients:')
                print(recipe_info[1],'\n')
                print('*Instructions:')
                print(recipe_info[2],'\n')
                print('\n')
            break  # 一旦配到，立即跳出cusine 的 for loop
    else:  # 如果未配到，執行下面的code
        continue  # 往下一個cuisine測試匹配
    break  # 一旦配到，立即跳出整個for loop
else:  # 如果所有的子recipe都没有匹配到，從主recipe找全部
    X = tfidf.transform([question])
    Y = vectorizer.transform([question])
    cosine_sim = np.ravel(cosine_similarity(X, recipe_all))
    recipe_intersections = recipe_ALL.multiply(Y).sum(axis=1)
    recipe_unions = recipe_ALL.sum(axis=1) + Y.sum(axis=1) - recipe_intersections
    jaccard_similarity = recipe_intersections / recipe_unions
    jaccard_sim = np.ravel(jaccard_similarity)
    sim = cosine_sim + jaccard_sim
    top_indices = np.argsort(sim)[-3:]
    print(question)
    print('---------------------------------------------------------------------------')
    for i, index in enumerate(top_indices, 1):
        recipe_info = re.split('\[|\]', recipe[index])
        print(f"Recommendation {i} , The similarity is {sim[index]}:")
        print('*Title:')
        print(recipe_info[0],'\n')
        print('*Ingredients:')
        print(recipe_info[1],'\n')
        print('*Instructions:')
        print(recipe_info[2],'\n')
        print('\n')


What is in your mind?
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)? I want to make it chinese
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)? I want to make it chinese
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.4135151488163223:
*Title:
Chinese Style Steamed Fish   

*Ingredients:
'2 6-ounce red snapper fillets', '2 tablespoons dry white wine', '1 1/2 teaspoons minced peeled fresh ginger', '2 small garlic cloves, minced', '4 teaspoons soy sauce', '1 1/2 teaspoons oriental sesame oil', '2 tablespoons chopped fresh cilantro' 

*Instructions:
 Place small cake rack in large (12-inch-diameter) skillet; place 9-inch-diameter glass pie dish on rack. Put fish in dish; sprinkle lightly with salt and pepper. Sprinkle wine, ginger and garlic in dish around fish. Top fish with soy sauce, se

# 統整所需的程式碼 / Summarize the codes needed for building my model into an user innterface

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from google.colab import drive




# 讀主recipe & 資料前處理
# read the main recipe and preprocess data
drive.mount('/content/drive')
data = pd.read_csv(r'/content/drive/MyDrive/all_recipe.csv')
recipe = data['title'].astype(str) + ' ' + data['ingredients'].astype(str) + ' ' + data['instructions'].astype(str)
recipe = [re.sub(r'\b\w*ADVERTISEMENT\w*\b', '', sentence) for sentence in recipe]
recipe = [''.join(words) for words in recipe]
recipe = pd.Series(recipe)
# 讀取之前存好的子recipe
# read the stored recipes
recipe_indian = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_indian.csv')
recipe_mexican = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_mexican.csv')
recipe_french = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_french.csv')
recipe_italian = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_italian.csv')
recipe_japanese = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_japanese.csv')
recipe_korean = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_korean.csv')
recipe_spanish = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_spanish.csv')
recipe_thai = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_thai.csv')
recipe_american = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_american.csv')
recipe_chinese = pd.read_csv('/content/drive/MyDrive/Recipe Recommendation/recipe_chinese.csv')




# 讀取事先訓練好的model
# load the stored models
tfidf_indian = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_ind.pkl')
vectorizer_indian = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_ind.pkl')
recipe_ind = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_ind.pkl')
recipe_IND = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_IND.pkl')

tfidf_mexican = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_mex.pkl')
vectorizer_mexican = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_mex.pkl')
recipe_mex = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_mex.pkl')
recipe_MEX = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_MEX.pkl')

tfidf_french = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_fre.pkl')
vectorizer_french = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_fre.pkl')
recipe_fre = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_fre.pkl')
recipe_FRE = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_FRE.pkl')

tfidf_italian = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_ita.pkl')
vectorizer_italian = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_ita.pkl')
recipe_ita = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_ita.pkl')
recipe_ITA = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_ITA.pkl')

tfidf_japanese = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_jap.pkl')
vectorizer_japanese = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_jap.pkl')
recipe_jap = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_jap.pkl')
recipe_JAP = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_JAP.pkl')

tfidf_korean = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_kor.pkl')
vectorizer_korean = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_kor.pkl')
recipe_kor = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_kor.pkl')
recipe_KOR = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_KOR.pkl')

tfidf_spanish = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_spa.pkl')
vectorizer_spanish = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_spa.pkl')
recipe_spa = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_spa.pkl')
recipe_SPA = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_SPA.pkl')

tfidf_thai = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_tha.pkl')
vectorizer_thai = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_tha.pkl')
recipe_tha = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_tha.pkl')
recipe_THA = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_THA.pkl')

tfidf_american = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_ame.pkl')
vectorizer_american = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_ame.pkl')
recipe_ame = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_ame.pkl')
recipe_AME = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_AME.pkl')

tfidf_chinese = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_chi.pkl')
vectorizer_chinese = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_chi.pkl')
recipe_chi = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_chi.pkl')
recipe_CHI = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_CHI.pkl')

tfidf = joblib.load('/content/drive/MyDrive/Recipe Recommendation/tfidf_all.pkl')
vectorizer = joblib.load('/content/drive/MyDrive/Recipe Recommendation/vectorizer_all.pkl')
recipe_all = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_all.pkl')
recipe_ALL = joblib.load('/content/drive/MyDrive/Recipe Recommendation/recipe_ALL.pkl')

indian_cuisine = ['Indian'] #'Curry', 'Tandoori Chicken', 'Biryani', 'Naan', 'Samosa', 'Masala Dosa', 'Chole Bhature', 'Rogan Josh',
mexican_cuisine = ['Mexican'] #'Tacos', 'Burritos', 'Enchiladas', 'Guacamole', 'Quesadillas', 'Fajitas', 'Salsa', 'Mole',
french_cuisine = ['French'] #'Croissant', 'Baguette', 'Coq au Vin', 'Beef Bourguignon', 'Ratatouille', 'Escargot', 'Bouillabaisse', 'Quiche Lorraine',
italian_cuisine = ['Italian'] #'Pizza', 'Pasta', 'Risotto', 'Lasagna', 'Tiramisu', 'Gelato', 'Bruschetta', 'Caprese Salad',
japanese_cuisine = ['Japanese'] #'Sushi', 'Sashimi', 'Ramen', 'Tempura', 'Yakitori', 'Miso Soup', 'Tonkatsu', 'Matcha',
korean_cuisine = ['Korean'] #'Kimchi', 'Bibimbap', 'Bulgogi', 'Kimbap', 'Tteokbokki', 'Japchae', 'Samgyeopsal', 'Galbi',
spanish_cuisine = ['Spanish'] #'Paella', 'Tapas', 'Gazpacho', 'Tortilla Española', 'Churros', 'Jamón Ibérico', 'Sangria', 'Patatas Bravas',
thai_cuisine = ['Thai'] #'Pad Thai', 'Tom Yum Goong', 'Green Curry', 'Som Tum (Papaya Salad)', 'Massaman Curry', 'Khao Pad (Fried Rice)', 'Pad See Ew', 'Mango Sticky Rice',
american_cuisine = ['American'] #'Hamburger', 'Hotdog', 'BBQ Ribs', 'Fried Chicken', 'Apple Pie', 'Macaroni and Cheese', 'Clam Chowder', 'Pancakes',
chinese_cuisine = ['Chinese'] #'Kung Pao Chicken', 'Peking Duck', 'Dim Sum', 'Mapo Tofu', 'Spring Rolls', 'Fried Rice', 'Chow Mein', 'Hot Pot', 'Dumplings',


# 創建一個dictionary方便核對question
# create a dictionary for comparison
cuisines = {
    "indian_cuisine": {
        "list": indian_cuisine,
        "tfidf": tfidf_indian,
        "vect": vectorizer_indian,
        "transform": recipe_ind,
        "transform1": recipe_IND,
        "dataset": recipe_indian
    },
    "mexican_cuisine": {
        "list": mexican_cuisine,
        "tfidf": tfidf_mexican,
        "vect": vectorizer_mexican,
        "transform": recipe_mex,
        "transform1": recipe_MEX,
        "dataset": recipe_mexican
    },
    "french_cuisine": {
        "list": french_cuisine,
        "tfidf": tfidf_french,
        "vect": vectorizer_french,
        "transform": recipe_fre,
        "transform1": recipe_FRE,
        "dataset": recipe_french
    },
    "italian_cuisine": {
        "list": italian_cuisine,
        "tfidf": tfidf_italian,
        "vect": vectorizer_italian,
        "transform": recipe_ita,
        "transform1": recipe_ITA,
        "dataset": recipe_italian
    },
    "japanese_cuisine": {
        "list": japanese_cuisine,
        "tfidf": tfidf_japanese,
        "vect": vectorizer_japanese,
        "transform": recipe_jap,
        "transform1": recipe_JAP,
        "dataset": recipe_japanese
    },
    "korean_cuisine": {
        "list": korean_cuisine,
        "tfidf": tfidf_korean,
        "vect": vectorizer_korean,
        "transform": recipe_kor,
        "transform1": recipe_KOR,
        "dataset": recipe_korean
    },
    "spanish_cuisine": {
        "list": spanish_cuisine,
        "tfidf": tfidf_spanish,
        "vect": vectorizer_spanish,
        "transform": recipe_spa,
        "transform1": recipe_SPA,
        "dataset": recipe_spanish
    },
    "thai_cuisine": {
        "list": thai_cuisine,
        "tfidf": tfidf_thai,
        "vect": vectorizer_thai,
        "transform": recipe_tha,
        "transform1": recipe_THA,
        "dataset": recipe_thai
    },
    "american_cuisine": {
        "list": american_cuisine,
        "tfidf": tfidf_american,
        "vect": vectorizer_american,
        "transform": recipe_ame,
        "transform1": recipe_AME,
        "dataset": recipe_american
    },
    "chinese_cuisine": {
        "list": chinese_cuisine,
        "tfidf": tfidf_chinese,
        "vect": vectorizer_chinese,
        "transform": recipe_chi,
        "transform1": recipe_CHI,
        "dataset": recipe_chinese
    },
}




# 分析使用者的輸入並比對 (主要的模型輸出code)
# make comparison and print the results
question = input("What is in your mind?\n")
question_lower = question.lower()

for typ, cuisine in cuisines.items():
    for key_word in cuisine["list"]:
        if key_word.lower() in question_lower:
            X = cuisine["tfidf"].transform([question])
            Y = cuisine["vect"].transform([question]).reshape(1, -1)
            cosine_sim = np.ravel(cosine_similarity(X, cuisine["transform"]))
            recipe_intersections = cuisine["transform1"].multiply(Y).sum(axis=1)
            recipe_unions = cuisine["transform1"].sum(axis=1) + Y.sum(axis=1) - recipe_intersections
            jaccard_similarity = recipe_intersections / recipe_unions
            jaccard_sim = np.ravel(jaccard_similarity)
            sim = cosine_sim + jaccard_sim
            top_indices = np.argsort(sim)[-3:]
            print(question)
            print('---------------------------------------------------------------------------')
            for i, index in enumerate(top_indices, 1):
                recipe_info = re.split('\[|\]', cuisine["dataset"]['0'][index])
                print(f"Recommendation {i} , The similarity is {sim[index]}:")
                print('*Title:')
                print(recipe_info[0],'\n')
                print('*Ingredients:')
                print(recipe_info[1],'\n')
                print('*Instructions:')
                print(recipe_info[2],'\n')
                print('\n')
            break
    else:
        continue
    break
else:
    X = tfidf.transform([question])
    Y = vectorizer.transform([question])
    cosine_sim = np.ravel(cosine_similarity(X, recipe_all))
    recipe_intersections = recipe_ALL.multiply(Y).sum(axis=1)
    recipe_unions = recipe_ALL.sum(axis=1) + Y.sum(axis=1) - recipe_intersections
    jaccard_similarity = recipe_intersections / recipe_unions
    jaccard_sim = np.ravel(jaccard_similarity)
    sim = cosine_sim + jaccard_sim
    top_indices = np.argsort(sim)[-3:]
    print(question)
    print('---------------------------------------------------------------------------')
    for i, index in enumerate(top_indices, 1):
        recipe_info = re.split('\[|\]', recipe[index])
        print(f"Recommendation {i} , The similarity is {sim[index]}:")
        print('*Title:')
        print(recipe_info[0],'\n')
        print('*Ingredients:')
        print(recipe_info[1],'\n')
        print('*Instructions:')
        print(recipe_info[2],'\n')
        print('\n')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
What is in your mind?
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)? I want to make it chinese
I have tofu and soy sauce and ginger. Can you recommend a savory dish that can be cooked in medium (30-60 minutes)? I want to make it chinese
---------------------------------------------------------------------------
Recommendation 1 , The similarity is 0.4135151488163223:
*Title:
Chinese Style Steamed Fish   

*Ingredients:
'2 6-ounce red snapper fillets', '2 tablespoons dry white wine', '1 1/2 teaspoons minced peeled fresh ginger', '2 small garlic cloves, minced', '4 teaspoons soy sauce', '1 1/2 teaspoons oriental sesame oil', '2 tablespoons chopped fresh cilantro' 

*Instructions:
 Place small cake rack in large (12-inch-diameter) skillet; place 9-inch-diameter glass pie dish on rack. Put fish 

In [None]:
recipe_italian

0      Unnamed: 0                                                    0
       0             Homemade Mac and Cheese Casserole ['8 ounces w...
1      Unnamed: 0                                                    1
       0             World's Best Lasagna ['1 pound sweet Italian s...
2      Unnamed: 0                                                    2
                                           ...                        
10529  0             Fresh Mint Chip Gelato ['1 cup sugar', '2 tabl...
10530  Unnamed: 0                                                10530
       0             Tuscan Tomato and Bread Soup - Pappa al Pomodo...
10531  Unnamed: 0                                                10531
       0             Lamb and Eggplant Pastitsio ['1 large onion, c...
Length: 21064, dtype: object

In [None]:
print("Transform1 shape:", cuisine["transform1"].shape)
print("Y shape:", Y.transpose().shape)

Transform1 shape: (10532, 12366)
Y shape: (12229, 1)
