In [1]:
import numpy as np 
import pandas as pd 
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
df = pd.read_csv('../merged_file.csv')

In [5]:
df.head()

Unnamed: 0,nama,porsi,kalori,lemak,karbs,protein,sodium,kolesterol,kategori,UniqueID,overall
0,"Krim Sup Ayam (Kalengan, Terkondensasi)",1 mangkok,223.0,1448,1797,597,136,5133,ayam,1,"Krim Sup Ayam (Kalengan, Terkondensasi)ayam"
1,Belalang Goreng,100 gram (g),151.0,645,834,143,94,3536,ayam,2,Belalang Gorengayam
2,Getuk Goreng,100 gram (g),359.0,668,7406,131,3496,3846,ayam,3,Getuk Gorengayam
3,Bakso Goreng,1 buah,57.0,377,239,322,24,1663,ayam,4,Bakso Gorengayam
4,Paha Ayam Panggang (Kulit Dimakan),"1 kecil (hasil setelah masak, tulang dihilangkan)",135.0,845,0,1367,0,3354,ayam,5,Paha Ayam Panggang (Kulit Dimakan)ayam


In [6]:
df['UniqueID'].dtype

dtype('int32')

In [9]:
# Declaring the 'id' column as integer
df['UniqueID'] = df['UniqueID'].astype('int')

In [10]:
# fd = df[df['UniqueID']]
df.shape

(8800, 11)

In [11]:
df['kalori'] = df['kalori'].astype(str)

In [12]:
df['overall'] = df['nama'] + df['kategori'] #+ fd['kalori']# Combining columns overview and tagline

In [3]:
# Converts a collection of raw documents to a matrix of TF-IDF features
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df = 0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['overall'])

In [5]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
df = df.reset_index(drop=True)
titles = df['nama']  # Defining a new variable title
indices = pd.Series(df.index, index = df['nama'])  # Defining a new dataframe indices

In [8]:
df.head()

In [9]:
# Defining a function that returns 30 most similar food bases on the cosine 
# similarity score
def get_recommendations(title):
    idx = indices[title]  # Defining a variable with indices
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1: 31]  # Taking the 30 most similar foods
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]  # returns the title based on food indices

In [10]:
# Convert a collection of text documents to a matrix of token counts
count = CountVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
count_matrix = count.fit_transform(df['overall'])

In [11]:
# Compute cosine similarity between samples in X and Y.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [13]:
get_recommendations('Ayam Goreng')

95                                     Kulit Ayam Goreng
96                                   Piattos Ayam Goreng
103                                    Bumbu Ayam Goreng
107                                     Usus Ayam Goreng
85                               Bumbu Racik Ayam Goreng
88                                 Nori Rasa Ayam Goreng
78                          Ripik Hijau Rasa Ayam Goreng
46                      Daging Paha Ayam (Ayam Pedaging)
47                      Daging Dada Ayam (Ayam Pedaging)
32                                Daging Ayam (Panggang)
30           Daging dan Kulit Sayap Ayam (Ayam Pedaging)
31            Daging Paha dan Kulit Ayam (Ayam Pedaging)
53     Daging Paha Ayam (Ayam Pedaging, Dipanggang, D...
54     Daging Dada Ayam (Ayam Pedaging, Dipanggang, D...
55            Daging Dada dan Kulit Ayam (Ayam Pedaging)
5                             Kuah/Kaldu Ayam (Kalengan)
21                            Sup Ayam Kental (Kalengan)
220                           A

In [28]:
with open('preprocessed_data.pkl', 'wb') as file:
    pickle.dump(fd, file)

In [29]:
# Load the preprocessed data and the get_recommendations function
with open('preprocessed_data.pkl', 'rb') as file:
    preprocessed_data = pickle.load(file)