In [0]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
def read_json_df(path='/yummly.json'):
  """Load a JSON file into a pandas DataFrame.

  Args:
    path: Location of the JSON file, handed straight to ``pd.read_json``.

  Returns:
    The DataFrame produced by ``pd.read_json``.
  """
  return pd.read_json(path)

In [0]:
def generate_tf_idf(df,input_ingredient):
  """Build a TF-IDF matrix over every recipe plus the query ingredients.

  Each recipe's ingredient list is joined with spaces into one document.
  The query ingredients are joined the same way and appended as the final
  document, so the last row of the result is the query vector.

  Args:
    df: Table whose ``ingredients`` column holds one list of strings per recipe.
    input_ingredient: List of ingredient strings forming the query.

  Returns:
    The matrix returned by ``TfidfVectorizer(min_df=1).fit_transform``.
  """
  corpus = [' '.join(recipe) for recipe in df['ingredients']]
  corpus.append(' '.join(input_ingredient))
  vectorizer = TfidfVectorizer(min_df=1)
  return vectorizer.fit_transform(corpus)

In [0]:
def prediction(df,doc_matrix,random_state=42):
  """Predict the cuisine of the query row with a random forest.

  The last row of ``doc_matrix`` is treated as the query; every earlier row
  is a training sample labelled by the matching entry of ``df['cuisine']``.
  A new classifier is fitted on every call.

  Args:
    df: Table with a ``cuisine`` column, one label per training row.
    doc_matrix: Feature matrix whose final row is the query vector.
    random_state: Seed forwarded to ``RandomForestClassifier`` so repeated
      runs give the same prediction; pass ``None`` for an unseeded forest.

  Returns:
    The result of ``clf.predict`` for the single query row.
  """
  labels = df['cuisine']
  training_rows = doc_matrix[:-1]
  # Slice (rather than index) so the query stays 2-D for dense inputs too.
  query_row = doc_matrix[-1:]
  clf = RandomForestClassifier(random_state=random_state)
  clf.fit(training_rows,labels)
  return clf.predict(query_row)

In [0]:
def top_n_similar(doc_matrix,df,n_top=8):
  """Return the ``n_top`` recipes most similar to the query row.

  The last row of ``doc_matrix`` is the query; all earlier rows are compared
  to it with cosine similarity and ranked from highest to lowest score.

  Args:
    doc_matrix: Feature matrix whose final row is the query vector.
    df: Table with ``id`` and ``cuisine`` columns, one entry per earlier row.
    n_top: Maximum number of matches to return.

  Returns:
    A list of ``(id, score, cuisine)`` tuples, best match first, with the
    score rounded to 4 decimal places.
  """
  corpus_rows, query_row = doc_matrix[:-1], doc_matrix[-1]
  similarity = cosine_similarity(query_row, corpus_rows)[0]
  # argsort is ascending, so reverse it before keeping the first n_top.
  best_first = similarity.argsort()[::-1][:n_top]
  recipe_ids = list(df['id'])
  recipe_cuisines = list(df['cuisine'])
  return [
    (recipe_ids[pos], round(similarity[pos], 4), recipe_cuisines[pos])
    for pos in best_first
  ]

**Reading the JSON data into a DataFrame.**

In [0]:
# Use the function's default path; passing None would override it and
# pd.read_json cannot read from None.
df = read_json_df()

**Some statistics of df**

In [9]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [10]:
# List the column labels of the DataFrame.
df.columns

Index(['id', 'cuisine', 'ingredients'], dtype='object')

In [11]:
# Per-column flag: True if that column contains any missing value.
df.isna().any()

id             False
cuisine        False
ingredients    False
dtype: bool

**Input Ingredient list.**

In [0]:
# Query ingredients whose cuisine we want to predict.
input_ingredient = [
    "chili powder", "pepper", "butter",
    "bread", "chicken", "lettuce",
]

**Feature matrix generation**

In [0]:
# Build the TF-IDF matrix; the query ingredients become its last row.
doc_matrix = generate_tf_idf(df,input_ingredient)

**Checking doc_matrix type and shape**

In [41]:
# Report the container type and the shape of the feature matrix, one per line.
print(type(doc_matrix), doc_matrix.shape, sep="\n")

<class 'scipy.sparse.csr.csr_matrix'>
(39775, 3010)


**Calling Prediction method to predict the cuisine type**

In [0]:
# Fit on the recipe rows and predict the cuisine for the query (last) row.
ypred = prediction(df,doc_matrix)

**Printing the predicted cuisine type**

In [43]:
# Show the predicted cuisine label.
ypred

array(['mexican'], dtype=object)

**Calling method to get the Closest cuisines**

In [0]:
# Rank recipes by cosine similarity to the query row (default n_top=8).
closest_cuisines=top_n_similar(doc_matrix,df)

**Closest cuisines list with ids and scores**

In [45]:
# Show the (id, score, cuisine) tuples, most similar first.
closest_cuisines

[(33393, 0.4467, 'mexican'),
 (19277, 0.4421, 'russian'),
 (42954, 0.4191, 'mexican'),
 (15753, 0.4139, 'russian'),
 (34743, 0.4015, 'thai'),
 (31220, 0.3963, 'mexican'),
 (13975, 0.3945, 'french'),
 (37087, 0.3857, 'indian')]

In [47]:
# Summarise the query, the predicted cuisine and the nearest recipes.
print("Input Ingredients - ",input_ingredient)
print("Predicted cuisine type for above ingredients -",ypred)
# Derive the count from the list so the message stays correct if n_top changes.
print("Closest",len(closest_cuisines),"cuisines with scores (id,score,cuisine) -",closest_cuisines)

Input Ingredients -  ['chili powder', 'pepper', 'butter', 'bread', 'chicken', 'lettuce']
Predicted cuisine type for above ingredients - ['mexican']
Closet 8 cuisines with scores (id,score,cuisine) - [(33393, 0.4467, 'mexican'), (19277, 0.4421, 'russian'), (42954, 0.4191, 'mexican'), (15753, 0.4139, 'russian'), (34743, 0.4015, 'thai'), (31220, 0.3963, 'mexican'), (13975, 0.3945, 'french'), (37087, 0.3857, 'indian')]


**Testing for some other inputs**

In [0]:
# Second query: a different ingredient set to exercise the same pipeline.
input_ingredient = ["rice", "chicken", "pepper", "tomatoes"]

In [0]:
# Re-run the whole pipeline for the new query ingredients:
# rebuild the TF-IDF matrix (query is the last row), ...
doc_matrix = generate_tf_idf(df,input_ingredient)
# ... predict the cuisine of that last row, ...
ypred = prediction(df,doc_matrix)
# ... and rank the recipes by cosine similarity to it.
closest_cuisines=top_n_similar(doc_matrix,df)

**Prediction of cuisine and closest cuisines along with scores & ids.**

In [31]:
# Summarise the second query, its predicted cuisine and the nearest recipes.
print("Input Ingredients - ",input_ingredient)
print("Predicted cuisine type for above ingredients -",ypred)
# Derive the count from the list so the message stays correct if n_top changes.
print("Closest",len(closest_cuisines),"cuisines with scores -",closest_cuisines)

Input Ingredients -  ['rice', 'chicken', 'pepper', 'tomatoes']
Predicted cuisine type for above ingredients - ['southern_us']
Closet 8 cuisines with scores - [(22720, 0.5912, 'chinese'), (34064, 0.5229, 'brazilian'), (5084, 0.5061, 'mexican'), (23617, 0.4841, 'mexican'), (12414, 0.4765, 'southern_us'), (25256, 0.466, 'indian'), (44076, 0.4588, 'japanese'), (25776, 0.4583, 'mexican')]
