In [121]:
from elasticsearch import Elasticsearch, helpers
import json
import pandas as pd
import numpy as np

In [173]:
df = pd.read_parquet('recipes.parquet')

In [153]:
df.isna().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                       82545
PrepTime                           0
TotalTime                          0
DatePublished                      0
Description                        5
Images                             1
RecipeCategory                   751
Keywords                           0
RecipeIngredientQuantities         0
RecipeIngredientParts              0
AggregatedRating              253223
ReviewCount                   247489
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182911
RecipeYield                   348071
R

In [155]:
df.loc[df['RecipeCategory'].isna()]['RecipeCategory']

0         Frozen Desserts
1          Chicken Breast
2               Beverages
3                Soy/Tofu
4               Vegetable
               ...       
522512            Dessert
522513     Very Low Carbs
522514          Ice Cream
522515           Canadian
522516          Breakfast
Name: RecipeCategory, Length: 522517, dtype: object

In [103]:
df[['AggregatedRating','ReviewCount']].sort_values(by='ReviewCount', ascending=False).fillna(0)

Unnamed: 0,AggregatedRating,ReviewCount
41924,5.0,3063.0
1436,5.0,2273.0
23725,5.0,1692.0
84057,4.5,1657.0
35377,5.0,1586.0
...,...,...
522512,0.0,0.0
522513,0.0,0.0
522514,0.0,0.0
522515,0.0,0.0


In [117]:
df.loc[df['RecipeServings'].isna()]['RecipeServings']

8        NaN
9        NaN
12       NaN
15       NaN
23       NaN
          ..
522505   NaN
522506   NaN
522511   NaN
522515   NaN
522516   NaN
Name: RecipeServings, Length: 182911, dtype: float64

In [148]:
df.iloc[522505].RecipeIngredientParts

array(['green chilies', 'cheddar cheese', 'eggs',
       'half & half light cream', 'milk', 'Wondra Flour', 'salt'],
      dtype=object)

In [149]:
df.loc[df['Images'].isna()]

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions


In [159]:
for row in df.loc[df['Images'].isna()].index:
    print(df.loc[row, 'Images'])

None


In [174]:
cleaned_df = df
cleaned_df['CookTime'] = cleaned_df['CookTime'].fillna('')
cleaned_df['AggregatedRating'] = cleaned_df['AggregatedRating'].fillna(0)
cleaned_df['ReviewCount'] = cleaned_df['ReviewCount'].fillna(0)
cleaned_df['Description'] = cleaned_df['Description'].fillna('')
cleaned_df['RecipeCategory'] = cleaned_df['RecipeCategory'].fillna('')
cleaned_df['RecipeServings'] = cleaned_df['RecipeServings'].fillna(0)
cleaned_df['RecipeYield'] = cleaned_df['RecipeYield'].fillna('0')
for row in cleaned_df.loc[cleaned_df['Images'].isna()].index:
    cleaned_df.at[row, 'Images'] = []
cleaned_df.isna().sum()


RecipeId                      0
Name                          0
AuthorId                      0
AuthorName                    0
CookTime                      0
PrepTime                      0
TotalTime                     0
DatePublished                 0
Description                   0
Images                        0
RecipeCategory                0
Keywords                      0
RecipeIngredientQuantities    0
RecipeIngredientParts         0
AggregatedRating              0
ReviewCount                   0
Calories                      0
FatContent                    0
SaturatedFatContent           0
CholesterolContent            0
SodiumContent                 0
CarbohydrateContent           0
FiberContent                  0
SugarContent                  0
ProteinContent                0
RecipeServings                0
RecipeYield                   0
RecipeInstructions            0
dtype: int64

In [175]:
cleaned_df['RecipeYield'].values

array(['0', '0', '0', ..., '0', '20 rolls', '0'], dtype=object)

In [176]:
cleaned_df.to_parquet('no_na_recipes.parquet.gzip', compression='gzip')

In [177]:
class es_indexer:
    def __init__(self):
        self.df = pd.read_parquet('no_na_recipes.parquet.gzip')
        self.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic","DEq+yKIoJag7b_ZEJl4W"), ca_certs="~/http_ca.crt")

    def run_indexer(self):
        self.es_client.indices.create(index='foodir', ignore=400)
        self.es_client.indices.delete(index='foodir', ignore=[400, 404])
        self.df['_index'] = 'foodir'
        # j = json.loads(self.df[['_index','RecipeId','Name','RecipeIngredientParts','RecipeInstructions']].to_json(orient='records'))
        j = json.loads(self.df.to_json(orient='records'))
        helpers.bulk(self.es_client, j)


In [178]:
es = es_indexer()


In [179]:
es.run_indexer()

  self.es_client.indices.create(index='foodir', ignore=400)
  self.es_client.indices.delete(index='foodir', ignore=[400, 404])


In [127]:
def queryEslPr(query):
    results = es.es_client.search(index='foodir', query={
            "combined_fields": {
                "query": query,
                "fields": ["Name","RecipeIngredientParts","RecipeInstructions"]
            }
        }, suggest_field='Name', suggest_text=query, suggest_mode='missing')
    print("Got %d Hits:" % results['hits']['total']['value'])
    for hit in results['hits']['hits']:
        print("The recipe is '{0}' '{1}' '{2}'.".format(hit['_source']["RecipeId"],hit['_source']["Name"],hit['_score']))
    for op in results['suggest']['Name']:
        print(op['options'])


In [7]:
es.es_client.search(index='foodir', suggest_field='Name', suggest_text='very blur', suggest_mode='missing')['suggest']

{'Name': [{'text': 'very', 'offset': 0, 'length': 4, 'options': []},
  {'text': 'blur',
   'offset': 5,
   'length': 4,
   'options': [{'text': 'blue', 'score': 0.75, 'freq': 2003},
    {'text': 'bluer', 'score': 0.75, 'freq': 1},
    {'text': 'blui', 'score': 0.75, 'freq': 1},
    {'text': 'bour', 'score': 0.75, 'freq': 1},
    {'text': 'blu', 'score': 0.6666666, 'freq': 4}]}]}

In [184]:
res = es.es_client.search(index='foodir', query={
            "script_score":{
                "query":{
                    "combined_fields": {
                        "query": 'Chicken',
                        "fields": ["Name","RecipeIngredientParts","RecipeInstructions"],
                    },
                },
                "script":{
                    "source": "_score"
                }
            }
        }, suggest_field='Name', suggest_text='Chicken', suggest_mode='missing')

In [183]:
cleaned_df.loc[41924]

RecipeId                                                                45809.0
Name                                                            Bourbon Chicken
AuthorId                                                                  58278
AuthorName                                                             LinMarie
CookTime                                                                  PT20M
PrepTime                                                                  PT15M
TotalTime                                                                 PT35M
DatePublished                                         2002-11-12 20:13:00+00:00
Description                   I searched and finally found this recipe on th...
Images                        [https://img.sndimg.com/food/image/upload/w_55...
RecipeCategory                                                   Chicken Breast
Keywords                      [Chicken, Poultry, Meat, Chinese, Asian, High ...
RecipeIngredientQuantities     [2, 1 -2,

In [196]:
cleaned_df.loc[cleaned_df['ReviewCount'] > 10]['ReviewCount'].sort_values(ascending=False).describe()

41924     3063.0
1436      2273.0
23725     1692.0
84057     1657.0
35377     1586.0
           ...  
147453      11.0
147528      11.0
147898      11.0
148030      11.0
517122      11.0
Name: ReviewCount, Length: 24019, dtype: float64

In [199]:
cleaned_df['AggregatedRating'].describe()

count    522517.000000
mean          2.387240
std           2.360339
min           0.000000
25%           0.000000
50%           3.000000
75%           5.000000
max           5.000000
Name: AggregatedRating, dtype: float64

In [187]:
es.es_client.search(index='foodir', query={
            "script_score":{
                "query":{
                    "combined_fields": {
                        "query": 'bourbon chicken',
                        "fields": ["Name","RecipeIngredientParts","RecipeInstructions"],
                    },
                },
                "script":{
                    "source": "_score * (doc['ReviewCount'].value + 1)"
                }
            }
        }, suggest_field='Name', suggest_text='bourbon chicken', suggest_mode='missing')['hits']['hits']

[{'_index': 'foodir',
  '_id': 'DTqiRo4Bgb-E19kqbTfp',
  '_score': 42108.34,
  '_ignored': ['Description.keyword'],
  '_source': {'RecipeId': 45809.0,
   'Name': 'Bourbon Chicken',
   'AuthorId': 58278,
   'AuthorName': 'LinMarie',
   'CookTime': 'PT20M',
   'PrepTime': 'PT15M',
   'TotalTime': 'PT35M',
   'DatePublished': 1037131980000,
   'Description': "I searched and finally found this recipe on the internet. It is a copycat of the Bourbon Chicken sold in Chinese carry-outs in my hometown.  This recipe is so good that my sons gobble it up leaving me just a spoonful. Their excuse was they thought I had eaten.  Editor's Note:  Named Bourbon Chicken because it was supposedly created by a Chinese cook who worked in a restaurant on Bourbon Street.",
   'Images': ['https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/45/80/9/MwuCd6HpQ5mDvn4OLRkA_0S9A9886.jpg',
    'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/i

In [140]:
res['hits']['hits'][0]['_source']

{'RecipeId': 31992.0,
 'Name': 'Crab Stuffed Mushrooms',
 'AuthorId': 43006,
 'AuthorName': 'busy mom',
 'CookTime': 'PT4M',
 'PrepTime': 'PT15M',
 'TotalTime': 'PT19M',
 'DatePublished': 1024951800000,
 'Description': "These are the most delicious mushrooms I've ever had. This recipe was passed down from my mother-in-law. They are holiday favorites",
 'Images': [],
 'RecipeCategory': 'Crab',
 'Keywords': ['Vegetable', 'Microwave', '< 30 Mins'],
 'RecipeIngredientQuantities': ['1', '1', '1', '1⁄4', '1', '8', '3⁄4'],
 'RecipeIngredientParts': ['mushroom',
  'onion',
  'butter',
  'chive & onion cream cheese'],
 'AggregatedRating': 5.0,
 'ReviewCount': 1.0,
 'Calories': 291.9,
 'FatContent': 28.8,
 'SaturatedFatContent': 18.1,
 'CholesterolContent': 82.2,
 'SodiumContent': 225.2,
 'CarbohydrateContent': 5.4,
 'FiberContent': 1.0,
 'SugarContent': 2.1,
 'ProteinContent': 5.5,
 'RecipeServings': None,
 'RecipeYield': None,
 'RecipeInstructions': ['Remove stems carefully from mushroom.',
  

In [130]:
import numpy as np
pd.DataFrame([[hit['_source'][key] for key in hit['_source']] for hit in res['hits']['hits']], columns=list(es.df.columns))

ValueError: 29 columns passed, passed data had 28 columns

In [235]:
queryEslPr("ginger gingerbread")

Got 10000 Hits:
The recipe is '323457.0' 'Caramel Gingerbread Cupcakes' '15.352105'.
The recipe is '537189.0' 'Gingerbread Men in a Bubble Bath' '14.838849'.
The recipe is '271246.0' 'Gingerbread Man Cookies' '14.809714'.
The recipe is '352209.0' 'Pumpkin Ginger-Gingerbread' '14.727834'.
The recipe is '271513.0' 'Spiced Christmas Gingerbread Coffee With Cognac Chantilly Cream' '14.554615'.
The recipe is '139766.0' 'Gingerbread Tres Leches Cake' '14.4893055'.
The recipe is '200439.0' 'Gingerbread Martini' '14.427593'.
The recipe is '106063.0' 'Raisin Gingerbread Loaf With Ginger Icing' '14.410472'.
The recipe is '148588.0' 'Gingerbread' '14.402438'.
The recipe is '143485.0' 'Warm Gingerbread With Lemon Glaze' '14.399534'.
[]
[]


In [220]:
json.loads(df.loc[df['RecipeId'] == 349677].to_json(orient='records'))

[{'RecipeId': 349677.0,
  'Name': 'Hominy Casserole',
  'AuthorId': 107135,
  'AuthorName': 'Dienia B.',
  'CookTime': 'PT30M',
  'PrepTime': 'PT30M',
  'TotalTime': 'PT1H',
  'DatePublished': 1231889340000,
  'Description': 'Make and share this Hominy Casserole recipe from Food.com.',
  'Images': [],
  'RecipeCategory': 'Corn',
  'Keywords': ['Vegetable', '< 60 Mins'],
  'RecipeIngredientQuantities': ['1',
   '1',
   '1⁄2',
   '1',
   '1',
   '1⁄2',
   '1',
   '1',
   '1'],
  'RecipeIngredientParts': ['hominy',
   'pepper',
   'salt',
   'celery seed',
   'butter'],
  'AggregatedRating': None,
  'ReviewCount': None,
  'Calories': 444.7,
  'FatContent': 30.1,
  'SaturatedFatContent': 8.2,
  'CholesterolContent': 27.4,
  'SodiumContent': 1616.0,
  'CarbohydrateContent': 35.8,
  'FiberContent': 6.4,
  'SugarContent': 5.5,
  'ProteinContent': 10.9,
  'RecipeServings': 4.0,
  'RecipeYield': None,
  'RecipeInstructions': ['drain hominy',
   'mix in cream of mushroon soup and half and half',