# Example of different information needs

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import os
import nltk

In [12]:
# Directory holding one recipe per ``.txt`` file. Set the RECIPES_DIR environment
# variable to point at a local copy of the dataset; the original author's path is
# kept as the fallback so existing setups keep working.
folder = os.environ.get("RECIPES_DIR", "/Users/piera/Desktop/Repository/IR/dataset/recipee")
# os.listdir returns entries in arbitrary order; sort them so that recipes[0]
# refers to the same file on every run and on every machine.
files = sorted(f for f in os.listdir(folder) if f.endswith('.txt'))
recipes = []
for file in files:
    # Explicit encoding so decoding does not depend on the platform default
    # (assumes the recipe files are UTF-8 -- TODO confirm).
    with open(os.path.join(folder, file), 'r', encoding='utf-8') as data:
        recipes.append(data.read())

## Tokenizers

### NLTK Tokenizer

In [13]:
def nltk_tokenize(text):
    """Split ``text`` into lowercase tokens using NLTK's word tokenizer.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize``, each lowercased.
    """
    # The module is imported as ``nltk``; the previous spelling ``ntlk``
    # raised NameError on every call.
    return [token.lower() for token in nltk.word_tokenize(text)]

### Spacy Tokenizer and Parsers

In [19]:
import spacy
# Name of the spaCy pipeline to load; the resulting `nlp` object is used below
# for sentence splitting, part-of-speech tags and dependency labels.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

#### Example
Recipes do not have a real sentence structure. Thus, we use a special tokenizer for sentences based on newlines. The last chunk is typically the set of instructions. 

In [24]:
first_recipe = recipes[0]

# Sentence boundaries as detected by the spaCy pipeline.
spacy_sentences = list(nlp(first_recipe).sents)

# Alternative segmentation: split the recipe on blank lines and trim the
# newline characters left at either end of each chunk.
newline_sentences = [chunk.strip('\n') for chunk in first_recipe.split('\n\n')]

['Salt Free, Low Cholesterol Sugar Cookies Recipe', 'Ingredients', ' - 1/2 cup of sugars, granulated', ' - 3/4 cup of oil, corn, peanut, and olive', ' - 1/4 cup of egg substitute, powder', ' - 1 teaspoon of orange juice, raw', ' - 1/4 teaspoon of orange juice, raw', ' - 1 tablespoon of leavening agents, baking powder, double-acting, sodium aluminum sulfate', ' - 3 1/2 cup of wheat flour, white, all-purpose, unenriched', 'Instructions: ', 'Cream sugar and butter together till smooth. Add in egg beaters, orange rind, orange juice, and mix well. Mix together low sodium baking powder and flour. Add in to creamed mix and mix well. Roll dough into 1 inch balls and place on ungreased cookie sheet. Rub small amount of salt free butter on bottom of glass. Dip glass in granulated sugar. Flatten cookie dough ball slightly using flat end of glass. Bake at 300 degrees for 10-12 min.']


In [22]:
# The last newline-delimited chunk of the recipe is taken as the instructions text.
instructions_text = newline_sentences[-1]
print(instructions_text)

# Run the spaCy pipeline on the instructions only and collect its sentence spans.
instructions_doc = nlp(instructions_text)
instruction_sentences = [span for span in instructions_doc.sents]

Cream sugar and butter together till smooth. Add in egg beaters, orange rind, orange juice, and mix well. Mix together low sodium baking powder and flour. Add in to creamed mix and mix well. Roll dough into 1 inch balls and place on ungreased cookie sheet. Rub small amount of salt free butter on bottom of glass. Dip glass in granulated sugar. Flatten cookie dough ball slightly using flat end of glass. Bake at 300 degrees for 10-12 min.


In [27]:
# Take the first sentence of the instructions as the running example.
sentence = instruction_sentences[0]

# Show the type of the object yielded when iterating over `.sents`.
type(sentence)

spacy.tokens.span.Span

In [28]:
# Display the example sentence.
sentence

Cream sugar and butter together till smooth.

In [29]:
# Maps each output column to the spaCy token attribute it is read from.
# Insertion order defines the column order of the resulting table.
token_attributes = {
    'position': 'idx',
    'text': 'text',
    'pos': 'pos_',
    'lemma': 'lemma_',
    'alpha': 'is_alpha',
    'stop': 'is_stop',
    'dep': 'dep_',
    'morph': 'morph',
}

# One record (dict) per token of the example sentence.
tokens = [
    {column: getattr(token, attribute) for column, attribute in token_attributes.items()}
    for token in sentence
]

# Build the table once from the full list of records.
S = pd.DataFrame(tokens)

In [30]:
# Print the raw sentence, then display the per-token attribute table built from it.
print(sentence)
S

Cream sugar and butter together till smooth.


Unnamed: 0,position,text,pos,lemma,alpha,stop,dep,morph
0,0,Cream,NOUN,cream,True,False,compound,(Number_sing)
1,6,sugar,NOUN,sugar,True,False,ROOT,(Number_sing)
2,12,and,CCONJ,and,True,True,cc,(ConjType_comp)
3,16,butter,NOUN,butter,True,False,conj,(Number_sing)
4,23,together,ADV,together,True,True,advmod,(Degree_pos)
5,32,till,SCONJ,till,True,False,prep,()
6,37,smooth,VERB,smooth,True,False,pobj,(VerbForm_inf)
7,43,.,PUNCT,.,False,False,punct,(PunctType_peri)


In [31]:
from spacy.displacy import render

In [32]:
# Visualise the example sentence with spaCy's displaCy renderer.
render(sentence)