## Delacre combinations of interest

### Basic imports

In [1]:
import os
import nltk
from nltk import FreqDist
import pprint
import treetaggerwrapper
import collections
import numpy as np
import pandas as pd

### Configure Stanford POS tagger

In [2]:
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag

# Root folder of the local Stanford POS tagger distribution (machine-specific absolute path).
homepath = r'C:\Users\peter\Documents\Tools\stanford-postagger-full-2017-06-09'
# French tagger model and the tagger jar, both resolved relative to homepath.
taggerpath = os.path.join(homepath,r'models\french.tagger')
jarpath = os.path.join(homepath,'stanford-postagger.jar')

# Module-level tagger instance; extract_tokens_noun_adj calls st.tag() on it.
st = POS_Tag(taggerpath,jarpath)

def extract_tokens_noun_adj(text):
    """Tokenize and POS-tag `text` with the Stanford tagger, then pick out nouns and adjectives.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    tuple of (list of str, list of str)
        Lower-cased tokens whose tag starts with 'N', and lower-cased tokens
        tagged exactly 'ADJ'.
    """
    # Use the argument itself: the previous version read a global named `txt`,
    # so the function only worked when the calling cell happened to define it.
    tokenized_text = nltk.tokenize.word_tokenize(text)
    tagged_text = st.tag(tokenized_text)

    # startswith() also copes with an empty tag string, where indexing [0] would raise.
    tokens_stanford_noun = [word.lower() for word, pos in tagged_text if pos.startswith('N')]
    tokens_stanford_adj = [word.lower() for word, pos in tagged_text if pos == 'ADJ']

    return tokens_stanford_noun, tokens_stanford_adj

V			V		indicative verb

VS			V		subjunctive verb

VINF		V		infinitival verb

VPP			V		past participle

VPR			V		present participle

VIMP		V		imperative verb

NC			N		common noun

NPP			N		proper noun

CS			C		subordinating conjunction (complementizer)

CC			C		coordinating conjunction

CLS			CL		nominative clitic

CLO			CL		accusative or dative clitic

CLR			CL		reflexive clitic (whether its interpretation is truly reflexive or not)

P			P		non amalgamated preposition

P+D			P+D		prep+determiner amalgam

P+PRO		P+PRO	prep+relative pronoun amalgam (auquel (to which))

I			I		interjection

PONCT		PONCT		punctuation

ET			ET		foreign words, with a POS that is not guessable from context

ADJ			A		non interrogative adjectives

ADJWH		A		interrogative adjectives

ADV			ADV		non interrogative adverbs

ADVWH		ADV		interrogative adverbs

PRO			PRO		neither relative nor interrogative pronouns

PROREL		PRO		relative pronouns

PROWH		PRO		interrogative pronouns

DET			D		non interrogative nor relative determiners

DETWH		D		interrogative or relative determiners

### Configure TreeTagger
which supposedly both tags and lemmatizes your text:

In [69]:
# TreeTagger instance configured for French; extract_lemma_noun_adj calls tagger.tag_text() on it.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
# NOTE(review): lemma_docs is not referenced anywhere else in the visible notebook.
lemma_docs= []

# Machine-specific path to the Java executable, exported through the JAVAHOME environment variable.
# NOTE(review): presumably this is meant for the Java-based Stanford tagger rather than
# TreeTagger; if so it has to be set before the Stanford tagger is first used. Confirm.
java_path = r"C:\Program Files (x86)\Java\jre1.8.0_144\bin\java.exe"
os.environ['JAVAHOME'] = java_path

def extract_lemma_noun_adj(txt):
    """Lemmatize `txt` with TreeTagger and return its noun and adjective lemmas.

    Parameters
    ----------
    txt : str
        Raw text to tag.

    Returns
    -------
    tuple of (list of str, list of str)
        Lower-cased lemmas of the entries tagged 'NOM', and of those tagged 'ADJ'.
        If tagging fails, "error" is printed and two empty lists are returned, so
        callers that unpack the result keep working (the previous version
        returned None here, which made the unpacking itself raise TypeError).
    """
    try:
        tags = treetaggerwrapper.make_tags(tagger.tag_text(txt))

        # Keep only entries that are treetaggerwrapper.Tag; other entry kinds
        # are skipped, exactly as the original type check did.
        word_tags = [tag for tag in tags if isinstance(tag, treetaggerwrapper.Tag)]

        lemmas_noun = [tag.lemma.lower() for tag in word_tags if tag.pos == 'NOM']
        lemmas_adj = [tag.lemma.lower() for tag in word_tags if tag.pos == 'ADJ']

        return lemmas_noun, lemmas_adj
    except Exception:
        # Best-effort by design: report and carry on with an empty result.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("error")
        return [], []
            

### Define metric

In [4]:
def calc_metric(application,applications):
    """Score each ingredient of one application against the whole set of applications.

    For every distinct ingredient i of `application` the score is

        P(i | application) * (P(i | application) - P(i)) / (1 - P(i))

    where P(i | application) is the relative frequency of i inside
    `application` and P(i) its relative frequency over all ingredients of
    `applications` flattened together.

    Parameters
    ----------
    application : list
        Ingredients (hashable items) of the application being scored.
    applications : list of list
        All applications; their concatenation is the reference corpus.

    Returns
    -------
    pandas.DataFrame
        Columns "ingredient" and "score", sorted by score in descending order.
        Empty when `application` is empty.

    Notes
    -----
    When an ingredient makes up the entire corpus (P(i) == 1) the formula is
    0/0; the score is then defined as 0.0 instead of raising
    ZeroDivisionError. With an empty corpus P(i) is taken as 0.
    """
    # P(ingr): ingredient frequencies over the flattened corpus.
    corpus_counts = collections.Counter(
        item for sublist in applications for item in sublist
    )
    corpus_size = sum(corpus_counts.values())

    # P(ingr | application): ingredient frequencies inside this application.
    application_counts = collections.Counter(application)
    application_size = len(application)

    rows = []
    for ingredient, count in application_counts.items():
        p_ingr_app = count / application_size
        p_ingr = corpus_counts[ingredient] / corpus_size if corpus_size else 0.0

        denominator = 1 - p_ingr
        if denominator:
            score = p_ingr_app * ((p_ingr_app - p_ingr) / denominator)
        else:
            # Ingredient is the whole corpus: it carries no distinguishing information.
            score = 0.0

        rows.append([ingredient, score])

    headers = ["ingredient", 'score']
    return pd.DataFrame(rows, columns=headers).sort_values(by='score', ascending=False)

### Define ingredient classifier

In [5]:
from nltk.corpus import wordnet as wn
def any_in(a, b):
    """Return True when at least one element of `a` is contained in `b`."""
    return any(i in b for i in a)
        
def find_food(candidate,language):
    """Tell whether `candidate` looks like a food item according to WordNet.

    Every synset of `candidate` in the given WordNet language is looked up and
    its full hypernym closure is walked. The word part of each ancestor's
    synset name (the text before the first '.') is collected; the candidate
    counts as food when any collected word is 'food', 'plant' or 'ingredient'.

    Parameters
    ----------
    candidate : str
        Word to classify.
    language : str
        WordNet language code passed as `lang` to `wn.synsets`.

    Returns
    -------
    bool
        True when at least one ancestor word matches, False otherwise
        (including when the word has no synsets).
    """
    food_markers = ['food','plant','ingredient']
    ancestor_words = []

    for sense in wn.synsets(candidate,lang=language):
        start = wn.synset(sense.name())
        # closure() yields every synset reachable upwards through hypernyms.
        for ancestor in start.closure(lambda s: s.hypernyms()):
            ancestor_words.append(ancestor.name().split('.')[0])

    return any(word in food_markers for word in ancestor_words)

### Test on sample dataset

In [6]:
# Folder holding the sample recipe text files (machine-specific absolute path).
recipe_path = r'C:\Users\peter\Documents\GitHub\datascience\nlp\French NLP\marmitton'

# The ingredients that an application must all contain to be of interest.
combo_of_interest =  ['beurre','sel']

# Food-lemma lists of every recipe that mentions the complete combo.
applications_of_interest = []

for filename in os.listdir(recipe_path):
    with open(os.path.join(recipe_path,filename),'r') as f:
        txt = f.read()

        # Both taggers are run; only the TreeTagger nouns are used further down.
        tokens_stanford_noun, tokens_stanford_adj = extract_tokens_noun_adj(txt)
        lemmas_treetagger_noun, lemmas_treetagger_adj = extract_lemma_noun_adj(txt)

        # Narrow the noun lemmas down to those classified as food.
        lemmas_food = [candidate for candidate in lemmas_treetagger_noun if find_food(candidate,'fra')]

        # Keep the recipe only when every ingredient of the combo is present.
        if set(combo_of_interest).issubset(lemmas_food):
            applications_of_interest.append(lemmas_food)

# Score the ingredients of each retained application against all retained applications.
for application in applications_of_interest:
    print(calc_metric(application,applications_of_interest))
        

    ingredient     score
2       beurre  0.008986
1   champignon  0.007188
7     bouillon  0.005213
3          riz  0.003162
4        purée  0.003162
10         jus  0.003162
11      citron  0.003162
0       poulet  0.002500
6       farine  0.002500
12       sauce  0.001135
8        poule  0.000782
14      poivre  0.000782
5         roux  0.000104
9          eau  0.000104
15      viande -0.000968
13         sel -0.001722
   ingredient     score
6      viande  0.003308
12        œuf  0.002698
13      crème  0.002698
3         eau  0.002075
4      légume  0.002075
8        roux  0.002075
14   moutarde  0.001439
18        sel  0.001320
15   bouillon  0.000790
0      épaule  0.000668
16    mélange  0.000668
7   casserole  0.000668
11      jaune  0.000668
1        veau  0.000352
19     poulet  0.000031
9      farine  0.000031
5      oignon  0.000031
2        cube  0.000031
10     beurre -0.000552
17      sauce -0.000634
   ingredient     score
6        pain  0.004246
7        bœuf  0.004246

In [7]:
# Quick check of the classifier on an English word.
find_food('coffee','eng')

True

#### Info: synset languages:

In [8]:
# List the language codes known to the WordNet reader (values usable as find_food's `language`).
sorted(wn.langs())

['als',
 'arb',
 'bul',
 'cat',
 'cmn',
 'dan',
 'ell',
 'eng',
 'eus',
 'fas',
 'fin',
 'fra',
 'glg',
 'heb',
 'hrv',
 'ind',
 'ita',
 'jpn',
 'nno',
 'nob',
 'pol',
 'por',
 'qcn',
 'slv',
 'spa',
 'swe',
 'tha',
 'zsm']

### Process Marmiton subset

In [70]:
from pymongo import MongoClient
# Connect to the local MongoDB server and select the 'local' database.
client = MongoClient('localhost:27017')
db = client['local']

# Keep documents whose ingredients.by_line exists and is not an empty array,
# and project only that field.
pipeline = [
    {"$match": {"ingredients.by_line" : {"$exists" : True, "$not" : {"$size": 0}}}},
    {'$project': { 'ingredients.by_line' : 1 ,'_id':0}}
]
results = db.marmiton_local.aggregate(pipeline)

# The ingredients that an application must all contain to be of interest.
combo_of_interest =  ['tomate','oignon']

# Food-lemma lists of every recipe that mentions the complete combo.
applications_of_interest = []

for result in results:
    # One text per recipe: its ingredient lines joined with spaces.
    text = " ".join(result["ingredients"]['by_line'])

    lemmas_treetagger_noun, lemmas_treetagger_adj = extract_lemma_noun_adj(text)

    # Narrow the noun lemmas down to those classified as food.
    lemmas_food = [candidate for candidate in lemmas_treetagger_noun if find_food(candidate,'fra')]

    # Keep the recipe only when every ingredient of the combo is present.
    if set(combo_of_interest).issubset(lemmas_food):
        applications_of_interest.append(lemmas_food)
        

   


For every application of interest, calculate the metric that identifies the 'most relevant' ingredients of that application:


In [72]:
# One score table per retained application, each scored against all retained applications.
metrics = [
    calc_metric(application,applications_of_interest)
    for application in applications_of_interest
]


### Min-max scaling on the metrics

In [97]:
# Pool every per-application table to find the global score extremes.
full_metrics = pd.concat(metrics)
score_column = full_metrics["score"]
max_score = score_column.max()
min_score = score_column.min()

print(max_score)
print(min_score)

# Rescale each table with the same global bounds so that scores stay
# comparable across applications; the column is added to the tables in place.
score_range = max_score - min_score
for metric in metrics:
    metric['normalized'] = (metric["score"] - min_score) / score_range

0.112634919753
-0.00211520365081


In [100]:
# Inspect the score table of a single application (position 599 in `metrics`).
print(metrics[599])

  ingredient     score  normalized
0       coco  0.039751    0.364849
4     poivre  0.033918    0.314014
3        sel  0.030150    0.281180
2     oignon  0.026449    0.248923
1     tomate  0.024590    0.232721


In [101]:
# Inspect the score table of another application (position 300 in `metrics`) for comparison.
print(metrics[300])

   ingredient     score  normalized
11     cerise  0.006819    0.077855
4      lardon  0.006742    0.077183
7    moutarde  0.006714    0.076939
9       herbe  0.006502    0.075099
1         eau  0.006439    0.074545
8     basilic  0.006361    0.073867
0      farine  0.006276    0.073125
2      beurre  0.005725    0.068328
5       olive  0.003560    0.049461
3         sel  0.002242    0.037970
6      oignon  0.000475    0.022570
10     tomate -0.000413    0.014834


### Score offset
This allows an ingredient like 'coco', which appears in an application where the ingredients of interest are very prominent, to be ranked higher than, for example, 'cerise', which tops an application where the ingredients of interest are less prominent.

In [117]:
# Take the two reference ingredients from combo_of_interest instead of repeating
# their names here, so this cell keeps working when the combo is changed.
# Unpacking raises ValueError if the combo does not hold exactly two ingredients.
first_of_combo, second_of_combo = combo_of_interest

for metric in metrics:
    # Normalized score of each reference ingredient within this application.
    # Both are present in every table because only applications containing the
    # full combo were retained.
    first_value = metric.loc[metric['ingredient'] == first_of_combo, 'normalized'].values[0]
    second_value = metric.loc[metric['ingredient'] == second_of_combo, 'normalized'].values[0]

    # Weight every ingredient by how prominent the combo is in this application.
    # NOTE(review): the denominator is not symmetric in the two ingredients
    # (1 + first - second); if the intent was to penalise imbalance,
    # abs(first - second) may have been meant. Confirm before changing.
    metric["ofset_score"] = metric['normalized'] * (first_value + second_value) / (1 + first_value - second_value)
    

In [132]:
# Inspect application 599 again, now with the ofset_score column added.
print(metrics[599])

  ingredient     score  normalized  ofset_score
0       coco  0.039751    0.364849     0.178621
4     poivre  0.033918    0.314014     0.153734
3        sel  0.030150    0.281180     0.137659
2     oignon  0.026449    0.248923     0.121867
1     tomate  0.024590    0.232721     0.113934


### Group and sort
to obtain the ingredients that best match 'tomate' and 'oignon'.
The group-by is necessary because an ingredient such as 'pain' appears in several applications and therefore multiple times in the concatenated list.

In [131]:
# Pool all tables, keep the maximum of each column per ingredient, and rank by
# offset score. The maximum is taken per column, so the three values on one row
# may come from different applications.
full_metrics = (
    pd.concat(metrics)
    .groupby(['ingredient'])
    .max()
    .sort_values(by = 'ofset_score',ascending = False)
)

In [130]:
# Show the first 100 rows of the grouped table.
print(full_metrics.head(100))

               score  normalized  ofset_score
ingredient                                   
oignon      0.046620    0.424704     0.324472
chevreuil   0.039985    0.366889     0.179620
poulpe      0.039956    0.366635     0.179495
lentille    0.039927    0.366380     0.179371
mayonnaise  0.039824    0.365487     0.178934
steak       0.039795    0.365232     0.178809
coco        0.039751    0.364849     0.178621
lard        0.039678    0.364210     0.178308
ciboulette  0.039649    0.363954     0.178183
fromage     0.039310    0.361006     0.176740
pain        0.039089    0.359077     0.175795
sauce       0.038897    0.357401     0.174975
pomme       0.038704    0.355720     0.174152
carotte     0.038287    0.352087     0.172374
œuf         0.038093    0.350394     0.171545
piment      0.038018    0.349742     0.171225
thym        0.037958    0.349220     0.170970
bouquet     0.037838    0.348174     0.170458
vin         0.037099    0.341736     0.167306
poivron     0.070106    0.629375  