## French NLP function tests

### Basic imports

In [2]:
import os
import nltk
from nltk import FreqDist

### Download NLTK

In [3]:
#nltk.download()

### Import dataset

In [4]:
# Absolute, machine-specific location of a text file under a `corpus\french` folder
# (presumably a French Wikipedia dump - TODO confirm). Adjust to your own setup.
datapath = r'C:\Users\peter\Documents\Tools\corpus\french\wikipedia.txt'

### Stanford POS tagger

In [5]:
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag

# Root folder of the unpacked Stanford POS tagger distribution (machine-specific).
homepath = r'C:\Users\peter\Documents\Tools\stanford-postagger-full-2017-06-09'

# Pass each path component separately so os.path.join inserts the separator
# itself, instead of hard-coding a Windows backslash inside one component.
taggerpath = os.path.join(homepath, 'models', 'french.tagger')
jarpath = os.path.join(homepath, 'stanford-postagger.jar')

# French POS tagger instance; reused by later cells through the name `st`.
st = POS_Tag(taggerpath, jarpath)

V			V		indicative verb

VS			V		subjunctive verb

VINF		V		infinitival verb

VPP			V		past participle

VPR			V		present participle

VIMP		V		imperative verb

NC			N		common noun

NPP			N		proper noun

CS			C		subordinating conjunction (complementizer)

CC			C		coordinating conjunction

CLS			CL		nominative clitic

CLO			CL		accusative or dative clitic

CLR			CL		reflexive clitic (whether its interpretation is truly reflexive or not)

P			P		non amalgamated preposition

P+D			P+D		prep+determiner amalgam

P+PRO		P+PRO	prep+relative pronoun amalgam (auquel (to which))

I			I		interjection

PONCT		PONCT		punctuation

ET			ET		foreign words, with a POS that is not guessable from context

ADJ			A		non interrogative adjectives

ADJWH		A		interrogative adjectives

ADV			ADV		non interrogative adverbs

ADVWH		ADV		interrogative adverbs

PRO			PRO		neither relative nor interrogative pronouns

PROREL		PRO		relative pronouns

PROWH		PRO		interrogative pronouns

DET			D		non interrogative nor relative determiners

DETWH		D		interrogative or relative determiners

### Lemmas
Lemmatization might prove a bit difficult, so let's try TreeTagger, which supposedly both tags and lemmatizes the text:

In [28]:
import os
import pprint
import treetaggerwrapper
# French TreeTagger instance; it both POS-tags and lemmatizes raw text.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

# One list of lower-cased noun/adjective lemmas per recipe file.
lemma_docs = []

# Point JAVAHOME at the local Java binary before the Stanford tagger (`st`) is
# used below. NOTE(review): presumably this is how the NLTK wrapper locates
# Java - confirm against the NLTK version in use.
java_path = r"C:\Program Files (x86)\Java\jre1.8.0_144\bin\java.exe"
os.environ['JAVAHOME'] = java_path


recipe_path = r'C:\Users\peter\Documents\GitHub\datascience\nlp\French NLP\marmitton'

# os.listdir() returns entries in arbitrary order; sort them so that the order of
# `lemma_docs` (and of everything derived from it) is reproducible across runs.
for filename in sorted(os.listdir(recipe_path)):
    filepath = os.path.join(recipe_path, filename)

    # Skip sub-directories and other non-file entries instead of crashing in open().
    if not os.path.isfile(filepath):
        continue

    # NOTE(review): no explicit encoding is given, so the platform default is used;
    # confirm the recipe files are stored in that encoding.
    with open(filepath, 'r') as f:
        recipe_text = f.read()

    # tokenize and tag using Stanford POS tagger:
    # keep nouns (any tag starting with 'N') and adjectives ('ADJ').
    # startswith() also tolerates an empty tag, unlike indexing with [0].
    tokenized_text = nltk.tokenize.word_tokenize(recipe_text)
    tagged_text = st.tag(tokenized_text)
    tokens_stanford = [
        word.lower()
        for word, pos in tagged_text
        if pos.startswith('N') or pos == 'ADJ'
    ]

    # tokenize, tag and lemmatize using TreeTagger:
    # make_tags() may yield entries without `pos`/`lemma` attributes for lines it
    # cannot parse; getattr() filters those out instead of raising AttributeError.
    tags = treetaggerwrapper.make_tags(tagger.tag_text(recipe_text))
    lemmas = [
        tag.lemma.lower()
        for tag in tags
        if getattr(tag, 'pos', None) in ('NOM', 'ADJ')
    ]

    # for now, only keep the TreeTagger output, which seems to perform better anyway
    lemma_docs.append(lemmas)

### Apply metric

In [29]:
import collections
import numpy as np
import pandas as pd


def calc_metric(application,applications):
    """Score how characteristic each ingredient is of one application.

    For every distinct token in ``application`` the score is::

        P(ingr|app) * (P(ingr|app) - P(ingr)) / (1 - P(ingr))

    where ``P(ingr|app)`` is the token's relative frequency inside
    ``application`` and ``P(ingr)`` is its relative frequency over all tokens
    of all ``applications`` taken together.

    Parameters
    ----------
    application : sequence of hashable
        Tokens (e.g. lemmas) of the document being scored.
    applications : iterable of iterables of hashable
        Tokens of every document in the corpus, used as the background
        distribution.

    Returns
    -------
    pandas.DataFrame
        Columns ``"ingredient"`` and ``"score"``, sorted by ``score`` in
        descending order. Empty (but with both columns) when ``application``
        is empty.

    Notes
    -----
    When a token makes up the entire corpus (``P(ingr) == 1``) the formula's
    denominator is zero; such a token cannot discriminate between documents,
    so its score is defined as ``0.0`` instead of raising
    ``ZeroDivisionError``. An empty corpus is treated as ``P(ingr) == 0``.
    """

    # calc P(ingr): background counts over the whole corpus

    flat_list = [item for sublist in applications for item in sublist]
    cnt_freqdist = collections.Counter(flat_list)
    corpus_size = len(flat_list)

    metrics = []

    # calc P(ingr|application): counts inside the scored document

    application_freqdist = collections.Counter(application)
    application_size = len(application)

    for ingredient, count_in_app in application_freqdist.items():

        p_ingr_app = (count_in_app / application_size)
        # Guard against an empty corpus: no background evidence at all.
        p_ingr = (cnt_freqdist[ingredient] / corpus_size) if corpus_size else 0.0

        denominator = 1 - p_ingr
        if denominator == 0:
            # The token is the only token in the corpus: no discriminative power.
            metric = 0.0
        else:
            metric = p_ingr_app * ((p_ingr_app - p_ingr) / denominator)

        metrics.append([ingredient,metric])

    headers = ["ingredient",'score']
    df = pd.DataFrame(metrics, columns=headers).sort_values(by = 'score',ascending = False)
    return df

In [30]:
# Score every document against the whole corpus and show its ranked table.
for application in lemma_docs:
    ranked_scores = calc_metric(application, lemma_docs)
    print(ranked_scores)


     ingredient     score
0         etape  0.005901
11       beurre  0.002313
3         petit  0.002187
7    champignon  0.001507
20     bouillon  0.001130
25       citron  0.000668
15          riz  0.000668
19        litre  0.000668
24          jus  0.000605
16        purée  0.000605
18       farine  0.000542
8      foi|fois  0.000478
2        poulet  0.000414
29        sauce  0.000350
4       morceau  0.000221
21        poule  0.000167
28     rajouter  0.000167
27      grumeau  0.000167
26       poêlon  0.000167
38    garniture  0.000167
6         temps  0.000167
34      bouchée  0.000167
35  préparation  0.000167
14      cuisson  0.000167
13          fin  0.000167
12        cesse  0.000167
10        faire  0.000167
9          tête  0.000167
37          min  0.000167
31       poivre  0.000135
5       pendant  0.000135
30          sel  0.000135
1         blanc  0.000135
23    bouillant  0.000103
17         roux  0.000103
39          bol  0.000071
33          feu  0.000038
22          