In [None]:
import nltk
nltk.download('book')
from nltk.book import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dependency_treebank.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


# Collocations (Colocaciones)

* Son secuencias de palabras que suelen ocurrir en textos o conversaciones con una **frecuencia inusualmente alta** [NLTK doc](http://www.nltk.org/book/ch01.html)
* Las colocaciones de una palabra son declaraciones formales de dónde suele ubicarse típicamente esa palabra [Manning & Schütze, 1999, Foundations of Statistical Natural Language Processing, Capítulo 5](https://nlp.stanford.edu/fsnlp/)

In [None]:
# Build a table with one row per distinct bigram of `text1`, holding the
# bigram's own count and the counts of its two member words.

# Tokens must be strictly longer than this many characters to be kept.
threshold = 2

md_bigrams = list(bigrams(text1))

# Distribution of bi-grams: keep only pairs where both tokens pass the length filter.
filtered_bigrams = [
    bigram
    for bigram in md_bigrams
    if len(bigram[0]) > threshold and len(bigram[1]) > threshold
]
filtered_bigram_dist = FreqDist(filtered_bigrams)

# Distribution of words, using the same length filter.
filtered_words = [word for word in text1 if len(word) > threshold]
filtered_word_dist = FreqDist(filtered_words)

# The FreqDist keys are exactly the distinct bigrams, in first-seen order.
# Iterating them (instead of `set(...)`) makes the row order identical on
# every run, regardless of string-hash randomisation.
unique_bigrams = list(filtered_bigram_dist)

df = pd.DataFrame({
    'bi_gram': unique_bigrams,
    'word_0': [bigram[0] for bigram in unique_bigrams],
    'word_1': [bigram[1] for bigram in unique_bigrams],
    'bi_gram_freq': [filtered_bigram_dist[bigram] for bigram in unique_bigrams],
    'word_0_freq': [filtered_word_dist[bigram[0]] for bigram in unique_bigrams],
    'word_1_freq': [filtered_word_dist[bigram[1]] for bigram in unique_bigrams],
})

In [None]:
# Preview the first ten rows of the bigram table.
df.head(n=10)

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq
0,"(over, empty)",over,empty,1,403,22
1,"(DOWN, FROM)",DOWN,FROM,1,2,10
2,"(generally, unknown)",generally,unknown,1,30,36
3,"(power, may)",power,may,1,37,230
4,"(been, sleeping)",been,sleeping,1,415,20
5,"(impairs, beauty)",impairs,beauty,1,1,8
6,"(brother, Bildad)",brother,Bildad,1,10,76
7,"(popular, sense)",popular,sense,1,8,14
8,"(began, slashing)",began,slashing,1,53,2
9,"(was, marking)",was,marking,1,1632,3


In [None]:
# Score every bigram and add the log2 of its raw count.
#
# NOTE: the score is computed from raw counts, log2(c(w0, w1) / (c(w0) * c(w1))),
# not from probabilities. It therefore differs from the textbook PMI by an
# additive constant that depends only on corpus size; the ranking of bigrams
# is the same, but the maximum attainable value here is 0.
#
# Column arithmetic is used instead of a row-wise `apply`, which yields the
# same values without a Python-level call per row.
bigram_counts = df['bi_gram_freq']
df['PMI'] = np.log2(bigram_counts / (df['word_0_freq'] * df['word_1_freq']))
df['log(bi_gram_freq)'] = np.log2(bigram_counts)
df

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq,PMI,log(bi_gram_freq)
0,"(over, empty)",over,empty,1,403,22,-13.114068,0.0
1,"(DOWN, FROM)",DOWN,FROM,1,2,10,-4.321928,0.0
2,"(generally, unknown)",generally,unknown,1,30,36,-10.076816,0.0
3,"(power, may)",power,may,1,37,230,-13.054943,0.0
4,"(been, sleeping)",been,sleeping,1,415,20,-13.018896,0.0
...,...,...,...,...,...,...,...,...
67937,"(plentiful, than)",plentiful,than,1,1,309,-8.271463,0.0
67938,"(such, words)",such,words,1,336,28,-13.199672,0.0
67939,"(either, whale)",either,whale,1,39,906,-15.108769,0.0
67940,"(you, some)",you,some,2,841,578,-17.890888,1.0


In [None]:
# Rank the bigrams from the highest to the lowest PMI score.
df.sort_values(by='PMI', ascending=False)

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq,PMI
2974,"(magniloquent, ascriptions)",magniloquent,ascriptions,1,1,1,0.000000
3374,"(Matse, Avatar)",Matse,Avatar,1,1,1,0.000000
51746,"(omnisciently, exhaustive)",omnisciently,exhaustive,1,1,1,0.000000
32221,"(Orion, glitters)",Orion,glitters,1,1,1,0.000000
40005,"(Ephesian, sod)",Ephesian,sod,1,1,1,0.000000
...,...,...,...,...,...,...,...
30154,"(man, the)",man,the,1,508,13721,-22.732783
20521,"(some, the)",some,the,1,578,13721,-22.919024
15157,"(one, the)",one,the,1,889,13721,-23.540138
50884,"(the, not)",the,not,1,13721,1103,-23.851315


Estos datos nos presentan un pequeño problema: a pesar de que el PMI de estos bigramas es bastante cercano a cero, las frecuencias tan bajas de sus palabras nos hacen pensar que es necesario establecer una nueva métrica que sirva como filtro adicional y nos permita saber cuáles bigramas aparecen con más frecuencia y, a la vez, tienen un PMI cercano a cero.


CONSTRUCCIÓN DEL GRÁFICO INTERACTIVO

In [None]:
# Interactive scatter plot: PMI on the x-axis, log2 bigram count on the y-axis.
# Each point is coloured by the sum of both scores and shows its bigram on hover.
pmi_scores = df['PMI']
log_counts = df['log(bi_gram_freq)']

fig = px.scatter(
    x=pmi_scores,
    y=log_counts,
    color=pmi_scores + log_counts,
    hover_name=df['bi_gram'].values,
    width=600,
    height=600,
    labels={'x': 'PMI', 'y': 'Log(bi_gram_freq)'},
)
fig.show()

## Medidas pre-construidas en NLTK



In [None]:
from nltk.collocations import *

# NLTK ships ready-made association measures; PMI is one of them.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Build a collocation finder directly from the word sequence of the text.
finder = BigramCollocationFinder.from_words(text1)

# Discard every bigram that occurs fewer than 20 times.
finder.apply_freq_filter(20)

# Show the 10 best bigrams according to the PMI measure.
finder.nbest(bigram_measures.pmi, 10)

[('Moby', 'Dick'),
 ('Sperm', 'Whale'),
 ('White', 'Whale'),
 ('Right', 'Whale'),
 ('Captain', 'Peleg'),
 (',"', 'said'),
 ('never', 'mind'),
 ('!"', 'cried'),
 ('no', 'means'),
 ('each', 'other')]

## TEXTOS EN ESPAÑOL

In [None]:
# Download the Spanish `cess_esp` corpus package and read it as a sequence of sentences.
nltk.download('cess_esp')
corpus = nltk.corpus.cess_esp.sents()

# Flatten the sentences into a single list of tokens.
flatten_corpus = [token for sentence in corpus for token in sentence]


[nltk_data] Downloading package cess_esp to /root/nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!


In [None]:
# Print the first 50 tokens of the flattened corpus as a quick sanity check.
leading_tokens = flatten_corpus[:50]
print(leading_tokens)

['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana', 'Electricidad_Águila_de_Altamira', '-Fpa-', 'EAA', '-Fpt-', ',', 'creada', 'por', 'el', 'japonés', 'Mitsubishi_Corporation', 'para', 'poner_en_marcha', 'una', 'central', 'de', 'gas', 'de', '495', 'megavatios', '.', 'Una', 'portavoz', 'de', 'EDF', 'explicó', 'a', 'EFE', 'que', 'el', 'proyecto']


In [None]:
# Build the finder from the sentence-segmented corpus, passing each sentence
# as one document.
finder = BigramCollocationFinder.from_documents(corpus)

# Discard every bigram that occurs fewer than 10 times.
finder.apply_freq_filter(10)

# Show the 10 best bigrams according to the PMI measure.
finder.nbest(bigram_measures.pmi, 10)

[('señora', 'Aguirre'),
 ('secretario', 'general'),
 ('elecciones', 'generales'),
 ('campaña', 'electoral'),
 ('quiere', 'decir'),
 ('Se', 'trata'),
 ('segunda', 'vuelta'),
 ('director', 'general'),
 ('primer', 'ministro'),
 ('primer', 'lugar')]