# Test # 1 Word Segmentation
**Tomado de:** Norvig, P.: Natural language corpus data. In: Beautiful Data, pp. 219–242 (2009)

URL
* https://norvig.com/ngrams/
* https://norvig.com/ngrams/ch14.pdf

In [1]:
import re, string, random, glob, operator, heapq, functools
from collections import defaultdict
from math import log10

In [2]:
def memo(f):
    "Memoize function f."
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table
    return fmemo

### 1st Version

In [3]:
@memo
def segment(text):
    "Return a list of words that is the best segmentation of text."
    if not text: return []
    candidates = ([first]+segment(rem) for first,rem in splits(text))    
    return max(candidates, key=Pwords)

def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first)<=L."
    return [(text[:i+1], text[i+1:]) 
            for i in range(min(len(text), L))]

def Pwords(words): 
    "The Naive Bayes probability of a sequence of words."
    return product(Pw(w) for w in words)

In [4]:
#### Support functions (p. 224)

def product(nums):
    "Return the product of a sequence of numbers."
    return functools.reduce(operator.mul, nums, 1)

class Pdist(dict):
    "A probability distribution estimated from counts in datafile."
    def __init__(self, data=[], N=None, missingfn=None):
        for key,count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.itervalues()))
        self.missingfn = missingfn or (lambda k, N: 1./N)
    def __call__(self, key): 
        if key in self: return self[key]/self.N  
        else: return self.missingfn(key, self.N)

def datafile(name, sep='\t'):
    "Read key,value pairs from file."
    for line in open(name):
        yield line.split(sep)

def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    return 10./(N * 10**len(key))

In [26]:
N = 1024908267229 ## Number of tokens
Pw  = Pdist(datafile('count_1w.txt'), N, avoid_long_words)

In [27]:
segment('wheninthecourseofhumaneventsitbecomesnecessary')

['when',
 'in',
 'the',
 'course',
 'of',
 'human',
 'events',
 'it',
 'becomes',
 'necessary']

In [34]:
segment('inaholeinthegroundtherelivedahobbitnotanastydirtywetholefilledwiththeendsofwormsandanoozysmellnoryetadrybaresandyholewithnothinginittositdownonortoeatitwasahobbitholeandthatmeanscomfort')

['in',
 'a',
 'hole',
 'in',
 'the',
 'ground',
 'there',
 'lived',
 'a',
 'hobbit',
 'not',
 'a',
 'nasty',
 'dirty',
 'wet',
 'hole',
 'filled',
 'with',
 'the',
 'ends',
 'of',
 'worms',
 'and',
 'an',
 'oozy',
 'smell',
 'nor',
 'yet',
 'a',
 'dry',
 'bare',
 'sandy',
 'hole',
 'with',
 'nothing',
 'in',
 'it',
 'to',
 'sitdown',
 'on',
 'or',
 'to',
 'eat',
 'it',
 'was',
 'a',
 'hobbit',
 'hole',
 'and',
 'that',
 'means',
 'comfort']

### Spanish

In [9]:
samples.iloc[1][1]

'otraperiodista'

In [13]:
import sys
sys.path.insert(0, '../../../')

from classes.wordsegmentation import WordSegmentation

dir_ = "../../../data/v1/NER/"
file_segmentation = dir_+'spanish_count_1w_small_v2_twitter.txt'
segmentation = WordSegmentation(file_segmentation)

import pandas as pd
samples = pd.read_csv('output-words-generator-v2.csv')

result = []
for i in range(len(samples[['original']])):    
    pre = segmentation.segment(samples.iloc[i][1])
    text = ' '.join(pre)
    #result.append([samples.iloc[i][0],text])
    result.append(text)

samples['v3_twitter'] = result
samples.to_csv('output-words-generator-v3-twitter.csv')

In [12]:
result

['por la via x la vida',
 'otra periodista',
 'gen mas energia',
 'el divino nene',
 'transito policia',
 'cluster gastronomia',
 'invias oficial',
 'el mundo rueda x se',
 'me paso a la fm',
 'se humano',
 'romero vive',
 'vamos millos a la final',
 'mejor periodo',
 'conductor agresivo',
 'caracolradio',
 'transmiseria',
 'no voy al andino',
 'transi le ni o',
 'el crimen del siglo',
 'viajar y ganar',
 'adoptada',
 'feliz martes',
 'chinche mejia',
 'americas',
 'mal parqueada',
 'ayuno',
 'se juega a esta hora',
 'transito bogota d',
 'los 120 segundos del gato',
 'en desarrollo',
 'jorge antonio vega',
 'cronicas transmi',
 'jota volatil',
 'reportan',
 'sitpbogota',
 'temprano es mas bacano',
 'noticiasrcn',
 'canalrcn',
 'rappi colombia',
 'aun tengo hambre',
 'cesar flechas',
 'noctambulo city',
 'el tiempo',
 'otto gerardo',
 'policiabogota',
 'vivian salazar',
 'peluches quique sam',
 'fontibon',
 'steven arce',
 'la calera',
 'tm ahora',
 'bogota se mueve',
 'noticias capita

In [1]:
import pandas as pd
#samples = pd.read_csv('samples_word_segmentation.csv')
samples = pd.read_csv('output-words-generator-v2.csv')
#samples[['original']]

In [12]:
#N = 2495613020 # spanish_count_1w_small
#N = 2497358193 # spanish_count_1w_small_v2 sin acento
N = 2575683488 # spanish_count_1w_small_v2 sin acento con Twitter
Pw  = Pdist(datafile('spanish_count_1w_small_v2_twitter.txt'), N, avoid_long_words)

In [14]:
result = []
for i in range(len(samples[['original']])):
    pre = segment(samples.iloc[i][0])
    text = ' '.join(pre)
    #result.append([samples.iloc[i][0],text])
    result.append(text)

TypeError: object of type 'numpy.int64' has no len()

In [8]:
result

['por la via x la vida',
 'otra periodista',
 'gen mas energia',
 'el divino nene',
 'transito policia',
 'cluster gastronomia',
 'invias oficial',
 'el mundo rueda x se',
 'me paso a la fm',
 'se humano',
 'romero vive',
 'vamos millos a la final',
 'mejor periodo',
 'conductor agresivo',
 'caracolradio',
 'transmiseria',
 'no voy al andino',
 'transi le ni o',
 'el crimen del siglo',
 'viajar y ganar',
 'adoptada',
 'feliz martes',
 'chinche mejia',
 'americas',
 'mal parqueada',
 'ayuno',
 'se juega a esta hora',
 'transito bogota d',
 'los 120 segundos del gato',
 'en desarrollo',
 'jorge antonio vega',
 'cronicas transmi',
 'jota volatil',
 'reportan',
 'sitpbogota',
 'temprano es mas bacano',
 'noticiasrcn',
 'canalrcn',
 'rappi colombia',
 'aun tengo hambre',
 'cesar flechas',
 'noctambulo city',
 'el tiempo',
 'otto gerardo',
 'policiabogota',
 'vivian salazar',
 'peluches quique sam',
 'fontibon',
 'steven arce',
 'la calera',
 'tm ahora',
 'bogota se mueve',
 'noticias capita

In [9]:
samples['v2_twitter'] = result

In [32]:
samples

Unnamed: 0,original,word_segmentation,Correct,v2
0,porlaviaxlavida,por la via x la vida,1,por la via x la vida
1,otraperiodista,otra periodista,1,otra periodista
2,genmasenergia,gen mas energia,1,gen mas energia
3,eldivinonene,el divino nene,1,el divino nene
4,transitopolicia,transito policia,1,transito policia
...,...,...,...,...
290,numeral767,numeral 767,1,numeral 767
291,movilidadinteligente,movilidad inteligente,1,movilidad inteligente
292,cali,cali,1,cali
293,aguantelaempanada,aguante la empanada,1,aguante la empanada


In [10]:
samples.to_csv('output-words-generator-v2-twitter.csv')

In [32]:
dataframe = pd.DataFrame(result,columns=['original','word_segmentation'])

In [33]:
dataframe

Unnamed: 0,original,word_segmentation
0,porlaviaxlavida,por la via x la vida
1,otraperiodista,otra periodista
2,genmasenergia,gen mas energia
3,eldivinonene,el divino nene
4,claudia14175900,claudia 14175900
...,...,...
708,aetapi,a etap i
709,fontib,fonti b
710,d,d
711,aguantelaempanada,aguante la empanada


In [34]:
dataframe.to_csv('words-generator.csv')

In [6]:
#N = 1911392132
#Pw  = Pdist(datafile('datos-colombia.txt'), N, avoid_long_words)
#N = 2495613020 # spanish_count_1w_small
N = 2497358193 # spanish_count_1w_small_v3 sin acento
Pw  = Pdist(datafile('spanish_count_1w_small_v2.txt'), N, avoid_long_words)

In [9]:
segment('accionpoetica')

['accion', 'poetica']

## 2nd Version
Bi-gram

In [29]:
def cPw(word, prev):
    "Conditional probability of word, given previous word."
    try:
        return P2w[prev + ' ' + word]/float(Pw[prev])
    except KeyError:
        return Pw(word)

@memo 
def segment2(text, prev='<S>'): 
    "Return (log P(words), words), where words is the best segmentation." 
    if not text: return 0.0, [] 
    candidates = [combine(log10(cPw(first, prev)), first, *segment2(rem, first)) 
                  for first,rem in splits(text)] 
    return max(candidates) 

def combine(Pfirst, first, Prem, rem): 
    "Combine first and rem results into one (probability, words) pair." 
    return Pfirst+Prem, [first]+rem 

In [31]:
P2w = Pdist(datafile('count_2w.txt'), N)

In [33]:
segment2('inaholeinthegroundtherelivedahobbitnotanastydirtywetholefilledwiththeendsofwormsandanoozysmellnoryetadrybaresandyholewithnothinginittositdownonortoeatitwasahobbitholeandthatmeanscomfort')

(-164.42574174168834,
 ['in',
  'a',
  'hole',
  'in',
  'the',
  'ground',
  'there',
  'lived',
  'a',
  'hobbit',
  'not',
  'a',
  'nasty',
  'dirty',
  'wet',
  'hole',
  'filled',
  'with',
  'the',
  'ends',
  'of',
  'worms',
  'and',
  'an',
  'oozy',
  'smell',
  'nor',
  'yet',
  'a',
  'dry',
  'bare',
  'sandy',
  'hole',
  'with',
  'nothing',
  'in',
  'it',
  'to',
  'sit',
  'down',
  'on',
  'or',
  'to',
  'eat',
  'it',
  'was',
  'a',
  'hobbit',
  'hole',
  'and',
  'that',
  'means',
  'comfort'])

In [36]:
segment2('wecouldincorporatemoredataandeitherkeepmoreentriesfromtheunigramorbigramdataorperhapsaddtrigramdata')

(-76.45118719416872,
 ['we',
  'could',
  'incorporate',
  'more',
  'data',
  'and',
  'either',
  'keep',
  'more',
  'entries',
  'from',
  'the',
  'unigram',
  'or',
  'bigram',
  'data',
  'or',
  'perhaps',
  'add',
  'trigram',
  'data'])

In [43]:
segment2('wonderworman')

(-12.205356599285366, ['wonder', 'worman'])