# Extracción de Características desde Texto

## Extracción Manual

In [1]:
with open('One.txt') as texto:
    print(texto.read())

This is a story about dogs
our canine pets
Dogs are furry animals



In [2]:
with open('Two.txt') as texto:
    print(texto.read())

This story is about surfing
Catching waves is fun
Surfing is a popular water sport



Leer los datos completos como cadena

In [3]:
with open('One.txt') as texto:
    texto_completo = texto.read()

In [4]:
texto_completo

'This is a story about dogs\nour canine pets\nDogs are furry animals\n'

In [5]:
print(texto_completo)

This is a story about dogs
our canine pets
Dogs are furry animals



Obtener palabras separadas

In [6]:
with open('One.txt') as texto:
    palabras = texto.read().lower().split()

In [7]:
palabras

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

Construcción de vocabulario

In [8]:
with open('One.txt') as texto:
    palabras_one = texto.read().lower().split()

In [9]:
len(palabras_one)

13

In [10]:
uni_palabras_one = set(palabras_one)

In [11]:
uni_palabras_one

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [12]:
len(uni_palabras_one)

12

In [13]:
with open('Two.txt') as texto:
    palabras_two = texto.read().lower().split()
    uni_palabras_two = set(palabras_two)

In [14]:
uni_palabras_two

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

Obtener palabras únicas en todos los documentos

In [15]:
pal_uni_ambas = set()
pal_uni_ambas.update(uni_palabras_one)
pal_uni_ambas.update(uni_palabras_two)

In [16]:
pal_uni_ambas

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [17]:
vocabulario = dict()
i=0
for palabra in pal_uni_ambas:
    vocabulario[palabra] = i
    i=i+1

In [18]:
vocabulario

{'fun': 0,
 'furry': 1,
 'are': 2,
 'popular': 3,
 'water': 4,
 'our': 5,
 'sport': 6,
 'waves': 7,
 'dogs': 8,
 'story': 9,
 'pets': 10,
 'this': 11,
 'about': 12,
 'surfing': 13,
 'a': 14,
 'canine': 15,
 'is': 16,
 'animals': 17,
 'catching': 18}

Conteo de Frecuencia de cada palabra

In [None]:
# Crear vectores inicializados en cero para contar las palabras

In [20]:
frec_one = [0]*len(vocabulario)
frec_two = [0]*len(vocabulario)
palabras_todas = ['']*len(vocabulario)

In [21]:
print(frec_one)
print(frec_two)
print(palabras_todas)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [22]:
for palabra in vocabulario:
    indice_palabra = vocabulario[palabra]
    palabras_todas[indice_palabra] = palabra

In [23]:
print(palabras_todas)

['fun', 'furry', 'are', 'popular', 'water', 'our', 'sport', 'waves', 'dogs', 'story', 'pets', 'this', 'about', 'surfing', 'a', 'canine', 'is', 'animals', 'catching']


In [24]:
# Contar las palabras de cada documento
with open('One.txt') as texto:
    texto_one = texto.read().lower().split()

for palabra in texto_one:
    indice_palabra = vocabulario[palabra]
    frec_one[indice_palabra] = frec_one[indice_palabra]+1

In [25]:
print(frec_one)

[0, 1, 1, 0, 0, 1, 0, 0, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]


In [26]:
# Contar las palabras de cada documento
with open('Two.txt') as texto:
    texto_two = texto.read().lower().split()

for palabra in texto_two:
    indice_palabra = vocabulario[palabra]
    frec_two[indice_palabra] = frec_two[indice_palabra]+1

In [27]:
print(frec_two)

[1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 2, 1, 0, 3, 0, 1]


In [28]:
import pandas as pd

In [29]:
pd.DataFrame(data=[frec_one,frec_two],columns=palabras_todas)

Unnamed: 0,fun,furry,are,popular,water,our,sport,waves,dogs,story,pets,this,about,surfing,a,canine,is,animals,catching
0,0,1,1,0,0,1,0,0,2,1,1,1,1,0,1,1,1,1,0
1,1,0,0,1,1,0,1,1,0,1,0,1,1,2,1,0,3,0,1


## Extracción con herramientas scikit_learn

In [30]:
texto = ['This is a line',
        'This is another line',
        'Completely different line']

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
vc = CountVectorizer()

In [33]:
vc.fit_transform(texto)

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [34]:
mat_disp = vc.fit_transform(texto)

In [35]:
mat_disp

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [36]:
mat_disp.todense()

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]], dtype=int64)

In [37]:
vc.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

Con stop words

In [38]:
vc = CountVectorizer(stop_words='english')

In [39]:
vc.fit_transform(texto).todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]], dtype=int64)

In [40]:
vc.vocabulary_

{'line': 2, 'completely': 0, 'different': 1}

Uso de TfidfTransformer

In [41]:
transformador_tfidf = TfidfTransformer()

In [42]:
vc = CountVectorizer()

In [43]:
conteo = vc.fit_transform(texto)

In [44]:
conteo

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [45]:
conteo.todense()

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]], dtype=int64)

In [46]:
tfidf = transformador_tfidf.fit_transform(conteo)

In [47]:
tfidf

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [48]:
tfidf.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

In [49]:
vc.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}