# Opciones de extracción de características utilizando Stop_Words en español

### Documentos

In [1]:
# Toy corpus: four short Spanish sentences, one document per list entry.
texto = [
    'Este es el primer documento.',
    'Este documento es el segundo.',
    'Y este es el tercero.',
    '¿Es este el primer documento?',
]

In [2]:
# Custom Spanish stop words (all lower-case) handed to each vectorizer via `stop_words=`.
mi_stop_words = ['este', 'es', 'el', 'y']

### Extracción con CountVectorizer
Construye una matriz de conteo de términos (una fila por documento, una columna por término).

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Count vectorizer that drops the custom Spanish stop words before counting terms.
cv = CountVectorizer(stop_words=mi_stop_words)

In [5]:
# Learn the vocabulary from the corpus and build the sparse term-count matrix in one call.
mat_cv = cv.fit_transform(texto)

In [6]:
# Show the sparse-matrix summary (shape, dtype, number of stored elements).
mat_cv

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [7]:
# Densify only for display; this is fine here because the example matrix is 4x4.
mat_cv.todense()

matrix([[1, 1, 0, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 1, 0, 0]], dtype=int64)

In [8]:
# Rows correspond to documents, columns to vocabulary terms.
mat_cv.shape

(4, 4)

In [9]:
# Mapping from each retained term to its column index in the count matrix.
cv.vocabulary_

{'primer': 1, 'documento': 0, 'segundo': 2, 'tercero': 3}

### Extracción con TfidfTransformer
TfidfTransformer se aplica sobre una matriz de conteo existente, como la devuelta por CountVectorizer.

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# TF-IDF transformer created with its default settings.
tfidf_tra = TfidfTransformer()

In [12]:
# Separate count vectorizer (same stop-word list as `cv`) that supplies the counts for the transformer.
cv_tra = CountVectorizer(stop_words=mi_stop_words)

In [13]:
# Raw term-count matrix that will be fed to TfidfTransformer.
mat_tra = cv_tra.fit_transform(texto)

In [14]:
# Show the sparse-matrix summary of the counts.
mat_tra

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [15]:
# Dense view of the counts, for display only.
mat_tra.todense()

matrix([[1, 1, 0, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 1, 0, 0]], dtype=int64)

In [16]:
# Fit the transformer on the counts and convert them to TF-IDF weights.
tfidf = tfidf_tra.fit_transform(mat_tra)

In [17]:
# Dense view of the TF-IDF weights, for display only.
tfidf.todense()

matrix([[0.62922751, 0.77722116, 0.        , 0.        ],
        [0.53802897, 0.        , 0.84292635, 0.        ],
        [0.        , 0.        , 0.        , 1.        ],
        [0.62922751, 0.77722116, 0.        , 0.        ]])

In [18]:
# Term -> column index mapping learned by `cv_tra`; the TF-IDF matrix keeps the same column order.
cv_tra.vocabulary_

{'primer': 1, 'documento': 0, 'segundo': 2, 'tercero': 3}

### Extracción con TfidfVectorizer
Hace lo mismo que CountVectorizer seguido de TfidfTransformer, pero en un solo paso.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Vectorizer that goes from raw text straight to TF-IDF weights, using the same stop-word list.
tfidf_v = TfidfVectorizer(stop_words=mi_stop_words)

In [21]:
# Learn the vocabulary and compute the TF-IDF matrix from the raw documents.
mat_tfidf = tfidf_v.fit_transform(texto)

In [22]:
# Show the sparse-matrix summary of the TF-IDF result.
mat_tfidf

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [23]:
# Dense view for display; the values match the CountVectorizer + TfidfTransformer result shown earlier.
mat_tfidf.todense()

matrix([[0.62922751, 0.77722116, 0.        , 0.        ],
        [0.53802897, 0.        , 0.84292635, 0.        ],
        [0.        , 0.        , 0.        , 1.        ],
        [0.62922751, 0.77722116, 0.        , 0.        ]])

In [24]:
# Term -> column index mapping learned by the TfidfVectorizer.
tfidf_v.vocabulary_

{'primer': 1, 'documento': 0, 'segundo': 2, 'tercero': 3}

### Ejemplo del uso de pipelines

In [25]:
from sklearn.pipeline import Pipeline

In [26]:
# Two-step pipeline: count the terms, then reweight the counts with TF-IDF.
canal = Pipeline(
    steps=[
        ('cv2', CountVectorizer(stop_words=mi_stop_words)),
        ('tfidf2', TfidfTransformer()),
    ]
)

In [27]:
# Fit every pipeline step on the corpus and return the output of the last one (the TF-IDF matrix).
resultados = canal.fit_transform(texto)

In [28]:
# Show the sparse-matrix summary of the pipeline output.
resultados

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [29]:
# Dense view of the pipeline output, for display only.
resultados.todense()

matrix([[0.62922751, 0.77722116, 0.        , 0.        ],
        [0.53802897, 0.        , 0.84292635, 0.        ],
        [0.        , 0.        , 0.        , 1.        ],
        [0.62922751, 0.77722116, 0.        , 0.        ]])

In [30]:
# Read the vocabulary from the vectorizer fitted inside the pipeline (step 'cv2'),
# not from the unrelated standalone `cv_tra` object created earlier.
canal.named_steps['cv2'].vocabulary_

{'primer': 1, 'documento': 0, 'segundo': 2, 'tercero': 3}