In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Corpus etiquetado

In [2]:
# Document labels. Each label doubles as a file stem: the text of a document
# is read from data/<label>.txt, so this single list fixes both the row order
# of the corpus and the category assigned to each row.
etiquetas = ["shakespeare_the_merchant_of_venice",
             "shakespeare_romeo_juliet",
             "shakespeare_hamlet",
             "dickens_a_christmas_carol",
             "dickens_oliver_twist",
             "dickens_a_tale_of_two_cities"]

# Read every document in full. The context manager closes the handle even if
# read() raises, which the previous manual open()/close() pair did not.
# NOTE(review): no encoding is passed, so decoding uses the platform default;
# pin it explicitly once the encoding of the data files is confirmed.
documentos = []
for etiqueta in etiquetas:
    with open(f"data/{etiqueta}.txt", "r") as archivo:
        documentos.append(archivo.read())

# `corpus` is a NumPy array of the raw document strings, aligned with
# `etiquetas` position by position.
corpus = np.array(documentos)
df_corpus = pd.DataFrame({"documento": corpus,
                          "categoria": etiquetas})
df_corpus

Unnamed: 0,documento,categoria
0,"project,gutenberg,ebook,merchant,venice,willia...",shakespeare_the_merchant_of_venice
1,"project,gutenberg,ebook,romeo,juliet,william,s...",shakespeare_romeo_juliet
2,"project,gutenberg,ebook,hamlet,william,shakesp...",shakespeare_hamlet
3,"project,gutenberg,ebook,christmas,carol,charle...",dickens_a_christmas_carol
4,"project,gutenberg,ebook,oliver,twist,charles,d...",dickens_oliver_twist
5,"project,gutenberg,ebook,tale,two,city,charles,...",dickens_a_tale_of_two_cities


# Modelo de Bolsa de Palabras

In [3]:
# Bag-of-words model, produced as a sparse matrix.
# min_df=0.0 and max_df=1.0 are document-frequency proportions covering the
# whole range, so no term is dropped for being too rare or too common.
count_vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
matriz_conteo = count_vectorizer.fit_transform(corpus)
matriz_conteo

<6x15164 sparse matrix of type '<class 'numpy.int64'>'
	with 30424 stored elements in Compressed Sparse Row format>

In [4]:
# Show the non-zero entries of the sparse matrix; print() is used on purpose
# because the bare expression only displays the matrix summary.
print(matriz_conteo)

  (0, 10315)	90
  (0, 6108)	99
  (0, 4317)	20
  (0, 8386)	20
  (0, 14294)	45
  (0, 14871)	4
  (0, 11781)	5
  (0, 1652)	8
  (0, 8214)	17
  (0, 11189)	18
  (0, 59)	2
  (0, 7745)	36
  (0, 13227)	12
  (0, 567)	5
  (0, 7486)	1
  (0, 1783)	11
  (0, 9390)	5
  (0, 1065)	1
  (0, 4611)	1
  (0, 6611)	1
  (0, 14849)	1
  (0, 183)	1
  (0, 13475)	8
  (0, 85)	7
  (0, 1710)	6
  :	:
  (5, 8286)	1
  (5, 10631)	1
  (5, 4475)	1
  (5, 5110)	1
  (5, 7418)	1
  (5, 2381)	1
  (5, 8105)	1
  (5, 829)	1
  (5, 2223)	1
  (5, 3206)	1
  (5, 3277)	1
  (5, 1047)	1
  (5, 4837)	1
  (5, 6348)	1
  (5, 244)	2
  (5, 11380)	1
  (5, 9754)	1
  (5, 14795)	1
  (5, 13303)	1
  (5, 13858)	1
  (5, 8390)	1
  (5, 11826)	1
  (5, 9584)	1
  (5, 4822)	1
  (5, 696)	1


In [5]:
# View the dense representation.
# `matriz_conteo` is rebound from the sparse matrix to a dense array; a later
# cell builds a DataFrame from it under this same name. The guard keeps the
# cell re-runnable: once the name already holds the dense array there is no
# `.toarray()` method left to call, and the unguarded version would raise
# AttributeError on a second execution.
if hasattr(matriz_conteo, "toarray"):
    matriz_conteo = matriz_conteo.toarray()
matriz_conteo

array([[1, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 1, 0, 2],
       [3, 0, 1, ..., 2, 1, 0],
       [1, 1, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [15]:
# Collect every unique term found in the corpus; it is used below as the
# column labels of the count matrix.
vocabulario = count_vectorizer.get_feature_names_out()

# Display the feature vector of each document: one row per label, one column
# per vocabulary term.
pd.DataFrame(
    data=matriz_conteo,
    index=etiquetas,
    columns=vocabulario,
)

Unnamed: 0,000,00033,04,08,0em,10,100,101,102,103,...,yourn,youth,youthful,youthfulness,zeal,zealous,zenith,zip,zone,zounds
shakespeare_the_merchant_of_venice,1,0,0,0,2,0,0,0,0,0,...,0,8,1,0,1,0,0,0,0,0
shakespeare_romeo_juliet,3,0,0,1,0,2,1,0,0,0,...,0,6,3,0,0,0,0,1,0,2
shakespeare_hamlet,3,0,1,0,0,2,3,0,0,0,...,0,14,0,0,0,0,0,2,1,0
dickens_a_christmas_carol,1,1,0,0,0,2,2,1,1,1,...,0,1,0,0,1,0,0,1,0,0
dickens_oliver_twist,1,0,0,0,2,0,0,0,0,0,...,0,9,6,0,0,2,1,0,0,0
dickens_a_tale_of_two_cities,1,0,0,0,0,2,0,0,0,0,...,1,10,3,1,0,2,0,1,0,0


# Vocabulario (número de columnas de la matriz)

In [7]:
# Report the vocabulary size (the number of columns of the count matrix)
# followed by the vocabulary itself.
n_terminos = len(vocabulario)
print(n_terminos, vocabulario)

15164 ['000' '00033' '04' ... 'zip' 'zone' 'zounds']


In [14]:
# Persist the vocabulary as plain text, writing each term with the '%s'
# string format.
# NOTE(review): no `encoding` is passed, so NumPy's default output encoding
# applies - confirm it is suitable if the vocabulary can hold non-ASCII terms.
np.savetxt(fname='data/vocabulario.txt', X=vocabulario, fmt='%s')

In [16]:
# Print the document labels; their order matches the rows of the corpus and
# of the count matrix built from it.
print(etiquetas)

['shakespeare_the_merchant_of_venice', 'shakespeare_romeo_juliet', 'shakespeare_hamlet', 'dickens_a_christmas_carol', 'dickens_oliver_twist', 'dickens_a_tale_of_two_cities']
