In [1]:
from google.colab import drive
drive.mount('/content/drive/')
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/U3 :: PLN')

Mounted at /content/drive/


In [14]:
# A review example on how 'CountVectorizer' works

from sklearn.feature_extraction.text import CountVectorizer

# list of text documents
text = [ "The quick brown fox jumped over the lazy dog." ]

# create the vector object
vectorizer = CountVectorizer()

# tokenize and build the dictionary
vectorizer.fit(text)
print(vectorizer.vocabulary_)

# encode document
vector = vectorizer.transform(text)

# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

# test text
test = ["The lazy dog is sleeping."]
test_vectorized = vectorizer.transform(test)

# convert a 2-dimension array into 1-dimensional array
import numpy as np
test_vectorized_flatten = list(np.ravel(test_vectorized.toarray()))

# print the frequency of words in the test text
print("\nFrequency of words in the test text:")
for word, count in zip(vectorizer.get_feature_names_out(), test_vectorized_flatten):
   print("Word: {0:15} Count: {1:1}".format(word, count))



{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]

Frequency of words in the test text:
Word: brown           Count: 0
Word: dog             Count: 1
Word: fox             Count: 0
Word: jumped          Count: 0
Word: lazy            Count: 1
Word: over            Count: 0
Word: quick           Count: 0
Word: the             Count: 1


**1) Preparación del saco de palabras**

In [117]:
# 3. Importamos el vectorizador y el clasificador
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
 
# 2. Importamos los textos
from goldman_emma_raw import *
from henson_mathew_raw import *
from wu_tingfang_raw import *

friends_docs = goldman_docs + henson_docs + wu_docs

# 4. Texto de la carta
mystery_postcard = """
My friend,
From the 10th of July to the 13th, a fierce storm raged, clouds of
freeing spray broke over the ship, encasing her in a coat of icy mail,
and the tempest forced all of the ice out of the lower end of the
channel and beyond as far as the eye could see, but the _Roosevelt_
still remained surrounded by ice. There is little use for money up here, 
and the place is seldom visited.
Hope to see you soon. 
"""

# 5. Definimos el vectorizador
bow_vectorizer = CountVectorizer()

# 6. Definimos el vector BoW para el corpus
friends_vectors = bow_vectorizer.fit_transform(friends_docs)

# 7. Imprimimos diferentes aspectos
print(f'Dictionary: {bow_vectorizer.vocabulary_}')
print(f'Only words: {bow_vectorizer.get_feature_names_out()}')
print(f'\nMatrix shape of the corpus: {friends_vectors.shape}')
print(f'\nN-dimension vector of the corpus:\n {friends_vectors.toarray()}')
print(f'\n1-dimension vector for row 0 in the corpus:\n {friends_vectors.toarray().tolist()[0]}')

Dictionary: {'the': 3004, 'history': 1469, 'of': 2103, 'human': 1500, 'growth': 1385, 'and': 204, 'development': 862, 'is': 1653, 'at': 285, 'same': 2619, 'time': 3048, 'terrible': 2996, 'struggle': 2898, 'every': 1095, 'new': 2046, 'idea': 1516, 'heralding': 1453, 'approach': 234, 'brighter': 441, 'dawn': 783, 'in': 1542, 'its': 1660, 'tenacious': 2991, 'hold': 1473, 'on': 2115, 'tradition': 3075, 'old': 2113, 'has': 1415, 'never': 2044, 'hesitated': 1459, 'to': 3052, 'make': 1848, 'use': 3183, 'foulest': 1261, 'cruelest': 756, 'means': 1895, 'stay': 2860, 'advent': 138, 'whatever': 3281, 'form': 1250, 'or': 2132, 'period': 2223, 'latter': 1741, 'may': 1889, 'have': 1418, 'asserted': 274, 'itself': 1661, 'nor': 2059, 'need': 2033, 'we': 3263, 'retrace': 2550, 'our': 2150, 'steps': 2865, 'into': 1641, 'distant': 913, 'past': 2198, 'realize': 2445, 'enormity': 1051, 'opposition': 2127, 'difficulties': 876, 'hardships': 1407, 'placed': 2259, 'path': 2199, 'progressive': 2362, 'rack': 241

In [107]:
# 8. Mostramos las palabras del diccionario para la primera frase en el corpus con alguna ocurrencia (es decir, se prescinde de los términos (columnas) con valor 0)

print("\nFirst paragraph:\n" + friends_docs[0] + "\n")

friends_matrix_0 = pd.DataFrame(data=friends_vectors[0].toarray(), columns=bow_vectorizer.get_feature_names_out())
friends_matrix_0.loc[:, friends_matrix_0.any()]


First paragraph:

The history of human growth and development is at the same time the
history of the terrible struggle of every new idea heralding the
approach of a brighter dawn



Unnamed: 0,and,approach,at,brighter,dawn,development,every,growth,heralding,history,human,idea,is,new,of,same,struggle,terrible,the,time
0,1,1,1,1,1,1,1,1,1,2,1,1,1,1,4,1,1,1,5,1


**2) Identificación del remitente de la carta**

In [118]:
# 9. Definimos el vector BoW para el texto de la carta
mystery_vector = bow_vectorizer.transform(mystery_postcard.split("."))

# 10. Mostramos el BoW para el texto de la carta
mpm = pd.DataFrame(data=mystery_vector.toarray(), columns=bow_vectorizer.get_feature_names_out())

mystery_vector_matrix = mpm.loc[:, mpm.any()]
mystery_vector_matrix

Unnamed: 0,_roosevelt_,all,and,as,beyond,but,by,channel,could,end,...,soon,still,storm,the,there,to,up,use,visited,you
0,1,1,2,2,1,1,1,1,1,1,...,0,1,1,9,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
# 11. Creamos la variable "friends_classifier"
friends_classifier = MultinomialNB()

# 12. Etiquetamos los mensajes numéricamente para posibilitar el trabajo del clasificador
friends_labels = [1] * len(goldman_docs) + [2] * len(henson_docs) + [3] * len(wu_docs)

# 13. Entrenamos el clasificador
friends_classifier.fit(friends_vectors, friends_labels)
 
# 14-15. Hacemos la predicción
predictions = friends_classifier.predict(mystery_vector)

## Cuantas más líneas hayan asociadas a un amigo más probable es que el texto desconocido sea suyo
from collections import Counter
mystery_friend = Counter(predictions)

print("\nThe author of the postcard was from {}!\n".format(mystery_friend.most_common(1)[0][0]))


The author of the postcard was from 2!



In [139]:
# 16. Probabilidades de cada remitente para cada una de las frases
# https://dev.to/rajat_naegi/simply-explained-predictproba-263i

probs = friends_classifier.predict_proba(mystery_vector).tolist()

friends = predictions.tolist()
probs_max = [ max(prob) for prob in probs ]
friends_probs = list(zip(friends, probs_max))
print(f'Probabilities of each sentence to belong a to friend: {friends_probs}')

Probabilities of each sentence to belong a to friend: [(2, 0.9950505662612387), (2, 0.9657989978090034), (1, 0.6908844349859337), (3, 0.3616557734204794)]
