# Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [329]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

### Carga de datos

In [330]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

### Vectorización

In [331]:
# instanciamos un vectorizador
# ver diferentes parámetros de instanciación en la documentación de sklearn
tfidfvect = TfidfVectorizer()

In [332]:
# en el atributo `data` accedemos al texto
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [333]:
# con la interfaz habitual de sklearn podemos fitear el vectorizador
# (obtener el vocabulario y calcular el vector IDF)
# y transformar directamente los datos
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` la podemos denominar como la matriz documento-término

In [334]:
# recordar que las vectorizaciones por conteos son esparsas
# por ello sklearn convenientemente devuelve los vectores de documentos
# como matrices esparsas
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'cantidad de documentos: {X_train.shape[0]}')
print(f'tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
cantidad de documentos: 11314
tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [335]:
# una vez fiteado el vectorizador, podemos acceder a atributos como el vocabulario
# aprendido. Es un diccionario que va de términos a índices.
# El índice es la posición en el vector de documento.
tfidfvect.vocabulary_['car']

25775

In [336]:
# es muy útil tener el diccionario opuesto que va de índices a términos
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}
idx2word[25775]

'car'

In [337]:
# en `y_train` guardamos los targets que son enteros
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [338]:
# hay 20 clases correspondientes a los 20 grupos de noticias
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Similaridad de documentos

In [339]:
# Veamos similaridad de documentos. Tomemos algún documento
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [340]:
# midamos la similaridad coseno con todos los documentos de train
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [341]:
# podemos ver los valores de similaridad ordenados de mayor a menor
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [342]:
# y a qué documentos corresponden
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [343]:
# los 5 documentos más similares:
mostsim = np.argsort(cossim)[::-1][1:6]

In [344]:
# el documento original pertenece a la clase:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [345]:
# y los 5 más similares son de las clases:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [346]:
# es muy fácil instanciar un modelo de clasificación Naïve Bayes y entrenarlo con sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [347]:
# con nuestro vectorizador ya fiteado en train, vectorizamos los textos
# del conjunto de test
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [348]:
# el F1-score es una metrica adecuada para reportar desempeño de modelos de claificación
# es robusta al desbalance de clases. El promediado 'macro' es el promedio de los
# F1-score de cada clase. El promedio 'micro' es equivalente a la accuracy que no
# es una buena métrica cuando los datasets son desbalanceados
f1 = f1_score(y_test, y_pred, average='macro')
f1

0.5854345727938506

## Consigna del desafío 1

### **1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos. Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido la similaridad según el contenido del texto y la etiqueta de clasificación.

Como se van a tomar documentos al azar, se quiere tener una reproducibilidad en las pruebas, por ello se va a usar un seed elegido arbitrariamente y se mantendrá en todo el trabajo.

In [349]:
# Se importa random
import random

In [350]:
# Se le da un seed arbitrario y se obtienen los índices obtenidos al azar
seed = 10
np.random.seed(seed)
random.seed(seed)
random_idx = []
for i in range(0,5):
    random_idx.append(random.randint(0,11314))
print(random_idx)

[9361, 533, 7026, 7906, 9471]


Una vez obtenidos los índice se hace el análsis para cada uno de ellos

#### Índice 9361

##### 1 - Se mide la similaridad con el resto de los documentos

In [351]:
# Se mide la similaridad coseno con todos los documentos de train
cossim0 = cosine_similarity(X_train[random_idx[0]], X_train)[0]
# Se guardan los valores de similaridad ordenados de mayor a menor
mostsim0 = np.argsort(cossim0)[::-1][1:6]

##### 2 - Se observa la similaridad con los demás documentos y a qué índice pertenecen estos.

In [352]:
# Se muestran los valores de similaridad obtenidos
print(np.sort(cossim0)[::-1][1:6])
# Se muestra a qué documentos corresponden
print(np.argsort(cossim0)[::-1][1:6])

[0.21649189 0.20618781 0.2029193  0.17855957 0.17843009]
[ 6201  8467 11137 10106  5989]


##### 3 - Se obtiene la clase del índice y también la de los de mayor similaridad.

In [353]:
# Se imprime la clase del documento
print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[0]]])

# Se imprime la clase de los 5 más similares
print("Los demás documentos pertenecen a las clases: ")
for i in mostsim0:
  print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  alt.atheism
Los demás documentos pertenecen a las clases: 
talk.religion.misc
rec.motorcycles
rec.motorcycles
soc.religion.christian
sci.crypt


##### 4 - Se observa el contenido del documento.

In [354]:
# Se imprime el documento del cuál se obtuve el índice al azar
print(newsgroups_train.data[random_idx[0]])


: 	Nice cop out bill.

I'm sure you're right, but I have no idea to what you refer. Would you
mind explaining how I copped out?


Se trata de un proyecto de ley.

In [355]:
print(newsgroups_train.data[mostsim0[0]])




You might be sure, but you would also be wrong.



Se trata de una frase con tendencia religiosa.

In [356]:
print(newsgroups_train.data[mostsim0[1]])


1) The next time you get stoped by a cop, never never never admit to anything.

2) Don't volunteer any information.

3) When a retoracle question is ask by the cop, like "...it <looked> like you were going kinda fast coming down highway 12.  You <must have> been going at least 70 or 75?" -- the correct reponse is to deny it. This technique is employed by police to help establish guilt, especially when (9 times out of 10) he/she is not sure who was doing the speeding. If the cop is unsure this may be the difference of him letting you off the hook or getting the tissue.

Hope this helps for next time.


Se trata de como evadir una multa de velocidad.

In [357]:
print(newsgroups_train.data[mostsim0[2]])


Right on, it is every citizen's right and duty to FORCE government
accountability.

(anecdotes deleted)


Also keep in mind that cops will LIE in court to get their way! (don't get
me started by asking how I know ;) If you decide to fight you have to be ready
for this as well as devise strategy to make the cop's story doubtful in the
judge/jury's mind.


Se trata de que dice que los policías mentirán para salirse con la suya y debes tenerlo en cuenta para idea una estrategia para desmentirlo.

In [358]:
print(newsgroups_train.data[mostsim0[3]])

[In looking through my files this weekend, I ran across some lyrics from
various rock groups that have content.  Here are two from Black Sabbath's
"Master of Reality".  I'll say this much for the music of the '60's and early
'70's, at least they asked questions of significance.  Jethro Tull is another
to asked and wrote about things that caused one to wonder. --Rex] 

AFTER FOREVER

Have you ever thought about your soul--
     can it be saved?
Or perhaps you think that when you're dead
     you just stay in you grave.
Is God just a thought within you read in a book
     when you were at school?
When you think about death 
     Do you lose your breath
     Or do you keep your cool?

Would you like to see the Pope on the end of a rope?
Do you think he's a fool?
Well I have seen the truth.  Yes I have seen the light
     and I've changed my ways.
And I'll be prepared 
     When you're lonely and scared
     at the end of your days.

Could it be you're afraid of what your friends might say

Se trata de una letra de una canción de Black Sabbath.

In [359]:
print(newsgroups_train.data[mostsim0[4]])



Could you expand on this? I have a feeling you're right, but I don't quite
understand.


Se pide que se amplié un tema que no se termina de entender.

Se observa que se tuvo una baja similaridad, por lo que las clases del documento obtenido al azar no se relaciona con las demás clases. Algunas palabras claves que se han observado en los documentos son cop. Esto se debe también a que el documento es realtivamente corto, por lo que no es muy representativo de la clase.

#### Índice 533

 Habla de un spinoff de ciencia ficción.

##### 1 - Se mide la similaridad con el resto de los documentos

In [360]:
# Se mide la similaridad coseno con todos los documentos de train
cossim1 = cosine_similarity(X_train[random_idx[1]], X_train)[0]
# Se guardan los valores de similaridad ordenados de mayor a menor
mostsim1 = np.argsort(cossim1)[::-1][1:6]

##### 2 - Se observa la similaridad con los demás documentos y a qué índice pertenecen estos.

In [361]:
# Se muestran los valores de similaridad obtenidos
print(np.sort(cossim1)[::-1][1:6])
# Se muestra a qué documentos corresponden
print(np.argsort(cossim1)[::-1][1:6])

[0.48255067 0.35558112 0.3520732  0.33276742 0.31324499]
[ 2061 10855  9934 11198  3285]


##### 3 - Se obtiene la clase del índice y también la de los de mayor similaridad.

In [362]:
# Se imprime la clase del documento
print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[1]]])

# Se imprime la clase de los 5 más similares
print("Los demás documentos pertenecen a las clases: ")
for i in mostsim1:
  print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  sci.space
Los demás documentos pertenecen a las clases: 
sci.space
sci.space
sci.space
sci.space
sci.space


##### 4 - Se observa el contenido del documento.

In [363]:
# Se imprime el documento del cuál se obtuve el índice al azar
print(newsgroups_train.data[random_idx[1]])

From the article "What's New" Apr-16-93 in sci.physics.research:

........
WHAT'S NEW (in my opinion), Friday, 16 April 1993  Washington, DC

1. SPACE BILLBOARDS! IS THIS ONE THE "SPINOFFS" WE WERE PROMISED?
In 1950, science fiction writer Robert Heinlein published "The
Man Who Sold the Moon," which involved a dispute over the sale of
rights to the Moon for use as billboard. NASA has taken the firsteps toward this
 hideous vision of the future.  Observers were
startled this spring when a NASA launch vehicle arrived at the
pad with "SCHWARZENEGGER" painted in huge block letters on the
side of the booster rockets.  Space Marketing Inc. had arranged
for the ad to promote Arnold's latest movie. Now, Space Marketing
is working with University of Colorado and Livermore engineers on
a plan to place a mile-long inflatable billboard in low-earth
orbit.  NASA would provide contractual launch services. However,
since NASA bases its charge on seriously flawed cost estimates
(WN 26 Mar 93) the taxp

Se trata de un spinoff de ciencia ficción.

In [364]:
print(newsgroups_train.data[mostsim1[0]])

;From the article "What's New" Apr-16-93 in sci.physics.research:
;
;........
;WHAT'S NEW (in my opinion), Friday, 16 April 1993  Washington, DC
;
;1. SPACE BILLBOARDS! IS THIS ONE THE "SPINOFFS" WE WERE PROMISED?
;What about light pollution in observations? (I read somewhere else that
;it might even be visible during the day, leave alone at night).
;Is NASA really supporting this junk?
;Are protesting groups being organized in the States?
;Really, really depressed.
;
;             Enzo

I wouldn't worry about it.  There's enough space debris up there that
a mile-long inflatable would probably deflate in some very short
period of time (less than a year) while cleaning up LEO somewhat.
Sort of a giant fly-paper in orbit.

Hmm, that could actually be useful.

As for advertising -- sure, why not?  A NASA friend and I spent one
drunken night figuring out just exactly how much gold mylar we'd need
to put the golden arches of a certain American fast food organization
on the face of the Moon.

Se trata de la primer parte de una spinoff y vuelve a figurar el nombre Enzo. 

In [365]:
print(newsgroups_train.data[mostsim1[1]])

Brian Yamauchi asks: [Regarding orbital billboards...]
  
    Well, I had been collecting data for next edition of the
Commercial Space News/Space Technology Investor... To summarize:
  
SPACE ADVERTISING
    First, advertising on space vehicles is not new -- it is very
common practice to put the cooperating organization's logos on the
space launch vehicle.  For example, the latest GPS launcher had the
(very prominent) logos on its side of
   - McDonnell Douglas (the Delta launcher)
   - Rockwell International (who built the GPS satellite)
   - USAF (who paid for the satellite and launch), and
   - the GPS/Navstar program office
   This has not been considered "paid advertising" but rather
"public relations", since the restrictions have been such that only
organizations involved in the launch could put their logos on the
side, and there was no money exchanged for this.  [However, putting
a 10' high logo on the side of the launch vehicle facing the cameras
is "advertising" as much as it

Vuelve a figurar el término Billboards. Se trata de los logos de las organizaciones que cooperaron en el vehículo espacial.

In [366]:
print(newsgroups_train.data[mostsim1[2]])

Two developments have brought these type of activities back to
the forefront in 1993.  First, in February, the Russians deployed a
20-m reflector from a Progress vehicle after it had departed from
the Mir Space Station.  While this "Banner" reflector was blank,
NPO Energia was very active in reporting that future  Banner
reflectors will be available to advertisers, who could use a space-
based video of their logo or ad printed on the Banner in a TV
commercial, as filmed from the Mir.
   The second development, has been that Space Marketing Inc, the
same company responsible for merchandising space on the Conestoga
booster and COMET spacecraft, is now pushing the "Environmental
Billboard".  As laid out by SMI Chief Engineer Dr Ron Humble of the
University of Colorado Space Laboratory and Preston Carter of the
Lawrence Livermore National Laboratory, the "Environmental
Billboard" is a large inflatable outer support structure of up to
804x1609 meters.  Advertising is carried by a mylar refl

Se trata de dos desarrollos para usar el espacio para publicidad.

In [367]:
print(newsgroups_train.data[mostsim1[3]])

Archive-name: space/controversy
Last-modified: $Date: 93/04/01 14:39:06 $

CONTROVERSIAL QUESTIONS

    These issues periodically come up with much argument and few facts being
    offered. The summaries below attempt to represent the position on which
    much of the net community has settled. Please DON'T bring them up again
    unless there's something truly new to be discussed. The net can't set
    public policy, that's what your representatives are for.


    WHAT HAPPENED TO THE SATURN V PLANS

    Despite a widespread belief to the contrary, the Saturn V blueprints
    have not been lost. They are kept at Marshall Space Flight Center on
    microfilm.

    The problem in re-creating the Saturn V is not finding the drawings, it
    is finding vendors who can supply mid-1960's vintage hardware (like
    guidance system components), and the fact that the launch pads and VAB
    have been converted to Space Shuttle use, so you have no place to launch
    from.

    By the time you 

Se trata que los planos del Saturn V no se ha perdido. Pero el problema es que no hay proveedores.

In [368]:
print(newsgroups_train.data[mostsim1[4]])

COMMERCIAL SPACE NEWS/SPACE TECHNOLOGY INVESTOR NUMBER 22

   This is number twenty-two in an irregular series on commercial 
space activities.  The commentaries included are my thoughts on 
these developments.  

   Sigh... as usual, I've gotten behind in getting this column 
written.  I can only plead the exigency of the current dynamics in 
the space biz.  This column is put together at lunch hour and after 
the house quiets down at night, so data can quickly build up if 
there's a lot of other stuff going on.  I've complied a lot of 
information and happenings since the last column, so I'm going to 
have to work to keep this one down to a readable length.  Have fun! 

CONTENTS:
1- US COMMERCIAL SPACE SALES FLATTEN IN 1993
2- DELTA WINS TWO KEY LAUNCH CONTRACTS
3- COMMERCIAL REMOTE SENSING VENTURE GETS DOC "GO-AHEAD"
4- INVESTMENT FIRM CALLS GD'S SPACE BIZ "STILL A GOOD INVESTMENT" 
5- ARIANE PREDICTS DIP IN LAUNCH DEMAND
6- NTSB INVESTIGATES PEGASUS LAUNCH OVER ABORTED ABORT
7- ANO

La noticia habla sobre actividades espaciales comerciales.

Se observa que se tuvo una mayor similaridad, tanto así que todos los documentos pertenecen a la misma clase.

### Índice 7026

##### 1 - Se mide la similaridad con el resto de los documentos

In [369]:
# Se mide la similaridad coseno con todos los documentos de train
cossim2 = cosine_similarity(X_train[random_idx[2]], X_train)[0]
# Se guardan los valores de similaridad ordenados de mayor a menor
mostsim2 = np.argsort(cossim2)[::-1][1:6]



##### 2 - Se observa la similaridad con los demás documentos y a qué índice pertenecen estos.

In [370]:
# Se muestran los valores de similaridad obtenidos
print(np.sort(cossim2)[::-1][1:6])
# Se muestra a qué documentos corresponden
print(np.argsort(cossim2)[::-1][1:6])

[0. 0. 0. 0. 0.]
[3712 3776 3775 3774 3773]


##### 3 - Se obtiene la clase del índice y también la de los de mayor similaridad.

In [371]:
# Se imprime la clase del documento
print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[2]]])

# Se imprime la clase de los 5 más similares
print("Los demás documentos pertenecen a las clases: ")
for i in mostsim2:
  print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  rec.sport.baseball
Los demás documentos pertenecen a las clases: 
rec.motorcycles
talk.politics.mideast
sci.electronics
soc.religion.christian
talk.religion.misc


##### 4 - Se observa el contenido del documento.

In [372]:
# Se imprime el documento del cuál se obtuve el índice al azar
print(newsgroups_train.data[random_idx[2]])




Se observa que el documento está vacío.

In [373]:
print(newsgroups_train.data[mostsim2[0]])



Again, from my alcohol server's class:
The absolute *most* that eating before drinking can do is slow the absorption
down by 15 minutes.  That gives me time to eat, slam one beer, and ride like
hell to try to make it home in the 10 minutes left after paying, donning 
helmet & gloves, starting bike...




Debido a que el documento del índice está vacío, el vector es perpendicular a este por lo que su similaridad es cero.

In [374]:
print(newsgroups_train.data[mostsim2[1]])

I will try to answer some of Dorin's questions, even though they were
not addressed to me specifically, but I feel that I am a bit concerned
by the thread since I am a Southern Lebanese from a village that is 
often on the receiving end of Israel's bombs.
In the first place the death of three soldiers on a patrol in occupied
Lebanese terrritory is NOT an act of terrorism or murder.  It is 
disingeneous to compare their death to that of athletes in Munich
or any other act of terrorism or mrder.  This exercise is aimed 
solely at diverting the issue and is far from the truth.
It seems to me, Dorin, that, you are so remote and ignorant of the problem
on the ground that your comments can only be charactrized as irrelevant,
and heavily colored by the preconceptions and misinformation.
I will try to paint the most accurate picture I can of
what the situation really is in South Lebanon.



I am.  I was back in my home village this last summer.  For your information
we are PEOPLE, not a bunch 

Debido a que el documento del índice está vacío, el vector es perpendicular a este por lo que su similaridad es cero.

In [375]:
print(newsgroups_train.data[mostsim2[2]])

{Michael Fulbright} said
   "Analog switches/Balanced"
      to <All> on 04-15-93  01:08
 MF> I am trying to build a synchronous demodulator and I've hit a snag.
 MF> In my application I want to be able to change the gain of an
 MF> op amp amplifier from 1 to -1, controlable via a digital input.
 MF> The most obvious way I've come up with is to use analog switches
 MF> to adjust the gain of the op amp. The only analog switch I have
 MF> experience with it the 4066. Unfortunately I want to switch an
 MF> AC signal which goes from about -5V to 5V, and the 4066 is only
 MF> for positive signals.

    How about using a 4053 it has a seperate ground for the
    analog outputs.  It would get you 3 bits.

 MF> Another part which caught my eye was the Analog Devices AD630. This
 MF> is a balanced demodulator which appears to fill exactly the need I
 MF> have. The data sheet was somewhat skimpy on application notes. Could
 MF> someone comment on using this chip for the following application?

 

Debido a que el documento del índice está vacío, el vector es perpendicular a este por lo que su similaridad es cero.

In [376]:
print(newsgroups_train.data[mostsim2[3]])



  In a word, yes.  I don't believe that physical knowledge has a great deal of
impact on the power of God.  In the past, God gave us the ability to create
life through sexual relations.  Now, he is giving us the ability to create life
through in vitro fertilization.  The difference between the two is merely 
cosmetic, and even if we gain the ability to create universes we won't begin to
approach the glory of God.
  The power we are being given is a test, and I am sure that in many cases we
will use our new abilities unwisely.  But, people have been using sexuality
unwisely for millenia and I haven't heard an outcry to abolish it yet!
  No matter how far we extend our dominion over the physical world, we aren't
impinging on God's power.  It's only when we attempt to gain control of the
spiritual world, those things that can't be approached through science and 
logic, that we begin to interfere with God.


Debido a que el documento del índice está vacío, el vector es perpendicular a este por lo que su similaridad es cero.

In [377]:
print(newsgroups_train.data[mostsim2[4]])

After tons of mail, could we move this discussion to alt.religion?
--There are many here among us who feel that life is but a joke. (Bob Dylan)
--"If you were happy every day of your life you wouldn't be a human
being, you'd be a game show host." (taken from the movie "Heathers.")
--Lecture (LEK chur) - process by which the notes of the professor
become the notes of the student without passing through the minds of
either.



Debido a que el documento del índice está vacío, el vector es perpendicular a este por lo que su similaridad es cero.

Todos los documentos tienen similaridad 0 con el documento del índice debido a que este está vacío.

### Índice 7906

##### 1 - Se mide la similaridad con el resto de los documentos

In [378]:
# Se mide la similaridad coseno con todos los documentos de train
cossim3 = cosine_similarity(X_train[random_idx[3]], X_train)[0]
# Se guardan los valores de similaridad ordenados de mayor a menor
mostsim3 = np.argsort(cossim3)[::-1][1:6]

##### 2 - Se observa la similaridad con los demás documentos y a qué índice pertenecen estos.

In [379]:
# Se muestran los valores de similaridad obtenidos
print(np.sort(cossim3)[::-1][1:6])
# Se muestra a qué documentos corresponden
print(np.argsort(cossim3)[::-1][1:6])

[0.32883652 0.25440234 0.21822036 0.18132624 0.178329  ]
[10994 10542 10435  4055  7177]


##### 3 - Se obtiene la clase del índice y también la de los de mayor similaridad.

In [380]:
# Se imprime la clase del documento
print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[3]]])

# Se imprime la clase de los 5 más similares
print("Los demás documentos pertenecen a las clases: ")
for i in mostsim3:
  print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  comp.sys.mac.hardware
Los demás documentos pertenecen a las clases: 
comp.sys.mac.hardware
soc.religion.christian
comp.sys.mac.hardware
comp.os.ms-windows.misc
sci.space


##### 4 - Se observa el contenido del documento.

In [381]:
# Se imprime el documento del cuál se obtuve el índice al azar
print(newsgroups_train.data[random_idx[3]])

What is hardware handshaking and when do I want to use it? Dan




Habla de un protocolo de enlace de hardware.

In [382]:
print(newsgroups_train.data[mostsim3[0]])


..



Hmmm... Sounds vaguely similar to a problem I had a long time ago when I was  
trying to use Kermit.  I was building a serial connection between my Duo 210  
and my NeXT.  I think the problem was in the handshaking.  Basically, you need  
to make sure that the handshaking protocol is the same on both sides.  A safe  
place to start is by selecting NO handshaking on either end.  One problem is  
that the Zilog serial chip seems to get permanently wedged if you talk to it  
wrong, and only a reset will clear it.  I don't know the specifics.  But this  
could be a nonlinearity that screws up your attempts at debugging the system.   
It could very well be that you are doing things right--eventually-- but one  
wrong move (like trying a bad handshaking protocol) can screw up any further  
correct actions, until the next machine reset.  I have wedged my Mac and also  
my NeXT that way.

Now I can send files back and forth between the Duo and the NeXT without any  
problem, and at pret

Se trata de una conexión serial y menciona el problema que tuvo en el protocolo de enlace.

In [383]:
print(newsgroups_train.data[mostsim3[1]])


Please, leave heaven out of it.  For his own sake, I pray that Dan does
take it literally because that's how God intended it to be taken.  Dan,
your view of many groups appears correct from my point of view.
However, I have found a group which is truly meeting requirements laid
down by the Bible on what it means to be a disciple of Jesus.  I have no
clue where wwc is, but please mail me.  I'd really like to get you in
touch with them.

[insert deletion of ranting about other religions which obviously has
gone off-center of Dan's original context]

Dan, I'm familiar with this one.  You've got a point, though.  There are
some who don't want to turn over everything and be a disciple, some have
no clue about it because they've not been taught, some have done exactly
that and turned over everything to follow Jesus, some are blocked by
difficult doctrine taught by uncaring Pharisees and teachers of the law.
However, Jesus pointed out what it takes to follow him and to be his
disciple in Luk

Se trata de que discute con Dan sobre la biblia. Se relaciona debido a que usa el mismo nombre, Dan.

In [384]:
print(newsgroups_train.data[mostsim3[2]])

I have tried almost everything under the sun to get a null modem connection
between a Mac Duo 210 and a PC. I have used MacKermit and VersaTerm on
the Mac side. I have used Procomm, Kermit, and Softerm (on OS/2) on
the PC (or PS) side. I have used non-Hardware handshaking and hardware
ahdshaking cables. And know MY hands are shaking from the effort. Nothing
has allowed file transfers from the Mac to the PS.

In general, I can type back and forth with no trouble, but only if both sides
are set to speeds OVER 9600 baud. I cannot send files from the Mac to the
PS at all, and file transfers from the Duo to the PS are not possible.
When I do a straight ascii send, I can send from the PS to the Duo flawlessly.
I can send Binhex files this way quite fast, and I know that the
transmission is error free.
But straight ascii sent from the Mac to the PS is full of errors.
Unless, of course, I do text pacing so slow that it is like human
typing. (well, like 2-3 times faster than human typing).

I w

Se trata de alguien que tiene un problema para establecer una conexión entre Mac Duo 210 y PC, que usó protocolos de enlace que no son de hardware.

In [385]:
print(newsgroups_train.data[mostsim3[3]])

I am getting Garbled output when serial printing thru Windows & works
etc.  This has occurred on several systems and goes if a LaserJet 4 is
used.  I suspect that there is no need for handshaking in this case due
to the capacity (memory/speed) of it.  There is no problem printing from
DOS.  Are there any obvious tweaks I'm missing.  I'm sure its not JUST
me with this problem.  Thanks for reading....  John Atherton









Se trata de que tiene problemas de impresión y que dice que sospecha que no es necesario el handshaking.

In [386]:
print(newsgroups_train.data[mostsim3[4]])

[Lots of trippy stuff deleted]

Wow...  What is this guy smoking and WHERE can I GET SOME?

Dan


Se pregunta Dan que fuma el otro sujeto.

Se observa que se tuvo la misma clases en documentos que mencionaban el handshaking, a excepción de uno pero es de una clase similar. En cambio al figurar el nombre Dan, se obtuvieron clases que poco tenían que ver con la del índice. Esto se debe también a que el documento es corto, por lo que no es muy representativo de la clase.

### Índice 9471

##### 1 - Se mide la similaridad con el resto de los documentos

In [387]:
# Se mide la similaridad coseno con todos los documentos de train
cossim4 = cosine_similarity(X_train[random_idx[4]], X_train)[0]
# Se guardan los valores de similaridad ordenados de mayor a menor
mostsim4 = np.argsort(cossim4)[::-1][1:6]

##### 2 - Se observa la similaridad con los demás documentos y a qué índice pertenecen estos.

In [388]:
# Se muestran los valores de similaridad obtenidos
print(np.sort(cossim4)[::-1][1:6])
# Se muestra a qué documentos corresponden
print(np.argsort(cossim4)[::-1][1:6])

[0.35296676 0.29738724 0.29628482 0.27876958 0.26617217]
[7967 4614 6151  324 7972]


##### 3 - Se obtiene la clase del índice y también la de los de mayor similaridad.

In [389]:
# Se imprime la clase del documento
print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[4]]])

# Se imprime la clase de los 5 más similares
print("Los demás documentos pertenecen a las clases: ")
for i in mostsim4:
  print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  comp.windows.x
Los demás documentos pertenecen a las clases: 
comp.windows.x
comp.windows.x
comp.windows.x
comp.windows.x
comp.windows.x


##### 4 - Se observa el contenido del documento.

In [390]:
# Se imprime el documento del cuál se obtuve el índice al azar
print(newsgroups_train.data[random_idx[4]])


No, it isn't.  It is the "X Window System", or "X11", or "X" or any of
a number of other designations accepted by the X Consortium.  In fact,
doing "man X" on pretty much any X11 machine will tell you:

     The X Consortium requests that the following names  be  used
     when referring to this software:

                                  X
                           X Window System
                             X Version 11
                     X Window System, Version 11
                                 X11

There is no such thing as "X Windows" or "X Window", despite the repeated
misuse of the forms by the trade rags.  This probably tells you something
about how much to trust the trade rags -- if they can't even get the NAME
of the window system right, why should one trust anything else they have 
to say?

With regard to dialup X11 implementations, there are several.  You can
buy serial X11 terminals from a couple of companies, including both 
GraphOn and NCD.  (In fact, I'm compos

Se trata de que no se debe llamar "X Windows" o "X Window" al sistema, sino "X Window System", "X11" o simplemente "X".

In [391]:
print(newsgroups_train.data[mostsim4[0]])

Archive-name: x-faq/part1
Last-modified: 1993/04/04

This article and several following contain the answers to some Frequently Asked 
Questions (FAQ) often seen in comp.windows.x. It is posted to help reduce 
volume in this newsgroup and to provide hard-to-find information of general 
interest.

		Please redistribute this article!

This article includes answers to the following questions, which are loosely
grouped into categories. Questions marked with a + indicate questions new to 
this issue; those with significant changes of content since the last issue are 
marked by !:

  0)  TOPIC: BASIC INFORMATION SOURCES AND DEFINITIONS
  1)! What books and articles on X are good for beginners?
  2)! What courses on X and various X toolkits are available?
  3)! What conferences on X are coming up?
  4)  What X-related public mailing lists are available?
  5)  How can I meet other X developers? 
  6)  What related FAQs are available?
  7)  How do I ask a net-question so as to maximize helpful r

Se trata preguntas y respuestas frecuentes de comp.windows.x.

In [392]:
print(newsgroups_train.data[mostsim4[1]])

Archive-name: x-faq/part5
Last-modified: 1993/04/04

----------------------------------------------------------------------
Subject: 119)  I'm writing a widget and can't use a float as a resource value.

Float resources are not portable; the size of the value may be larger than
the size of an XtPointer. Try using a pointer to a float instead; the Xaw
Scrollbar float resources are handled in this way. 

----------------------------------------------------------------------
Subject: 120)  Is this a memory leak in the X11R4 XtDestroyWidget()?!

Yes. This is the "unofficial" fix-19 for the X11R4 Destroy.c:

*** Destroy.c.1.37	Thu Jul 11 15:41:25 1991
--- lib/Xt/Destroy.c	Thu Jul 11 15:42:23 1991
***************
*** 1,4 ****
--- 1,5 ----
  /* $XConsortium: Destroy.c,v 1.37 90/09/28 10:21:32 swick Exp $ */
+ /* Plus unofficial patches in revisions 1.40 and 1.41 */
  
  /***********************************************************
  Copyright 1987, 1988 by Digital Equipment Corporation, Maynar

Se trata de preguntas y respuestas sobre X Windows System.

In [393]:
print(newsgroups_train.data[mostsim4[2]])

Archive-name: x-faq/part2
Last-modified: 1993/04/04

----------------------------------------------------------------------
Subject:  24)! How do I make a screendump or print my application?

	The xwd client in the X11 distributions can be used to select a window 
or the background. It produces an XWD-format file of the image of that window. 
The file can be post-processed into something useful or printed with the xpr 
client and your local printing mechanism. You can use this command:
		csh% sleep 10; xwd -root > output.xwd &
and then spend 10 seconds or so setting up your screen; the entire current
display will be saved into the file output.xwd. Note that xwd also has an
undocumented (before R5) -id flag for specifying the window id on the 
command-line. [There are also unofficial patches on export to xwd for 
specifying the delay and the portion of the screen to capture.]

	Two publicly-available programs which allow interactive definition of 
arbitrary portions of the display and b

Se trata de una pregunta y respuesta frecuente sobre X Windows System.

In [394]:
print(newsgroups_train.data[mostsim4[3]])

Hi!

I remember reading (or hallucinating) that NCD's PC-Xremote functionality had 
been given, by NCD, to MIT for inclusion in X11R6.  Is this true?  If so,
(set mode/cheap) can I just wait for X11R6 to get compressed serial line
X server support?

Thanks!


Se trata de una funcionalidad para PC-Xremote.

In [395]:
print(newsgroups_train.data[mostsim4[4]])

Archive-name: x-faq/part4
Last-modified: 1993/04/04

----------------------------------------------------------------------
Subject:  80)! Where can I get an X-based plotting program?

These usually are available from uucp sites such as uunet or other sites as
marked; please consult the archie server to find more recent versions.

 gnuplot	X (xplot), PostScript and a bunch of other drivers.
	export.lcs.mit.edu [and elsewhere]:contrib/gnuplot3.1.tar.Z

 gl_plot	X output only [?]
	comp.sources.unix/volume18

 graph+
	yallara.cs.rmit.oz.au:/pub/graph+.tar.Z [131.170.24.42]
	comp.sources.unix/volume8

 pdraw,drawplot		2D and 3D X,PS
	scam.berkeley.edu:/src/local/3dplot.tar.Z [128.32.138.1]
	scam.berkeley.edu:/src/local/contour.tar.Z [128.32.138.1]
	scam.berkeley.edu:/src/local/drawplot.tar.Z [128.32.138.1]
	uunet:~ftp/contrib/drawplot.tar.Z

 xgraph		plot, zoom. Outputs PS or HPGL.
	shambhala.berkeley.edu:/pub/xgraph-11.tar.Z [128.32.132.54]
	sun1.ruf.uni-freiburg.de:X11/contrib/xgraph-11.

Se trata también de una parte de preguntas y respuestas frecuentes sobre X Windows System.

Se observa que se tuvo una buena similaridad y que los 5 documentos más similares coinciden con la clase del documento del índice.

Se puedo comprobar como gracias a la vectorización se pueden encontrar otros documentos de la misma clase que el que tenemos el índice. Se pudo observar la dependencia del documento del cual se quiere conocer la similaridad con el resto, que debe ser representativo de su clase, ya que si es muy corto o tiene algún nombre puede dar valores de similaridad bajos.

### **2**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial y ComplementNB.

In [396]:
# Se crea una lista con todos los cossim anteriormente obtenidos
cossim_vect1 = [cossim0, cossim1, cossim2, cossim3, cossim4]

### Vectorizamos

Se cambian algunos parámetros, entre ellos:
- max_df: quiere decir que cualquier palabra que aparezca en más del X% de los documentos será ignorada.
- min_df: quiere decir que cualquier palabra que aparezca en menos de X documentos será ignorada.
- stop_words: ayuda a eliminar palabras comunes del inglés que no son informativas y que no aportan valor.

Vectorizador 1: es el del ejercicio anterio, el cual usa solo parámetros por defecto.

In [397]:
# Se asigna el vectorizador el ejercicio 1 a otro con el número 1 en su nombre
tfidfvect1 = tfidfvect

Vectorizador 2:
- max_df = 0.95
- min_df = 5
- stop_words = 'english'

In [398]:
# Se instancia el vectorizador cambiando algunos parámetros
tfidfvect2 = TfidfVectorizer(max_df=0.95 ,min_df=5, stop_words='english')

# Se fitea el vectorizador y se tranforman los datos
X_train = tfidfvect2.fit_transform(newsgroups_train.data) #matriz documento-término

# Se obtienen los datos de y_train
y_train = newsgroups_train.target


In [399]:
cossim_vect2 = []
for i in range(0,5):   
    # Se mide la similaridad coseno con todos los documentos de train
    cossim = cosine_similarity(X_train[random_idx[i]], X_train)[0]
    cossim_vect2.append(cossim)
    
    # Se ven los valores de similaridad ordenados de mayor a menor
    mostsim = np.argsort(cossim)[::-1][1:6]
    
    # Se salta si i=2 debido a que estaba vacío ese documento
    if i == 2: continue
    
    # Se imprime la clase del documento
    print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[i]]])

    # Se imprime la clase de los 5 más similares
    print("Los demás documentos pertenecen a las clases: ")
    for i in mostsim:
        print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  alt.atheism
Los demás documentos pertenecen a las clases: 
rec.autos
rec.motorcycles
rec.motorcycles
rec.motorcycles
rec.motorcycles
El documento original pertenece a la clase:  sci.space
Los demás documentos pertenecen a las clases: 
sci.space
sci.space
sci.space
sci.space
sci.space
El documento original pertenece a la clase:  comp.sys.mac.hardware
Los demás documentos pertenecen a las clases: 
comp.sys.mac.hardware
sci.electronics
comp.sys.mac.hardware
soc.religion.christian
comp.os.ms-windows.misc
El documento original pertenece a la clase:  comp.windows.x
Los demás documentos pertenecen a las clases: 
comp.windows.x
comp.windows.x
comp.windows.x
comp.os.ms-windows.misc
comp.windows.x


Vectorizador 3:
- max_df = 0.75
- min_df = 3

In [400]:
# Se instancia el vectorizador cambiando algunos parámetros
tfidfvect3 = TfidfVectorizer(max_df=0.75 ,min_df=3)
vectorizadores = [tfidfvect , tfidfvect2, tfidfvect3]
# Se fitea el vectorizador y se tranforman los datos
X_train = tfidfvect3.fit_transform(newsgroups_train.data) #matriz documento-término

# Se obtienen los datos de y_train
y_train = newsgroups_train.target


In [401]:
cossim_vect3 = []
for i in range(0,5):   
    # Se mide la similaridad coseno con todos los documentos de train
    cossim = cosine_similarity(X_train[random_idx[i]], X_train)[0]
    cossim_vect3.append(cossim)
    
    # Se ven los valores de similaridad ordenados de mayor a menor
    mostsim = np.argsort(cossim)[::-1][1:6]
    
    # Se salta si i=2 debido a que estaba vacío ese documento
    if i == 2: continue
    
    # Se imprime la clase del documento
    print("El documento original pertenece a la clase: ", newsgroups_train.target_names[y_train[random_idx[i]]])

    # Se imprime la clase de los 5 más similares
    print("Los demás documentos pertenecen a las clases: ")
    for i in mostsim:
        print(newsgroups_train.target_names[y_train[i]])

El documento original pertenece a la clase:  alt.atheism
Los demás documentos pertenecen a las clases: 
alt.atheism
rec.motorcycles
talk.religion.misc
rec.motorcycles
talk.politics.guns
El documento original pertenece a la clase:  sci.space
Los demás documentos pertenecen a las clases: 
sci.space
sci.space
sci.space
sci.space
sci.space
El documento original pertenece a la clase:  comp.sys.mac.hardware
Los demás documentos pertenecen a las clases: 
comp.sys.mac.hardware
soc.religion.christian
comp.sys.mac.hardware
sci.space
sci.electronics
El documento original pertenece a la clase:  comp.windows.x
Los demás documentos pertenecen a las clases: 
comp.windows.x
comp.windows.x
comp.windows.x
comp.windows.x
comp.windows.x


Se realiza la comparación de los valores de similaridad coseno obtenidos según el vectorizador usado.

In [402]:
# Se imprime la comparación de los vectorizadores
for i in range(0,5): 
    if i == 2: continue
    print("Mayores valores de cossim para el índice ",random_idx[i]," del vectorizador 1 son:", np.sort(cossim_vect1[i])[::-1][1:6])  
    print("Mayores valores de cossim para el índice ",random_idx[i]," del vectorizador 2 son:", np.sort(cossim_vect2[i])[::-1][1:6]) 
    print("Mayores valores de cossim para el índice ",random_idx[i]," del vectorizador 3 son:", np.sort(cossim_vect3[i])[::-1][1:6],"\n") 

Mayores valores de cossim para el índice  9361  del vectorizador 1 son: [0.21649189 0.20618781 0.2029193  0.17855957 0.17843009]
Mayores valores de cossim para el índice  9361  del vectorizador 2 son: [0.28308214 0.2713733  0.23773481 0.2318903  0.20373597]
Mayores valores de cossim para el índice  9361  del vectorizador 3 son: [0.26402646 0.2514816  0.24529833 0.23044104 0.21103692] 

Mayores valores de cossim para el índice  533  del vectorizador 1 son: [0.48255067 0.35558112 0.3520732  0.33276742 0.31324499]
Mayores valores de cossim para el índice  533  del vectorizador 2 son: [0.48097075 0.28943091 0.2849633  0.27137833 0.23707095]
Mayores valores de cossim para el índice  533  del vectorizador 3 son: [0.47245711 0.32003297 0.30900468 0.28793802 0.28433621] 

Mayores valores de cossim para el índice  7906  del vectorizador 1 son: [0.32883652 0.25440234 0.21822036 0.18132624 0.178329  ]
Mayores valores de cossim para el índice  7906  del vectorizador 2 son: [0.38773484 0.21389348 0

Se observa que al modificar los parámetros de los vectorizadores se incrementan los valores de similaridad obtenidos para los índices 9361 y 7906. Mientras que en 533 se mantuvieron similares pero inferiores y en el caso de 9471 son inferiores. Cabe destacar que no se mostró el índice 7026 debido a que este está vacío y da un valor de similaridad de cero con los demás documentos.

Se van a evaluar los dos modelos cambiando el valor del parámetros alpha, donde alpha por defecto es igual a 1. Este parámetros es el coeficiente de suavizado de Laplace (o suavizado de Lidstone), y es usado para evitar probabilidades cero en los cálculos de probabilidad de características.

In [403]:
alphas = [0.1, 0.25, 0.5, 0.75, 1.0]

#### MultinomialNB

In [404]:
best_f1_MNB = 0
best_vect_MNB = None
for vect in vectorizadores:
    print("Con el vectorizador ", vect)
    # Se toma el X_train de cada vectorizador 
    X_train = vect.fit_transform(newsgroups_train.data) #matriz documento-término

    # Se obtienen los datos de y_train
    y_train = newsgroups_train.target

    for i in alphas:
        # Se instancia el modelo de clasificación Naïve Bayes
        clf = MultinomialNB(alpha=i)

        # Se entrena con sklearn
        clf.fit(X_train, y_train)

        # Con el vectorizador ya fiteado en train, se vectorizan los textos del conjunto de test
        X_test = vect.transform(newsgroups_test.data)

        # Se obtienen los datos de y_test
        y_test = newsgroups_test.target

        # Se realiza la predicción
        y_pred =  clf.predict(X_test)

        # Se calcula el F1-score
        f1_MNB = f1_score(y_test, y_pred, average='macro')

        # Se muestra el valor de F1-score
        print(" F1-score con MultinomialNB usando un alpha de ", i ," es ", f1_MNB)

        if best_f1_MNB < f1_MNB:
            best_f1_MNB = f1_MNB
            best_vect_MNB = vect
    print("")

Con el vectorizador  TfidfVectorizer()
 F1-score con MultinomialNB usando un alpha de  0.1  es  0.6564514103512165
 F1-score con MultinomialNB usando un alpha de  0.25  es  0.6364435869212646
 F1-score con MultinomialNB usando un alpha de  0.5  es  0.615341523969213
 F1-score con MultinomialNB usando un alpha de  0.75  es  0.5992924448379469
 F1-score con MultinomialNB usando un alpha de  1.0  es  0.5854345727938506

Con el vectorizador  TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
 F1-score con MultinomialNB usando un alpha de  0.1  es  0.6772332285106473
 F1-score con MultinomialNB usando un alpha de  0.25  es  0.6699565732134729
 F1-score con MultinomialNB usando un alpha de  0.5  es  0.6611197689233894
 F1-score con MultinomialNB usando un alpha de  0.75  es  0.6556786411458713
 F1-score con MultinomialNB usando un alpha de  1.0  es  0.6503653727588931

Con el vectorizador  TfidfVectorizer(max_df=0.75, min_df=3)
 F1-score con MultinomialNB usando un alpha de  0.1  e

#### ComplementNB

In [405]:
best_f1_CNB = 0
best_vect_CNB = None
for vect in vectorizadores:
    print("Con el vectorizador ", vect)
    # Se toma el X_train de cada vectorizador 
    X_train = vect.fit_transform(newsgroups_train.data) #matriz documento-término

    # Se obtienen los datos de y_train
    y_train = newsgroups_train.target
    
    for i in alphas:
        # Se instancia el modelo de clasificación Naïve Bayes
        clf = ComplementNB(alpha=i)

        # Se entrena con sklearn
        clf.fit(X_train, y_train)

        # Con el vectorizador ya fiteado en train, se vectorizan los textos del conjunto de test
        X_test = vect.transform(newsgroups_test.data)

        # Se obtienen los datos de y_test
        y_test = newsgroups_test.target

        # Se realiza la predicción
        y_pred =  clf.predict(X_test)

        # Se calcula el F1-score
        f1_CNB = f1_score(y_test, y_pred, average='macro')

        # Se muestra el valor de F1-score
        print(" F1-score con ComplementNB usando un alpha de ", i ," es ", f1_CNB)

        if best_f1_CNB < f1_CNB:
            best_f1_CNB = f1_CNB
            best_vect_CNB = vect
    print("")

Con el vectorizador  TfidfVectorizer()
 F1-score con ComplementNB usando un alpha de  0.1  es  0.6953652590540836
 F1-score con ComplementNB usando un alpha de  0.25  es  0.7000110030082339
 F1-score con ComplementNB usando un alpha de  0.5  es  0.6961156947315815
 F1-score con ComplementNB usando un alpha de  0.75  es  0.6942657808599624
 F1-score con ComplementNB usando un alpha de  1.0  es  0.692953349950875

Con el vectorizador  TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
 F1-score con ComplementNB usando un alpha de  0.1  es  0.674494033890231
 F1-score con ComplementNB usando un alpha de  0.25  es  0.679699282026329
 F1-score con ComplementNB usando un alpha de  0.5  es  0.6826719926096569
 F1-score con ComplementNB usando un alpha de  0.75  es  0.6826053016921679
 F1-score con ComplementNB usando un alpha de  1.0  es  0.6822855200322546

Con el vectorizador  TfidfVectorizer(max_df=0.75, min_df=3)
 F1-score con ComplementNB usando un alpha de  0.1  es  0.68413777

Se comparan los mejores resultados obtenidos para MultinomialNB y ComplementNB.

In [406]:
print("El mejor valor de F1-score obtenido para MultinomialNB es de: ", best_f1_MNB)
print("El mejor valor de F1-score obtenido para ComplementNB es de: ", best_f1_CNB)

best_vect = None
if best_f1_MNB >= best_f1_CNB:
    best_vect = best_vect_MNB
else:
    best_vect = best_vect_CNB

El mejor valor de F1-score obtenido para MultinomialNB es de:  0.6772332285106473
El mejor valor de F1-score obtenido para ComplementNB es de:  0.7000110030082339


Se puede observar que el valor obtenido con ComplementNB es superior al de MultinomialNB, lo cual era de esperarse ya que el dataset tiene las clases desbalanceadas, por lo que ComplementNB funciona un poco mejor.


### **3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz término-documento que puede ser interpretada como una colección de vectorización de palabras. Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares.

In [407]:
# Se obtiene la matriz documento-término
X_train = best_vect.fit_transform(newsgroups_train.data) 

# Se transpone la matriz documento-término para tener la matriz término-documento
termino_documento = np.transpose(X_train)

Del ejercicio 1, se selecciononaron palabras del documento 533 para realizar este ejercicio, estas son:
- science
- moon
- ozone
- movie
- space

In [408]:
selected_words = ['science','moon','ozone','orbit','sold']

In [409]:
for j in range(0,5):
    # Se obtiene el índice del término
    vocabulary_index = best_vect.vocabulary_[selected_words[j]]

    # Se mide la similaridad coseno con todos los términos de train
    cossim = cosine_similarity(termino_documento[vocabulary_index],termino_documento)[0]

    # Se obtienen los índices de las palabras más similares
    mostsim_index = np.argsort(cossim)[::-1][1:6]

    # Se obtienen las palabras más similares y las similitudes
    mostsim = [(tfidfvect1.get_feature_names_out()[i], cossim[i]) for i in mostsim_index]

    # Se muestran las palabras más similares
    print("Palabras más similares a ", selected_words[j],":")
    for word, similarity in mostsim:
        print(f"  {word}: {similarity}")


Palabras más similares a  science :
  behaviorists: 0.39476373764211264
  cognitivists: 0.39476373764211264
  scientific: 0.3465905773254702
  empirical: 0.2810294376187423
  sects: 0.26217718199355267
Palabras más similares a  moon :
  lunar: 0.3358398549048778
  phases: 0.32230348293137306
  atraction: 0.25973081883685933
  sattellite: 0.25973081883685933
  gravitacional: 0.25973081883685933
Palabras más similares a  ozone :
  uars: 0.47835553575467377
  5011: 0.4591281013093219
  depletion: 0.45324208748696165
  stratosphere: 0.41667472756630386
  correlative: 0.40088523705950924
Palabras más similares a  orbit :
  hiten: 0.4607357177689473
  hagoromo: 0.3671740079209039
  lunar: 0.35166421155955674
  subsatellite: 0.3413063621867869
  flybys: 0.33568964310492555
Palabras más similares a  sold :
  thas: 0.30820935132616806
  discouragement: 0.19922249438764306
  inquiries: 0.19656848946582606
  remains: 0.19496634364489104
  depreciated: 0.19201926911046568


Se observa que de las 5 palabras elegidas, cada una tuvo 5 palabras con alta similitud que se relacionaban con estas, comprobando una vez más la utilidad de la similitud coseno.