In [1]:
import sys
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Cargar el conjunto de datos
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Crear un DataFrame
data = {'text': newsgroups.data, 'target': newsgroups.target}
df = pd.DataFrame(data)
target_names = [newsgroups.target_names[i] for i in newsgroups.target]
df['category'] = target_names

In [3]:
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.space', 'comp.graphics']
dff =  df[df["category"].isin(categories)]


In [4]:
dff

Unnamed: 0,text,target,category
9,From: arromdee@jyusenkyou.cs.jhu.edu (Ken Arro...,19,talk.religion.misc
10,From: sandvik@newton.apple.com (Kent Sandvik)\...,19,talk.religion.misc
12,From: Thyagi@cup.portal.com (Thyagi Morgoth Na...,19,talk.religion.misc
22,From: ruocco@ghost.dsi.unimi.it (sergio ruocco...,1,comp.graphics
25,From: prb@access.digex.com (Pat)\nSubject: Con...,14,sci.space
...,...,...,...
18816,From: kilroy@gboro.rowan.edu (Dr Nancy's Sweet...,15,soc.religion.christian
18821,From: J.Hale@latrobe.edu.au\nSubject: Re: Can ...,15,soc.religion.christian
18832,From: aa888@freenet.carleton.ca (Mark Baker)\n...,15,soc.religion.christian
18839,From: noye@midway.uchicago.edu (vera shanti no...,15,soc.religion.christian


In [5]:
#dff.drop(["target"], inplace=True) #Me quedo solo con category porque es NB
#Se debe vectorizar antes porque los features son dependientes de las palabras
vectorizer = TfidfVectorizer()
vdff = vectorizer.fit_transform(dff['text'])
X_train, X_test, y_train, y_test = train_test_split(vdff, dff['category'], test_size=0.2, random_state=0)


In [6]:
model = MultinomialNB()
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)



0.8535564853556485

#### Para ejecutar todo el proceso de vectorizacion y probar con frases aleatorias me ayudo con un pipeline

In [7]:
# Cargar el conjunto de datos
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Crear un DataFrame
data = {'text': newsgroups.data, 'target': newsgroups.target}
df = pd.DataFrame(data)
target_names = [newsgroups.target_names[i] for i in newsgroups.target]
df['category'] = target_names

In [8]:
df['category'].unique() 

array(['rec.sport.hockey', 'comp.sys.ibm.pc.hardware',
       'talk.politics.mideast', 'comp.sys.mac.hardware',
       'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med',
       'alt.atheism', 'rec.motorcycles', 'rec.autos', 'comp.windows.x',
       'comp.graphics', 'sci.space', 'talk.politics.guns', 'misc.forsale',
       'rec.sport.baseball', 'talk.politics.misc',
       'comp.os.ms-windows.misc', 'soc.religion.christian'], dtype=object)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df["text"], df['category'], test_size=0.2, random_state=0)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

In [10]:
model.predict(["My wedding was great"])

array(['soc.religion.christian'], dtype='<U24')

In [11]:
model.predict(["The apollo mission fail"])

array(['sci.space'], dtype='<U24')

In [12]:
model.predict(["Maradona was the best soccer player ever"])

array(['rec.sport.hockey'], dtype='<U24')

In [13]:
model.predict(["portable computers are a really expensive thing"])

array(['comp.sys.mac.hardware'], dtype='<U24')

:(

In [14]:
model.predict(["I don't have a favourite car trademark"])

array(['rec.autos'], dtype='<U24')

In [15]:
model.predict(["Safe devices is a very import thing today"]) 

array(['sci.crypt'], dtype='<U24')