In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Download the NLTK resources this notebook needs.
# 'punkt' / 'punkt_tab' back word_tokenize: recent NLTK releases look up the
# 'punkt_tab' tables instead of the older pickled 'punkt' models, so both are
# fetched to keep the notebook working across NLTK versions.
# 'stopwords' provides the English stop-word list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RODRIGO_CRUZ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RODRIGO_CRUZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the dataset and give its columns fixed names.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
# NOTE(review): read_csv is called without header/names arguments; confirm the
# CSV's first row really is a header and not a data record.
CSV_PATH = r'C:\Users\RODRIGO_CRUZ\Documents\Glish\Maestria\Tercer Cuatrimestre\Analitica de datos Masivos\Tareas\Unidad 3\Tarea 2 Análisis de Texto\newsCorpora-trimmed.csv'
data = pd.read_csv(CSV_PATH)
data.columns = ['category', 'text']

In [5]:
# Keep only the rows whose category is one of the categories of interest.
categories = ['b', 't', 'e', 'm']  # Business, Science and Technology, Entertainment, Health
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the columns assigned to `data` later in the notebook are written to a real
# DataFrame and do not raise SettingWithCopyWarning.
data = data[data['category'].isin(categories)].copy()

In [6]:
# Text preprocessing
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case and tokenize ``text``, dropping stop words and non-alphanumeric tokens.

    Args:
        text: Raw document text. Any non-string value (for example the NaN that
            pandas uses for an empty CSV cell) is treated as an empty document.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); yield no tokens instead of raising.
        return []
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]


data['tokens'] = data['text'].apply(preprocess_text)

In [7]:
# Build one TaggedDocument per row, tagged with the row's index label as a string.
# Pairing the index with the 'tokens' column directly avoids iterrows(), which
# materialises a Series for every row just to read a single field.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(label)])
    for label, tokens in zip(data.index, data['tokens'])
]

In [8]:
# Train the Doc2Vec model on the tagged documents.
# NOTE(review): training runs with 4 worker threads; multi-threaded gensim
# training is presumably not bit-for-bit repeatable between runs — confirm
# before comparing accuracy figures across executions.
doc2vec_params = dict(
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=20,
)
model = Doc2Vec(tagged_data, **doc2vec_params)

In [9]:
# Turn every row's token list into an inferred document vector.
def vectorize_doc(doc):
    """Return the vector that ``model.infer_vector`` produces for ``doc.words``."""
    words = doc.words
    return model.infer_vector(words)


def _tokens_to_vector(tokens):
    """Wrap ``tokens`` in a TaggedDocument (tag 0) and vectorize it."""
    return vectorize_doc(TaggedDocument(tokens, [0]))


data['vector'] = data['tokens'].apply(_tokens_to_vector)

In [10]:
# Build the training and test sets: the 'vector' column supplies the features,
# the 'category' column the labels.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X = list(data['vector'])
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [11]:
# Train the decision-tree classifier.
# random_state is pinned to the same seed the train/test split uses, so that
# the tree fitted on a given training set is the same on every run instead of
# depending on the estimator's internal random tie-breaking.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [12]:
# Evaluate the classifier: predict the test set and report plain accuracy.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.3810086775589341
