In [2]:
import pandas as pd
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [5]:
en = spacy.load('en_core_web_sm')
STOPWORDS = en.Defaults.stop_words

noticias = pd.read_json('../model/News_Category_Dataset_v3.json', lines=True)
df = noticias.drop(columns=['authors','link','date'])
df

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...
...,...,...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...
209523,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr..."
209524,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked..."
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...


In [6]:
#recursos necessários do NLTK
nltk.download('punkt')
nltk.download('stopwords')


# Pré-processamento de texto em inglês
stopwords = set(nltk.corpus.stopwords.words('english'))

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stopwords]
    return " ".join(tokens)


[nltk_data] Downloading package punkt to /home/anacleto/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anacleto/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
df['text'] = df['headline'] + " " + df['short_description']
df['text'] = df['text'].apply(preprocess_text)

In [8]:
# Vetorização dos textos usando TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

In [9]:
# Codificação das categorias
category_mapping = {category: i for i, category in enumerate(df['category'])}
y = df['category'].map(category_mapping)



In [10]:
# Separação em dados de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train,y_train)

In [12]:
# Predição e avaliação do modelo
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Acurácia do modelo:", accuracy)

Acurácia do modelo: 0.05734262396792822


In [13]:
print("Relatório de classificação:")
print(classification_report(y_test, y_pred))

Relatório de classificação:
              precision    recall  f1-score   support

        8673       0.00      0.00      0.00       269
       38431       0.48      0.02      0.03       665
       94811       0.48      0.02      0.03       741
       95466       0.00      0.00      0.00       275
      133328       0.31      0.01      0.03       270
      133341       0.25      0.00      0.01       238
      133537       0.41      0.02      0.03       550
      133590       0.11      0.01      0.01       263
      133618       0.27      0.04      0.07       464
      133622       0.50      0.02      0.04       532
      133628       0.62      0.06      0.11       530
      133638       0.73      0.04      0.07       209
      133639       0.09      0.00      0.01       202
      133647       0.37      0.02      0.04       534
      133648       0.52      0.03      0.06       795
      133651       0.09      0.00      0.01       293
      133657       0.82      0.03      0.07      7155