In [9]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from xml.etree import ElementTree as ET
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import RSLPStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fetch the NLTK data packages this notebook relies on.
# 'stopwords' backs stopwords.words('portuguese'); 'rslp' holds the rule files
# that RSLPStemmer() loads on construction (without it the stemmer raises
# LookupError on a fresh environment). 'punkt' is kept from the original setup.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dezin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dezin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
import os
import requests
import zipfile

# Download the competition archive and unpack it under `pathFiles`.
url = 'https://www.dropbox.com/scl/fo/2vh6qw9x2ae8zoma7md98/ALGVx_ju4WiPjneRZ68crs8?rlkey=s919cfytsov4bafkvnufmpgwg&dl=1'

pathFiles = 'dados'
fileName = 'arquivos_competicao.zip'
filePath = os.path.join(pathFiles, fileName)
extractPath = os.path.join(pathFiles, 'dados')

# Idempotent directory creation (no isdir/mkdir race, works for nested paths).
os.makedirs(pathFiles, exist_ok=True)

# Stream the archive to disk in chunks. The timeout keeps a stalled connection
# from hanging the kernel, and the context manager releases the connection even
# if writing fails part-way.
download_ok = False
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(filePath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        download_ok = True
        print('Download finalizado.')
    else:
        print(f'Houve um erro: {response.status_code}')

# Only unpack what was just downloaded; after a failed request there is either
# no file or a stale one, and extracting it would hide the failure.
if download_ok:
    if zipfile.is_zipfile(filePath):
        with zipfile.ZipFile(filePath, 'r') as zip_ref:
            zip_ref.extractall(extractPath)
        print('Arquivo descompactado com sucesso.')
    else:
        print('O arquivo baixado não é um arquivo zip válido.')


Download finalizado.
Arquivo descompactado com sucesso.


In [None]:
# Funções auxiliares para processamento de texto
def extract_text_from_xml(file_path):
    """Return the headline and paragraph text of one XML document as a single string.

    The file is read as UTF-8. The first ``headline`` element found anywhere in
    the tree supplies the headline; every ``p`` element below the first ``text``
    element supplies one paragraph. Only the text directly inside each element
    is used (text inside nested child tags is not collected).

    Args:
        file_path: Path to the XML file.

    Returns:
        The stripped headline followed by the stripped paragraphs, joined with
        single spaces. A missing or empty headline contributes an empty string,
        so the result then starts with a space when paragraphs exist.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        root = ET.parse(file).getroot()

    headline = root.find('.//headline')
    # An element that exists but is empty (<headline/>) has text None, so guard
    # before stripping instead of assuming a string.
    headline_text = (headline.text or '').strip() if headline is not None else ''

    text_parts = []
    text_section = root.find('.//text')
    if text_section is not None:
        for p in text_section.findall('.//p'):
            if p.text:
                text_parts.append(p.text.strip())
    return ' '.join([headline_text] + text_parts)
    

def tokenize_and_stem(text):
    """Turn raw text into a list of stemmed Portuguese tokens.

    Steps, in order: remove every run of digits, lowercase, tokenize with the
    pattern ``\\b\\w+\\b``, drop NLTK's Portuguese stop words, and stem each
    remaining token with ``RSLPStemmer``.

    Requires the NLTK data packages ``stopwords`` and ``rslp`` to be installed.

    Args:
        text: Input string.

    Returns:
        List of stemmed tokens in their original order.
    """
    tokenizer = RegexpTokenizer(r'\b\w+\b')
    # A set gives constant-time membership tests; the original list was scanned
    # once per token.
    portuguese_stop_words = frozenset(stopwords.words('portuguese'))
    stemmer = RSLPStemmer()
    # Digits are deleted (not replaced by a space), so "a1b" becomes "ab".
    text = re.sub(r'\d+', '', text)
    tokens = tokenizer.tokenize(text.lower())
    return [stemmer.stem(token) for token in tokens if token not in portuguese_stop_words]


In [None]:
# Read the labelled index, load the article text for every listed ID, and hold
# out 20% of the examples for the final evaluation.
# NOTE(review): the download cell unpacks the archive under 'dados/dados', while
# the paths below are relative to the working directory - confirm they resolve.
data_path = './news/'
train_df = pd.read_csv('train.csv')

train_paths = [os.path.join(data_path, fname) for fname in train_df['ID']]
train_texts = [extract_text_from_xml(xml_path) for xml_path in train_paths]
train_labels = train_df['Class'].values

X_train, X_test, y_train, y_test = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Candidate text representations, all sharing NLTK's Portuguese stop-word list:
# raw term counts, TF-IDF weights, and binary presence/absence.
portuguese_stop_words = stopwords.words('portuguese')
vectorizers = {
    'TF': CountVectorizer(binary=False, stop_words=portuguese_stop_words),
    'TF-IDF': TfidfVectorizer(stop_words=portuguese_stop_words),
    'Binary': CountVectorizer(binary=True, stop_words=portuguese_stop_words),
}


# AUC Scorer para multi-classe
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """ROC AUC for label predictions, computed on binarized labels.

    Both label sequences are binarized with a ``LabelBinarizer`` fitted on
    ``y_test`` only, then passed to ``roc_auc_score``.

    Args:
        y_test: True labels.
        y_pred: Predicted labels (class labels, not probabilities).
        average: Averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    # NOTE(review): scoring hard labels instead of probabilities discards the
    # ranking information AUC normally measures - confirm this is intended.
    binarizer = LabelBinarizer().fit(y_test)
    truth_indicator = binarizer.transform(y_test)
    predicted_indicator = binarizer.transform(y_pred)
    return roc_auc_score(truth_indicator, predicted_indicator, average=average)

# Best refitted pipeline found for each vectorizer, keyed by vectorizer name.
pipelines = {}

# The smoothing grid and the scorer are the same for every vectorizer, so they
# are built once up front.
param_grid = {'classifier__alpha': [0.01, 0.1, 1.0]}
auc_scorer = make_scorer(multiclass_roc_auc_score)

# Grid-search Multinomial Naive Bayes on top of each vectorizer with 5-fold CV.
for key, vectorizer in vectorizers.items():
    pipeline = Pipeline(steps=[
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB()),
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=auc_scorer)
    grid_search.fit(X_train, y_train)
    pipelines[key] = grid_search.best_estimator_
    print(f"AUC para o vetorizador {key}: {grid_search.best_score_:.2f}")



In [None]:
# Choose the vectorizer by cross-validated AUC on the training split.
# Ranking by pipeline.score(X_train, y_train) would use training-set accuracy:
# a different metric than the one reported, measured on data the model has
# already seen, which favours whichever pipeline overfits most.
# cross_val_score clones each estimator, so the fitted pipelines are untouched.
selection_scorer = make_scorer(multiclass_roc_auc_score)
cv_auc = {
    name: cross_val_score(candidate, X_train, y_train, cv=5, scoring=selection_scorer).mean()
    for name, candidate in pipelines.items()
}
best_key = max(cv_auc, key=cv_auc.get)
best_pipeline = pipelines[best_key]

# Final, one-off evaluation on the held-out split.
y_pred = best_pipeline.predict(X_test)
test_auc = multiclass_roc_auc_score(y_test, y_pred)
print(f"AUC no conjunto de teste para o melhor vetorizador ({best_key}): {test_auc:.2f}")