## PACOTES

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pickle

## CARREGAR DADOS (CSV)

In [None]:
# Load the pre-processed dataset (CSV) into the notebook-wide DataFrame `df`
df = pd.read_csv(filepath_or_buffer='/content/preprocessed2_data_removido.csv')

## MODELAGEM

In [None]:
# Names of the text (feature) and label (target) columns
text_column = 'text'
label_column = 'category'

# Encode the categorical labels as integers.
# The encoder is fitted on the whole label column *before* the split, so a rare
# category that happens to land only in the test split cannot make `transform`
# fail with "unseen labels". Only label names are involved, so no feature
# information leaks into training.
label_encoder = LabelEncoder()
y_codificado = label_encoder.fit_transform(df[label_column])

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_treino, X_teste, y_treino, y_teste = train_test_split(
  df[text_column], y_codificado, test_size=0.2, random_state=42
)

# Logistic Regression classifier.
# `max_iter` is only an upper bound on solver iterations, not a model
# hyperparameter: once the solver converges, a larger cap gives the same model.
# It is therefore fixed at the largest value previously tried instead of being
# searched, which cuts the number of cross-validation fits by 3x.
modelo = LogisticRegression(max_iter=5000)

# Hyperparameter grid. Keys use the `<step>__<param>` convention because the
# search is run over the whole pipeline (see below).
grades_parametros = {
  'clf__C': [0.01, 0.1, 1],
}

# Pipelines for the different text representations: (display name, pipeline)
pipelines = []

# 1. TfidfVectorizer with L2 normalisation
pipeline_tfidf_l2 = Pipeline([
  ('tfidf', TfidfVectorizer(max_features=10000, norm="l2")),
  ('clf', modelo),
])
pipelines.append(('TfidfVectorizer L2', pipeline_tfidf_l2))

# Train and evaluate every pipeline
for nome_modelo, pipeline_base in pipelines:
  print(f"** Treinando e avaliando {nome_modelo} com GridSearchCV...")

  # The grid search wraps the *whole* pipeline, so the TF-IDF vocabulary and IDF
  # weights are re-fitted on the training part of each CV fold. Nesting
  # GridSearchCV inside the pipeline would fit the vectorizer on all training
  # rows first, leaking validation-fold text statistics into the CV scores.
  busca = GridSearchCV(pipeline_base, grades_parametros, cv=5, scoring='accuracy')
  busca.fit(X_treino, y_treino)
  print(f"Melhores hiperparâmetros: {busca.best_params_}")

  # With the default refit=True, `best_estimator_` is the pipeline re-trained on
  # the full training split with the best parameters. It still accepts raw text
  # in `.predict`, like the previously saved object did.
  pipeline = busca.best_estimator_

  # Persist the fitted pipeline and, separately, its fitted vectorizer
  with open(f'{nome_modelo}_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

  with open(f'{nome_modelo}_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(pipeline.named_steps['tfidf'], vectorizer_file)

  # Predict on the held-out split and report per-class metrics.
  # `labels` is passed explicitly so that `target_names` always lines up with
  # the encoder's classes, even if some class is absent from the test split.
  y_pred = pipeline.predict(X_teste)
  print(f"Classification Report para {nome_modelo}:")
  print(classification_report(
    y_teste,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
  ))

# Persist the label encoder so predictions can be mapped back to category names
with open('label_encoder.pkl', 'wb') as encoder_file:
  pickle.dump(label_encoder, encoder_file)

** Treinando e avaliando TfidfVectorizer L2 com GridSearchCV...


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at /usr/local/lib/python3.10/dist-packages/uvicorn/server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-c78b2a3fd09a>", line 41, in <cell line: 41>
    run_with_ngrok(app)
  File "<ipython-input-11-c78b2a3fd09a>", line 38, in run_with_ngrok
    asyncio.get_event_loop().run_until_complete(uvicorn.run(app, host='0.0.0.0', port=8000))
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/main.py", line 575, in run
    server.run()
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 30, in run
    return loo

Classification Report para TfidfVectorizer L2:
              precision    recall  f1-score   support

     ciencia       0.94      0.73      0.82       127
      comida       0.92      0.65      0.76        92
   cotidiano       0.90      0.95      0.92      1638
    educacao       0.86      0.86      0.86       236
     esporte       0.99      0.98      0.98      1968
     mercado       0.93      0.93      0.93      2101
       mundo       0.94      0.97      0.95      1728
       poder       0.94      0.93      0.93      2222
     turismo       0.93      0.75      0.83       189

    accuracy                           0.94     10301
   macro avg       0.93      0.86      0.89     10301
weighted avg       0.94      0.94      0.94     10301



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import pickle

# Expects the DataFrame `df` (loaded in an earlier cell) to be in scope.

# Names of the text (feature) and label (target) columns
text_column = 'text'
label_column = 'category'

# Keep only the rows whose label is one of the selected categories
categorias_selecionadas = ['poder', 'mercado', 'esporte']
df_selecionado = df.loc[df[label_column].isin(categorias_selecionadas)]

# Hold out 20% of the filtered rows for testing (fixed seed)
X_treino, X_teste, y_treino, y_teste = train_test_split(
    df_selecionado[text_column],
    df_selecionado[label_column],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary/IDF learned on the training split only,
# then applied unchanged to the test split
vectorizer = TfidfVectorizer(max_features=10000)
X_treino_vec = vectorizer.fit_transform(X_treino)
X_teste_vec = vectorizer.transform(X_teste)

# Fit a logistic regression with default settings (`fit` returns the estimator)
modelo = LogisticRegression().fit(X_treino_vec, y_treino)

# Evaluate on the held-out split
y_pred = modelo.predict(X_teste_vec)
print(classification_report(y_teste, y_pred))

# Persist the trained model and the fitted vectorizer, one pickle file each
for caminho, objeto in (
    ('modelo_selecionado.pkl', modelo),
    ('vetorizador_selecionado.pkl', vectorizer),
):
    with open(caminho, 'wb') as arquivo:
        pickle.dump(objeto, arquivo)

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-4' coro=<Server.serve() done, defined at /usr/local/lib/python3.10/dist-packages/uvicorn/server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-cfcb2d9b31cf>", line 52, in <cell line: 50>
    uvicorn.run(app, host="0.0.0.0", port=8000)
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/main.py", line 575, in run
    server.run()
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_

              precision    recall  f1-score   support

     esporte       0.99      0.99      0.99      2013
     mercado       0.97      0.97      0.97      2099
       poder       0.96      0.97      0.97      2161

    accuracy                           0.97      6273
   macro avg       0.98      0.97      0.97      6273
weighted avg       0.97      0.97      0.97      6273

