In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import re
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Baixar recursos adicionais do NLTK (se ainda não os tiver)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Carregar os dados
data = pd.read_excel('Dataset_Tweets.xlsx')

# Pré-processamento do texto
def preprocess_text(text):
    # Converter para minúsculas
    text = text.lower()
    # Remover caracteres especiais e números
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenização
    tokens = word_tokenize(text)
    # Remover stop words
    stop_words = set(stopwords.words('portuguese'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lematização
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Reunir tokens em uma string novamente
    processed_text = ' '.join(tokens)
    return processed_text

data['processed_tweets'] = data['tweets'].apply(preprocess_text)

# Dividir os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(data['processed_tweets'], data['classificador'], test_size=0.45, random_state=42)

# Criar o pipeline de classificação
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

# Treinar o modelo
pipeline.fit(X_train, y_train)

# Avaliar o modelo
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

new_tweets = ["quero morrer.", "Que dia lindo para sair e aproveitar a vida!"]
new_tweets_processed = [preprocess_text(tweet) for tweet in new_tweets]
predictions_new_tweets = pipeline.predict(new_tweets_processed)
print(predictions_new_tweets)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                             precision    recall  f1-score   support

Post  com Potencial Suicida       0.91      0.93      0.92       182
           Post Não Suicida       0.94      0.92      0.93       211

                   accuracy                           0.93       393
                  macro avg       0.93      0.93      0.93       393
               weighted avg       0.93      0.93      0.93       393

['Post  com Potencial Suicida' 'Post Não Suicida']
