In [17]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
import re

# Download the NLTK resources needed for tokenization and stop-word removal.
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models, so both are fetched to work on old and new versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Filled on first use so the stop-word set is built once instead of once per
# call (preprocess_text is applied to every row of the dataset).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, building it once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalize a piece of text for the TF-IDF vectorizer.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that are English stop words are dropped.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when *text*
        is not a string.
    """
    if not isinstance(text, str):
        return ''

    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in stop_words
    )

def load_data(filepath):
    """Read a header-less CSV of tweets into a DataFrame.

    The file is parsed with the fixed column names ``id``, ``entity``,
    ``sentiment`` and ``text``.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading failed for any reason
        (the error is printed rather than raised).
    """
    column_names = ['id', 'entity', 'sentiment', 'text']
    try:
        df = pd.read_csv(filepath, names=column_names, header=None)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Erro ao carregar o dataset: {e}")
        return None
    print("Dados carregados com sucesso.")
    return df

# Hand-written English sentences used as a qualitative check of the model.
_EXEMPLO_FRASES = [
    "I love this product!",
    "This is the worst experience I've ever had.",
    "It's okay, not bad.",
    "Absolutely fantastic!",
    "I hate it so much.",
    "The game is a bit boring.",
    "I'm thrilled about this new update!",
    "The movie was quite mediocre.",
    "This is the best purchase I've ever made.",
    "I'm really disappointed with the service.",
    "The book was an interesting read.",
    "I feel neutral about this feature.",
    "This restaurant has excellent food.",
    "The app crashes frequently, very frustrating.",
    "I'm excited about the new release!",
    "The product arrived late and damaged.",
    "I enjoyed the concert a lot.",
    "The experience was quite underwhelming.",
    "I'm happy with my new phone.",
    "The software update was a huge improvement.",
    "I'm not satisfied with the customer support.",
    "The quality of the product exceeded my expectations.",
    "The weather was terrible during my vacation.",
    "I had an amazing time at the event.",
    "The service was slow but the food was good.",
    "I'm not impressed with the new design.",
    "The movie was entertaining and engaging.",
    "I felt let down by the recent changes.",
    "The trip was okay, nothing special.",
    "I love the new features in the latest version.",
    "The product is okay but could be better.",
    "The restaurant ambiance was lovely.",
    "I'm frustrated with the frequent bugs.",
    "The book was a great read, highly recommended!",
    "The service was excellent and prompt.",
    "The new update made everything worse.",
    "I feel indifferent about the new changes.",
    "The performance was outstanding.",
    "The quality did not meet my expectations.",
    "I am very pleased with the purchase.",
    "The user interface is much improved now.",
    "The concert was an unforgettable experience.",
    "The game is too repetitive and boring.",
    "I am thrilled with the customer service!",
    "The hotel was clean but the location was poor.",
    "I'm dissatisfied with the recent upgrade.",
    "The movie was a complete waste of time.",
    "The new product features are amazing!",
]


def _clean_text_column(df):
    """Add a 'clean_text' column with the preprocessed 'text' column.

    Missing texts become empty strings before preprocessing. The DataFrame
    is modified in place and also returned for convenience.
    """
    df['text'] = df['text'].fillna('').astype(str)
    df['clean_text'] = df['text'].apply(preprocess_text)
    return df


def _evaluate_on_validation(model, df_val):
    """Print the classification report and confusion matrix for *df_val*."""
    _clean_text_column(df_val)
    X_val = df_val['clean_text']
    y_val = df_val['sentiment']

    y_pred_val = model.predict(X_val)
    print("\nRelatório de Classificação no conjunto de validação:")
    print(metrics.classification_report(y_val, y_pred_val))

    conf_matrix_val = metrics.confusion_matrix(y_val, y_pred_val)
    print("\nMatriz de Confusão no conjunto de validação:")
    print(conf_matrix_val)


def main(train_path="/content/twitter_training.csv",
         val_path="/content/twitter_validation.csv"):
    """Train, evaluate and demo a TF-IDF + Naive Bayes sentiment classifier.

    Steps: load and describe the training data, preprocess the text, fit the
    pipeline through ``GridSearchCV``, report 5-fold cross-validation scores,
    evaluate on the validation file when it can be loaded, and finally print
    predictions for a list of example sentences.

    Args:
        train_path: CSV file with the training tweets.
        val_path: CSV file with the validation tweets. If it cannot be
            loaded, the validation report is skipped.

    Returns:
        None. All results are printed to stdout.
    """
    print("Iniciando o programa...")

    df_train = load_data(train_path)
    if df_train is None:
        print("Não foi possível carregar os dados. Verifique o caminho do arquivo e tente novamente.")
        return

    # Quick look at the training data.
    print("\nNomes das colunas de treinamento:")
    print(df_train.columns)
    print("\nAmostra dos dados de treinamento:")
    print(df_train.head())

    if 'sentiment' not in df_train.columns:
        print("\nErro: A coluna 'sentiment' não foi encontrada no DataFrame de treinamento.")
        return
    print("\nDistribuição das classes de sentimento de treinamento:")
    print(df_train['sentiment'].value_counts())

    # Features and labels; the whole training file is used for fitting.
    _clean_text_column(df_train)
    X_train = df_train['clean_text']
    y_train = df_train['sentiment']

    model = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])

    # Single-candidate grid: the search only fits these fixed settings.
    parameters = {
        'tfidf__ngram_range': [(1, 3)],
        'tfidf__max_features': [15000],
        'clf__alpha': [0.3],
    }

    grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"\nMelhores Parâmetros: {grid_search.best_params_}")

    # NOTE(review): rows sharing the same 'id' look like near-duplicate
    # paraphrases, so plain K-fold scores may be optimistic; consider
    # grouping folds by 'id' - TODO confirm against the dataset.
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
    print("\nPontuações da Verificação Cruzada:")
    print(cv_scores)
    print(f"\nPontuação média da Verificação Cruzada: {np.mean(cv_scores):.4f}")

    df_val = load_data(val_path)
    if df_val is not None:
        _evaluate_on_validation(best_model, df_val)

    # The model was fitted on preprocessed text, so the example sentences
    # must go through the same preprocessing before prediction.
    frases_limpas = [preprocess_text(frase) for frase in _EXEMPLO_FRASES]
    predicoes = best_model.predict(frases_limpas)
    for frase, sentimento in zip(_EXEMPLO_FRASES, predicoes):
        print(f'Frase: "{frase}"\nSentimento Predito: {sentimento}\n')

    print("\nAnálise Crítica do Modelo:")
    print("Pontos fortes: O modelo ajustado pode apresentar melhorias em desempenho.")
    print("Pontos fracos: A detecção de sentimentos sutis ainda pode ser um desafio.")
    print("Oportunidades de melhorias: Continuar ajustando hiperparâmetros e explorar modelos mais avançados.")

    print("\nInformações adicionais:")
    # No 80/20 split is performed here; describe what actually happens.
    print("O modelo foi treinado com todo o conjunto de treinamento e avaliado por validação cruzada de 5 folds.")
    print("O pré-processamento incluiu a remoção de stopwords e conversão para minúsculas.")
    print("O modelo foi ajustado com Grid Search para melhorar o desempenho.")
    print("O modelo suporta análises em inglês.")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Iniciando o programa...
Dados carregados com sucesso.

Nomes das colunas de treinamento:
Index(['id', 'entity', 'sentiment', 'text'], dtype='object')

Amostra dos dados de treinamento:
     id       entity sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  

Distribuição das classes de sentimento de treinamento:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Melhores Parâmetros: {'clf__alpha': 0.3, 'tfidf_