In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
df = pd.read_csv("Tweets.csv")
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [7]:
# 2. Pré-processamento de dados
def preprocess_text(text):
    if isinstance(text, float):  # Verifica se o valor é float
        return ''
        
    # Remove caracteres indesejados
    text = text.replace('#', '').replace('_', ' ')
    
    # Remove links
    text = ' '.join(word for word in text.split() if not word.startswith(('http', 'https')))
    
    # Remove menções de usuários
    text = ' '.join(word for word in text.split() if not word.startswith('@'))
    
    # Converte para minúsculas
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    text = ' '.join(word for word in tokens if word not in stop_words)
    
    # Lemmatização
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    text = ' '.join(lemmatizer.lemmatize(word) for word in tokens)
    
    return text

# Realiza o pré-processamento dos tweets
df['clean_text'] = df['text'].apply(preprocess_text)

In [8]:
# Cria o vetorizador de contagem de palavras
vectorizer = CountVectorizer()

# Ajusta o vetorizador aos dados completos
X = vectorizer.fit_transform(df['clean_text'])

In [9]:
# Extrai os recursos do texto
y = df['sentiment']

In [11]:
%%time

# Modelo de regressão logistica 

# Importação da biblioteca
from sklearn.linear_model import LogisticRegression

# Nome do algoritmo M.L
model_logistic = LogisticRegression() 

# Treinamento do modelo
model_logistic_fit = model_logistic.fit(X, y)

# Score do modelo dados treino x
model_logistic_score = model_logistic.score(X, y)

# Score do modelo dados treino y
print("Model - Logistic Regression: %.2f" % (model_logistic_score * 100))

Model - Logistic Regression: 88.96
CPU times: total: 578 ms
Wall time: 1.87 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Previsão modelo com função predict de previsão das frases

model_logistic_pred = model_logistic.predict(X)
model_logistic_pred

array(['neutral', 'negative', 'negative', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [16]:
# Acúracia do modelo de Regressão logística
from sklearn import metrics
from sklearn.metrics import accuracy_score

accuracy_dt = accuracy_score(y, model_logistic_pred)
print("Acurácia - Regressão logística: %.2f" % (accuracy_dt * 100))

Acurácia - Regressão logística: 88.96


In [17]:
# Classification report
from sklearn.metrics import classification_report

classification = classification_report(model_logistic_pred, y)
print("Modelo - Regressão logística")
print()
print(classification)

Modelo - Regressão logística

              precision    recall  f1-score   support

    negative       0.85      0.91      0.88      7275
     neutral       0.91      0.86      0.89     11783
    positive       0.89      0.91      0.90      8423

    accuracy                           0.89     27481
   macro avg       0.89      0.89      0.89     27481
weighted avg       0.89      0.89      0.89     27481



In [18]:
from sklearn.metrics import confusion_matrix

matrix_1 = confusion_matrix(model_logistic_pred, y)
matrix_1

array([[ 6649,   461,   165],
       [  889, 10137,   757],
       [  243,   520,  7660]], dtype=int64)

In [19]:
%%time

# Modelo machine learning - Naive bayes

# Importação da biblioteca
from sklearn.naive_bayes import MultinomialNB

# Nome do algoritmo M.L
model_naive_bayes = MultinomialNB()

# Treinamento do modelo
model_naive_bayes_fit = model_naive_bayes.fit(X, y)

# Score do modelo dados treino x
model_naive_bayes_scor = model_naive_bayes.score(X, y)

 # Score do modelo dados treino y
print("Model - Naive Bayes: %.2f" % (model_naive_bayes_scor * 100))

Model - Naive Bayes: 82.49
CPU times: total: 31.2 ms
Wall time: 78.6 ms


In [20]:
# Previsão modelo com função predict de previsã das frases

model_naive_bayes_pred = model_naive_bayes.predict(X)
model_naive_bayes_pred

array(['neutral', 'negative', 'negative', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [21]:
# Previsão modelo com função log_proba de probabilidades das frases

model_naive_bayes_prob = model_naive_bayes.predict_proba(X).round(2)
print(model_naive_bayes_prob)

[[0.22 0.61 0.17]
 [0.98 0.02 0.  ]
 [0.54 0.29 0.17]
 ...
 [0.   0.01 0.99]
 [0.19 0.28 0.52]
 [0.01 0.43 0.55]]


In [22]:
# Acúracia do modelo de Naive bayes
accuracy_naive_bayes = metrics.accuracy_score(y, model_naive_bayes_pred)
print("Accuracy model Naive bayes: %.2f" % (accuracy_naive_bayes * 100))

Accuracy model Naive bayes: 82.49


In [23]:
# Classification report
classification = classification_report(model_naive_bayes_pred, y)
print("Modelo - Naive bayes")
print()
print(classification)

Modelo - Naive bayes

              precision    recall  f1-score   support

    negative       0.78      0.86      0.82      7025
     neutral       0.85      0.79      0.82     11927
    positive       0.84      0.84      0.84      8529

    accuracy                           0.82     27481
   macro avg       0.82      0.83      0.83     27481
weighted avg       0.83      0.82      0.82     27481



In [25]:
# Confusion matrix
matrix_2 = confusion_matrix(model_naive_bayes_pred, y)
matrix_2

array([[6068,  723,  234],
       [1361, 9410, 1156],
       [ 352,  985, 7192]], dtype=int64)

In [26]:
import pickle

# Salvar o modelo em um arquivo
modelo_rl = 'modelo_rl.pkl'
with open(modelo_rl, 'wb') as file:
    pickle.dump(model_logistic, file)
    
    
# Salvar o modelo em um arquivo
modelo_naive = 'modelo_naive.pkl'
with open(modelo_naive, 'wb') as file:
    pickle.dump(model_naive_bayes, file)