In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
import numpy as np


In [None]:
# Install the Kaggle command-line client at a pinned version; it is used by a
# later cell to download the dataset.
!pip install kaggle==1.5.12

In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API credentials) from the local machine.
# files.upload() returns a {filename: file-bytes} mapping. Binding it to a name
# keeps the notebook from echoing that mapping as the cell result, which would
# store the API key in the saved cell output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle/ and restrict it to
# owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the augustop/portuguese-tweets-for-sentiment-analysis dataset
# archive with the Kaggle CLI.
!kaggle datasets download -d augustop/portuguese-tweets-for-sentiment-analysis

Downloading portuguese-tweets-for-sentiment-analysis.zip to /content
 98% 220M/224M [00:02<00:00, 90.7MB/s]
100% 224M/224M [00:02<00:00, 81.6MB/s]


In [None]:
# Extract the downloaded dataset archive.
!unzip /content/portuguese-tweets-for-sentiment-analysis.zip


### Carregando os dados

In [None]:
# Read the NoThemeTweets CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/NoThemeTweets.csv')

In [None]:
# (rows, columns) of the loaded data.
df.shape

(785814, 5)

In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

sentiment
Negativo    522707
Positivo    263107
Name: count, dtype: int64

Como há muito mais tweets negativos do que positivos, começarei reduzindo aleatoriamente os tweets negativos da base, para que as duas classes fiquem balanceadas.

In [None]:
# Undersample the majority ('Negativo') class so that both sentiment labels end
# up with the same number of tweets.

# Fixed seed so the same rows are dropped on every run of the notebook.
UNDERSAMPLE_SEED = 42

# Count tweets per sentiment label (a single value_counts pass).
sentiment_counts = df['sentiment'].value_counts()
positive_count = sentiment_counts['Positivo']
negative_count = sentiment_counts['Negativo']

# Number of negative tweets to discard. Clamped at zero so the cell does nothing
# (instead of raising) when negatives do not outnumber positives.
samples_to_remove = max(negative_count - positive_count, 0)

# Pick, without replacement, which negative rows to drop, using a dedicated
# seeded generator instead of NumPy's unseeded global state.
negative_indices = df[df['sentiment'] == 'Negativo'].index
undersample_rng = np.random.default_rng(UNDERSAMPLE_SEED)
indices_to_remove = undersample_rng.choice(
    negative_indices.to_numpy(), size=samples_to_remove, replace=False
)

# New DataFrame without the excess negative samples.
df = df.drop(indices_to_remove)

In [None]:
# Tweets per sentiment label after dropping the excess negative rows.
df['sentiment'].value_counts()

sentiment
Positivo    263107
Negativo    263107
Name: count, dtype: int64

In [None]:
# Column names available in the DataFrame.
df.columns

Index(['id', 'tweet_text', 'tweet_date', 'sentiment', 'query_used'], dtype='object')

In [None]:


# Download the NLTK resources needed below (the stopword lists).
nltk.download('stopwords')

# Patterns and tables shared by every preprocess_tweet call. They are built once
# here instead of on each of the hundreds of thousands of calls.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return (stop_words, stemmer), creating them on first use only."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('portuguese'))
        # NOTE(review): PorterStemmer is NLTK's English Porter stemmer. For
        # Portuguese text a Portuguese stemmer would likely fit better; it is
        # kept here so the cleaned text matches what the original produced.
        _NLTK_RESOURCES['stemmer'] = PorterStemmer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['stemmer']


def preprocess_tweet(tweet):
    """Clean a raw tweet and return it as a space-joined string of stems.

    Steps, in order: lowercase; strip URLs (``http...``), @mentions, #hashtags
    and digit runs; delete ASCII punctuation; drop Portuguese stopwords; stem
    each remaining word.

    Args:
        tweet: Raw tweet text. Any non-string value (e.g. ``None`` or the NaN
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        The cleaned tweet, or ``''`` when nothing is left after cleaning.
    """
    # Missing cells arrive as float NaN / None; .lower() would raise on them.
    if not isinstance(tweet, str):
        return ''

    text = tweet.lower()
    # Order matters: URLs go first so their digits/punctuation vanish with them.
    for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE, _DIGITS_RE):
        text = pattern.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)

    words = text.split()
    if not words:
        return ''

    stop_words, stemmer = _get_nltk_resources()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)



# Clean every tweet with preprocess_tweet.
df['tweet_clean'] = df['tweet_text'].apply(preprocess_tweet)

# Target variable: the sentiment label of each tweet.
y = df['sentiment']

# Split the cleaned TEXT first, so the vocabulary below is learned from the
# training tweets only. Fitting the vectorizer on the whole corpus before
# splitting lets information from the test tweets leak into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['tweet_clean'], y, test_size=0.2, random_state=42
)

# Bag-of-words features: fit on the training split, reuse on the test split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Feature matrix for the full corpus, kept under its original name in case a
# later cell reads X.
X = vectorizer.transform(df['tweet_clean'])

# Train the logistic regression model. max_iter is raised because the default
# iteration cap stopped the solver before convergence on this data.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate the model.
print("Acurácia:", accuracy_score(y_test, y_pred))
print("Relatório de Classificação:\n", classification_report(y_test, y_pred))

# Optional: persist the fitted model and vectorizer for reuse.
import joblib
joblib.dump(model, 'modelo_logistico.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acurácia: 0.7700179584390411
Relatório de Classificação:
               precision    recall  f1-score   support

    Negativo       0.78      0.76      0.77     52686
    Positivo       0.76      0.78      0.77     52557

    accuracy                           0.77    105243
   macro avg       0.77      0.77      0.77    105243
weighted avg       0.77      0.77      0.77    105243



['vectorizer.pkl']

# Testando o modelo com novos dados

### 1ª Empresa: Bradesco

In [None]:

# Load the Bradesco tweets, keeping only the first occurrence of each fully
# duplicated row.
dados_bradesco = pd.read_csv('/content/bradesco.csv').drop_duplicates()

In [None]:
# Run every Bradesco tweet through the same cleaning function used for training.
bradesco_raw_text = dados_bradesco['text']
dados_bradesco['tweet_clean'] = bradesco_raw_text.map(preprocess_tweet)

In [None]:
# Vectorize the cleaned tweets with the already-fitted vectorizer, then classify.
bradesco_features = vectorizer.transform(dados_bradesco['tweet_clean'])
predict_bradesco = model.predict(bradesco_features)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go  # needed by go.Figure / go.Bar; it was never imported

# Count how many tweets received each predicted label.
unique_predictions, counts = np.unique(predict_bradesco, return_counts=True)

# One color per bar.
colors = ['#1f77b4', '#ff7f0e']

# Build the bar chart. The color list is sliced so it never has more entries
# than there are bars (e.g. when only one label was predicted).
fig = go.Figure(data=[go.Bar(
    x=unique_predictions,
    y=counts,
    marker_color=colors[:len(unique_predictions)],
)])

# Title and axis labels so the figure stands on its own.
fig.update_layout(
    title='Distribuição das Predições Bradesco',
    xaxis_title='Predictions',
    yaxis_title='Count',
)

# Display the plot.
fig.show()

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

# Count how many tweets received each predicted label.
# The labels are bound to `unique_predictions`: the previous `y, counts = ...`
# overwrote the training labels `y` and left the plot reading whatever
# `unique_predictions` an earlier cell had defined.
unique_predictions, counts = np.unique(predict_bradesco, return_counts=True)

# Bar chart of the prediction distribution.
fig = px.bar(x=unique_predictions, y=counts,
             labels={'x':'Predictions', 'y':'Count'},
             title='Distribuição das Predições Bradesco')

# Display the plot.
fig.show()

In [None]:
# Show complete cell text (no column-width truncation) when displaying frames.
pd.options.display.max_colwidth = None

In [None]:
# Store the predicted label next to each tweet.
dados_bradesco.loc[:, 'predict'] = predict_bradesco

In [None]:
# Display only the tweets classified as negative.
bradesco_is_negative = dados_bradesco['predict'] == 'Negativo'
dados_bradesco.loc[bradesco_is_negative]

### 2ª Empresa: Petrobrás

In [None]:

# Read both Petrobras exports, stack them, and drop fully duplicated rows.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/petrobras.csv', '/content/petrobras2.csv')
)
df_petrobras = pd.concat([df1, df2])
df_petrobras.drop_duplicates(inplace=True)

In [None]:
# Run every Petrobras tweet through the same cleaning function used for training.
petrobras_raw_text = df_petrobras['text']
df_petrobras['tweet_clean'] = petrobras_raw_text.map(preprocess_tweet)

In [None]:

# Vectorize the cleaned tweets with the already-fitted vectorizer, then classify.
petrobras_features = vectorizer.transform(df_petrobras['tweet_clean'])
predict_petrobras = model.predict(petrobras_features)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

# Tally the predictions per label.
unique_predictions, counts = np.unique(predict_petrobras, return_counts=True)

# Bar chart of the tally, one bar per predicted label.
petrobras_axis_labels = {'x':'Predictions', 'y':'Count'}
fig = px.bar(
    x=unique_predictions,
    y=counts,
    labels=petrobras_axis_labels,
    title='Distribuição das Predições Petrobrás',
)

# Render the figure.
fig.show()

In [None]:
# Store the predicted label next to each tweet.
df_petrobras.loc[:, 'predict'] = predict_petrobras

In [None]:
# Display only the tweets classified as negative.
petrobras_is_negative = df_petrobras['predict'] == 'Negativo'
df_petrobras.loc[petrobras_is_negative]

### 3ª Empresa: BurgerKingBR

In [None]:

import matplotlib.pyplot as plt
import plotly.express as px

# Load the BurgerKingBR tweets and drop fully duplicated rows.
# The file is read once: the earlier version read the same CSV into two frames
# and concatenated them, which drop_duplicates then undid, so the second read
# only cost time and memory. The scratch names df1/df2 are no longer assigned here.
# NOTE(review): if a second export exists (as with the two Petrobras files),
# read it here and concatenate before dropping duplicates.
df_BurgerKingBR = pd.read_csv('/content/BurgerKingBR.csv')
df_BurgerKingBR.drop_duplicates(inplace=True)

# Clean the tweets with the same function used for training.
df_BurgerKingBR['tweet_clean'] = df_BurgerKingBR['text'].apply(preprocess_tweet)

# Vectorize with the already-fitted vectorizer and classify.
predict_BurgerKingBR = model.predict(vectorizer.transform(df_BurgerKingBR['tweet_clean']))

# Count how many tweets received each predicted label.
unique_predictions, counts = np.unique(predict_BurgerKingBR, return_counts=True)

# Bar chart of the prediction distribution.
fig = px.bar(x=unique_predictions, y=counts,
             labels={'x':'Predictions', 'y':'Count'},
             title='Distribuição das Predições BurgerKingBR')

# Display the plot.
fig.show()

In [None]:

# Load the BurgerKingBR tweets and drop fully duplicated rows.
# The file is read once: the earlier version read the same CSV into two frames
# and concatenated them, which drop_duplicates then undid, so the second read
# only cost time and memory. The scratch names df1/df2 are no longer assigned here.
# NOTE(review): if a second export exists (as with the two Petrobras files),
# read it here and concatenate before dropping duplicates.
df_BurgerKingBR = pd.read_csv('/content/BurgerKingBR.csv')
df_BurgerKingBR.drop_duplicates(inplace=True)

In [None]:
# Run every BurgerKingBR tweet through the same cleaning function used for training.
burgerking_raw_text = df_BurgerKingBR['text']
df_BurgerKingBR['tweet_clean'] = burgerking_raw_text.map(preprocess_tweet)

In [None]:
# Vectorize the cleaned tweets with the already-fitted vectorizer, then classify.
burgerking_features = vectorizer.transform(df_BurgerKingBR['tweet_clean'])
predict_BurgerKingBR = model.predict(burgerking_features)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

# Tally the predictions per label.
unique_predictions, counts = np.unique(predict_BurgerKingBR, return_counts=True)

# Bar chart of the tally, one bar per predicted label.
burgerking_axis_labels = {'x':'Predictions', 'y':'Count'}
fig = px.bar(
    x=unique_predictions,
    y=counts,
    labels=burgerking_axis_labels,
    title='Distribuição das Predições BurgerKingBR',
)

# Render the figure.
fig.show()