<a href="https://colab.research.google.com/github/PauloPrudente/AnaliseSentimento/blob/main/AnaliseSentimentos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Referências:

- <https://www.linkedin.com/pulse/realizando-an%C3%A1lise-de-sentimento-partir-coment%C3%A1rios-da-rodrigo-correa/>
- <https://www.linkedin.com/pulse/analisando-sentimento-em-notas-de-app-python-parte-2-rodrigo-correa/>
- <https://www.linkedin.com/pulse/analisando-sentimento-em-notas-de-app-python-parte-3-rodrigo-correa/>

In [None]:
pip install google-play-scraper

In [None]:
#Importando o comando app da library google-play-scraper
from google_play_scraper import app

In [None]:
# Query google_play_scraper for the 'com.rappi.storekeeper' package
# (language 'pt', country 'br') and display what comes back.
result = app(
    'com.rappi.storekeeper',
    lang='pt',
    country='br',
)
result

In [None]:
# Download all reviews of the app.
from google_play_scraper import Sort, reviews_all

# Request every review, sorted by relevance, with no pause between requests.
Reviews = reviews_all(
    'com.rappi.storekeeper',
    lang='pt',
    country='br',
    sort=Sort.MOST_RELEVANT,
    sleep_milliseconds=0,
)

In [None]:
# Import pandas.
import pandas as pd

# Wrap the downloaded reviews in a DataFrame so they can be analysed below.
Reviews_Rappi = pd.DataFrame(data=Reviews)

# Display the resulting frame.
Reviews_Rappi

In [None]:
# Import NLTK and download the resources the notebook needs.
import nltk

nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tables from 'punkt_tab' instead of
# 'punkt'. On older releases this id is unknown and download() just reports
# it and returns False, so requesting both keeps the cell portable.
nltk.download('punkt_tab')

# Tokenize every review into a list of words, overwriting 'content'.
# - Missing texts become '' so word_tokenize() is never handed None/NaN.
# - The reviews are Portuguese, so use the Portuguese sentence model rather
#   than word_tokenize's English default.
# - Series.apply works on the column directly instead of building a row
#   object per review (DataFrame.apply with axis=1).
# NOTE: this cell is not idempotent - running it twice passes lists to
# word_tokenize and raises; re-create Reviews_Rappi before re-running.
Reviews_Rappi['content'] = (
    Reviews_Rappi['content']
    .fillna('')
    .apply(lambda texto: nltk.word_tokenize(texto, language='portuguese'))
)

In [None]:
import re
# Alias the corpus reader: binding the word list to the same name as the
# imported reader (as before) made this cell fail on a second run, because
# `stopwords` was by then a plain list with no .words() method.
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
language = 'portuguese'

# Build the stop-word list used by remove_stopwords(): unique entries,
# sorted so the list is identical from run to run.
stopwords = sorted(set(nltk_stopwords.words(language)))

In [None]:
def remove_stopwords(words):
    """Return the tokens of `words` that are not in the module-level `stopwords`.

    The order of the remaining tokens is preserved.
    """
    return [token for token in words if token not in stopwords]

In [None]:
def to_lowercase(words):
    """Return a new list with every token of `words` converted to lowercase."""
    return [token.lower() for token in words]

In [None]:
def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed; tokens made only of such characters disappear from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

In [None]:
def normalize(words):
    """Clean a list of tokens and return them as one space-separated string.

    Steps, in order: lowercase, strip punctuation, remove stop words.
    """
    cleaned = remove_stopwords(remove_punctuation(to_lowercase(words)))
    return ' '.join(cleaned)

In [None]:
# Normalize each tokenized review into a cleaned, space-separated string.
# Series.apply runs on the column itself: no per-row object is built, and an
# empty frame yields an empty Series (DataFrame.apply with axis=1 would
# return a DataFrame there).
Reviews_Rappi['content'] = Reviews_Rappi['content'].apply(normalize)

In [None]:
# Load the polarity lexicon and build a {word: polarity} dictionary.
# Each usable line looks like "<word>.<attributes>;POL:N0=<polarity>;...".
dic_palavra_polaridade = {}

# `with` closes the file even if parsing fails; the encoding is explicit so
# accented words do not depend on the platform default.
with open('SentiLex-lem-PT02.txt', encoding='utf-8') as sentilexpt:
    for linha in sentilexpt:
        pos_ponto = linha.find('.')
        pol_pos = linha.find('POL')
        # Skip blank/malformed lines: with find() == -1 the slices below
        # would silently produce garbage keys and values.
        if pos_ponto == -1 or pol_pos == -1:
            continue

        palavra = linha[:pos_ponto]
        # Polarity is the text between the first '=' after "POL" and the
        # next ';' (e.g. "POL:N0=-1;" -> "-1"). When a line carries several
        # POL fields only the first is used, as before.
        _, separador, resto = linha[pol_pos:].partition('=')
        polaridade = resto.partition(';')[0].strip()
        try:
            int(polaridade)
        except ValueError:
            # Not an integer: Score_sentimento() would crash on int() later.
            continue
        if not separador:
            continue

        # Stored as a string, as before; consumers convert with int().
        dic_palavra_polaridade[palavra] = polaridade


# Inspect the dictionary.

dic_palavra_polaridade

In [None]:
#Criando uma função chamada "Score de Sentimento" para determinar os #sentimentos associados
def Score_sentimento(frase):
    """Score a sentence with the polarity lexicon and label the result.

    The sentence is lowercased and split on whitespace; each word contributes
    its polarity from `dic_palavra_polaridade` (0 when the word is absent).

    Returns 'Pos <score> ' or 'Neu <score> ' (both with a trailing space) or
    'Neg <score>' (no trailing space), depending on the sign of the sum.
    """
    score = sum(
        int(dic_palavra_polaridade.get(palavra, 0))
        for palavra in frase.lower().split()
    )
    if score < 0:
        return 'Neg {}'.format(score)
    if score == 0:
        return 'Neu {} '.format(score)
    return 'Pos {} '.format(score)

In [None]:
# Compute a sentiment label/score string for every review from the positive
# and negative words it contains. Series.apply works on the column directly
# instead of building a row object per review.
Reviews_Rappi['sentimento'] = Reviews_Rappi['content'].apply(Score_sentimento)

In [None]:
# Split the "<label> <score>" strings into separate columns for the model.
# Splitting on whitespace replaces the previous fixed-width slices, which
# only worked for one-digit scores: 'Pos 12 ' became 2 and 'Neg -12' became
# +12, and the label kept a stray space. split() also ignores the trailing
# space that the 'Pos'/'Neu' strings carry.
partes_sentimento = Reviews_Rappi['sentimento'].str.split()
Reviews_Rappi['Score_Sentimento'] = partes_sentimento.str[1].astype(int)
Reviews_Rappi['Sent'] = partes_sentimento.str[0]

In [None]:
# Check how the reviews are distributed across the sentiment scores created above.
(
    Reviews_Rappi
    .groupby('Score_Sentimento')
    .count()
)

In [None]:
# Keep just the review texts in their own object.
content = Reviews_Rappi.loc[:, 'content']

In [None]:
# Concatenate every review into the single string the word cloud needs.
# Join with a space: an empty separator glued the last word of each review
# to the first word of the next one, inventing words that never occurred.
all_content = " ".join(content)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Import what is needed from the wordcloud library.
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
# Build a new stop-word set from wordcloud's STOPWORDS.
# NOTE(review): this rebinds the module-level name `stopwords`, which until
# this cell held the NLTK word list that remove_stopwords() reads. Calling
# normalize()/remove_stopwords() after this cell filters against this set
# instead - re-run the NLTK stop-word cell first if that is needed.
stopwords = set(STOPWORDS)

In [None]:
# After reviewing the clouds, add the "noise" terms that showed up in them.
stopwords.update((
    "pra", "app", "aplicativo", "vc", "to", "os", "rappi", "vcs", "nao",
    "pq", "mim", "ai", "ta", "ja", "ter", "fazer", "lá", "deu", "dado",
    "então", "vou", "vai", "veze", "ficar", "tá", "apena",
))

In [None]:
# Build the word cloud from all_content on a black background.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='black',
    width=1600,
    height=800,
).generate(all_content)

# Draw the cloud once, on an explicit axes with the frame hidden. The former
# extra plt.imshow(wordcloud) painted a second image over the first, which
# discarded the bilinear interpolation requested here.
fig, ax = plt.subplots(figsize=(16, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
# Build a dataset with only the columns the model will use; nothing that
# identifies the reviewer is carried over.
Rappi = Reviews_Rappi[[
    'content',
    'thumbsUpCount',
    'reviewCreatedVersion',
    'at',
    'score',
    'Score_Sentimento',
    'Sent',
]]

In [None]:
# Same procedure, restricted to negative ratings (scores 1 and 2).
Negative = Rappi[Rappi.score < 3]
Neg_Content = Negative['content']
# Join with a space: an empty separator fused the last word of each review
# with the first word of the next one.
all_neg_content = " ".join(Neg_Content)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='orange',
    width=1600,
    height=800,
).generate(all_neg_content)

# Draw once; a second plt.imshow() would repaint the image without the
# bilinear interpolation.
fig, ax = plt.subplots(figsize=(16, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
# Same procedure for ratings treated as neutral (score == 3).
Neutral = Rappi[Rappi.score == 3]
Neu_Content = Neutral['content']
# Join with a space: an empty separator fused the last word of each review
# with the first word of the next one.
all_neu_content = " ".join(Neu_Content)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='blue',
    width=1600,
    height=800,
).generate(all_neu_content)

# Draw once; a second plt.imshow() would repaint the image without the
# bilinear interpolation.
fig, ax = plt.subplots(figsize=(16, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
# Finally, the same procedure for positive ratings (scores 4 and 5).
Positive = Rappi[Rappi.score > 3]
Pos_Content = Positive['content']
# Join with a space: an empty separator fused the last word of each review
# with the first word of the next one.
all_pos_content = " ".join(Pos_Content)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='green',
    width=1600,
    height=800,
).generate(all_pos_content)

# Draw once; a second plt.imshow() would repaint the image without the
# bilinear interpolation.
fig, ax = plt.subplots(figsize=(16, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
# Vectorization: turn the review texts into numeric term counts.
from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 1000 terms, then materialize the counts as a dense array.
vectorizer = CountVectorizer(max_features=1000)
data_features = vectorizer.fit_transform(Rappi['content']).toarray()

In [None]:
# Target vector: the score given in each review.
labels = Rappi.score.values

In [None]:
# Split the data into training and testing sets (70% / 30%, fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Classify the reviews with a Random Forest and report the mean
# cross-validated score.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random_state pins the bootstrap/feature sampling: without it every run
# produced a different forest and different scores. 42 matches the seed
# already used for train_test_split.
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=42)

forest = forest.fit(X_train, y_train)

print(forest)

# 10-fold cross-validation over the full feature matrix.
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

In [None]:
# Predict on the held-out split and plot the confusion matrix.
result = forest.predict(X_test)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

sns.set(context="poster")
sns.set_style('whitegrid')

# Pin the class order explicitly. Without `labels`, confusion_matrix only
# includes classes present in y_test/result, so a missing score made the
# matrix smaller than 5x5 and the DataFrame construction below failed.
star_ratings = [1, 2, 3, 4, 5]
conf_mat = confusion_matrix(y_test, result, labels=star_ratings)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
print(conf_mat)

# Rows and columns are labelled "1".."5".
df_cm = pd.DataFrame(conf_mat,
                     index=[str(r) for r in star_ratings],
                     columns=[str(r) for r in star_ratings])
plt.figure(figsize=(10, 7))

sns.heatmap(df_cm, cmap=cmap, annot=True, fmt='g').set_title('Confusion Matrix para Modelo Random Forest')

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split; the high iteration cap
# is kept from the original configuration.
LogReg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

print(LogReg)

# Mean 10-fold cross-validated score over the full feature matrix.
print(
    np.mean(
        cross_val_score(LogReg, data_features, labels, cv=10)
    )
)

In [None]:
# Predict on the held-out split and plot the confusion matrix.
result_logreg = LogReg.predict(X_test)

# Pin the class order explicitly. Without `labels`, confusion_matrix only
# includes classes present in y_test/result_logreg, so a missing score made
# the matrix smaller than 5x5 and the DataFrame construction below failed.
star_ratings = [1, 2, 3, 4, 5]
conf_mat = confusion_matrix(y_test, result_logreg, labels=star_ratings)

cmap = sns.diverging_palette(120, 50, as_cmap=True)
print(conf_mat)

# Rows and columns are labelled "1".."5".
df_cm = pd.DataFrame(conf_mat,
                     index=[str(r) for r in star_ratings],
                     columns=[str(r) for r in star_ratings])
plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, cmap=cmap, annot=True, fmt='g').set_title('Confusion Matrix para Modelo Logistic Regression')

In [None]:
# New split, this time on the raw text column rather than the count matrix.
X = Rappi['content']
y = Rappi['score']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit the vocabulary on the training texts only, then encode both splits.
cvec = CountVectorizer(max_features=1000).fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    content_terms = cvec.get_feature_names_out()
else:
    content_terms = cvec.get_feature_names()

# The first training set is called df_train. toarray() yields a plain
# ndarray (todense() returns the discouraged np.matrix type).
df_train = pd.DataFrame(cvec.transform(X_train).toarray(), columns=content_terms)
df_test = pd.DataFrame(cvec.transform(X_test).toarray(), columns=content_terms)
print(df_train.shape)
print(y_train.shape)
print(df_test.shape)
print(y_test.shape)

In [None]:
# Encode the app version each review was written for.
X = Rappi['reviewCreatedVersion'].apply(str)
y = Rappi['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# token_pattern=r'\S+' keeps each version string (e.g. "1.2.3") as a single
# token. The default pattern splits on "." and keeps only pieces of two or
# more characters, so "1.2.3" produced no feature at all.
cvec = CountVectorizer(max_features=1000, token_pattern=r'\S+').fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    version_terms = cvec.get_feature_names_out()
else:
    version_terms = cvec.get_feature_names()

Version_train = pd.DataFrame(cvec.transform(X_train).toarray(), columns=version_terms)
Version_test = pd.DataFrame(cvec.transform(X_test).toarray(), columns=version_terms)

In [None]:
# Encode the thumbs-up count of each review as a categorical token.
X = Rappi['thumbsUpCount'].apply(str)
y = Rappi['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Accept one-character tokens: the default token_pattern requires at least
# two word characters, so every count from 0 to 9 was dropped and those
# reviews got an all-zero row.
cvec = CountVectorizer(max_features=1000, token_pattern=r'(?u)\b\w+\b').fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    thumbs_terms = cvec.get_feature_names_out()
else:
    thumbs_terms = cvec.get_feature_names()

Thumbs_train = pd.DataFrame(cvec.transform(X_train).toarray(), columns=thumbs_terms)
Thumbs_test = pd.DataFrame(cvec.transform(X_test).toarray(), columns=thumbs_terms)

In [None]:
# Encode the sentiment label of each review.
X = Rappi['Sent'].apply(str)
y = Rappi['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cvec = CountVectorizer(max_features=1000).fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    sent_terms = cvec.get_feature_names_out()
else:
    sent_terms = cvec.get_feature_names()

Sent_train = pd.DataFrame(cvec.transform(X_train).toarray(), columns=sent_terms)
Sent_test = pd.DataFrame(cvec.transform(X_test).toarray(), columns=sent_terms)

In [None]:
# Encode the review timestamp via its string form.
# NOTE(review): the default tokenizer splits that string into separate
# numeric pieces, so date and time components presumably share columns -
# confirm this is the intended encoding for 'at'.
X = Rappi['at'].apply(str)
y = Rappi['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cvec = CountVectorizer(max_features=1000).fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    at_terms = cvec.get_feature_names_out()
else:
    at_terms = cvec.get_feature_names()

at_train = pd.DataFrame(cvec.transform(X_train).toarray(), columns=at_terms)
at_test = pd.DataFrame(cvec.transform(X_test).toarray(), columns=at_terms)

In [None]:
# Stack the per-column feature frames side by side into the final
# train/test matrices, then show the shapes of features and targets.
train = pd.concat(
    [df_train, Sent_train, Thumbs_train, Version_train, at_train],
    axis=1,
)
test = pd.concat(
    [df_test, Sent_test, Thumbs_test, Version_test, at_test],
    axis=1,
)
print(train.shape)
print(test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Random Forest on the combined feature set. random_state makes the forest
# (and therefore the printed scores) reproducible between runs.
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=42)
forest = forest.fit(train, y_train)
print(forest)
# Cross-validate on the training split. The previous call cross-validated on
# the test split, which refits fresh copies on test folds and never
# evaluates the model fitted above.
print(np.mean(cross_val_score(forest, train, y_train, cv=10)))
# Accuracy of the fitted model on the held-out test split.
print(forest.score(test, y_test))

In [None]:
# Logistic regression on the combined feature set.
LogReg = LogisticRegression(max_iter=10000)

LogReg = LogReg.fit(train, y_train)

print(LogReg)

# Cross-validate on the training split. The previous call cross-validated on
# the test split, which refits fresh copies on test folds and never
# evaluates the model fitted above.
print(np.mean(cross_val_score(LogReg, train, y_train, cv=10)))
# Accuracy of the fitted model on the held-out test split.
print(LogReg.score(test, y_test))