# Gerando o dataset com as frequências e a nuvem de palavras (wordcloud)

Ambos podem ser gerados pelo próprio dashboard, mas achei menos complexo, e de melhor visualização, gerá-los aqui e exibir um resultado estático no dashboard.

[Frequency](https://amueller.github.io/word_cloud/auto_examples/frequency.html)
[WordCloud code](https://amueller.github.io/word_cloud/auto_examples/frequency.html)

In [None]:
!pip install multidict



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from wordcloud import WordCloud

Por uma decisão de negócio, os datasets serão separados em avaliações positivas e negativas, para facilitar a visualização pelos gestores.

In [None]:
# Load the complete review dataset and split it by the `label` column:
# rows with label 0 go to the negative subset, rows with label 1 to the
# positive subset.
df_reviews = pd.read_csv(
    "drive/My Drive/Colab Notebooks/TCC/dataset_completed.csv"
)
df_negative_reviews = df_reviews[df_reviews["label"].eq(0)]
df_positive_reviews = df_reviews[df_reviews["label"].eq(1)]

[TfidfVectorizer](https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html)
[TfidfVectorizer Docs](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
def get_terms_frequencies(dataset, top_n):
    """Return wordcloud weights for the ``top_n`` most common terms.

    A ``TfidfVectorizer`` (unigrams, ASCII accent stripping, custom stop
    words) is fitted on ``dataset["tokens"]`` and the ``top_n`` terms with the
    lowest IDF are selected. With scikit-learn's IDF formula a lower IDF means
    the term occurs in more documents.

    IDF shrinks as a term gets more common, while a wordcloud draws larger
    weights bigger. The selected IDF values are therefore handed out in
    reverse rank order: the most common term receives the largest IDF of the
    selection, the least common one the smallest. The returned numbers are
    rank-based display weights, NOT each term's own IDF value.

    Args:
        dataset: Object indexable by ``"tokens"`` (e.g. a DataFrame) yielding
            an iterable of text documents.
            NOTE(review): presumably one pre-tokenised string per review --
            confirm against the code that produces the dataset.
        top_n: Maximum number of terms to return. Values <= 0 yield an empty
            dict.

    Returns:
        dict mapping term -> weight, ordered from the smallest weight (least
        common selected term) to the largest weight (most common term).
    """
    # A slice such as ``[-0:]`` would return *every* term, so non-positive
    # requests are answered explicitly.
    if top_n <= 0:
        return {}

    # Domain words deliberately removed because, given the context of the
    # reviews, they would be irrelevant for the analysis.
    stop_words = ["universal", "ride", "studios", "park", "day", "year", "month", "time",
                  "great", "good", "bad", "worst", "like"]

    # Only the fitted vocabulary and IDF vector are needed, so the TF-IDF
    # matrix itself is not computed.
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), strip_accents='ascii', stop_words=stop_words)
    vectorizer.fit(dataset["tokens"])

    idf = vectorizer.idf_
    terms = vectorizer.get_feature_names_out()

    # Feature indices ordered by descending IDF; the tail of that ordering
    # holds the lowest-IDF (most common) terms, ending with the most common.
    by_descending_idf = np.argsort(idf)[::-1]
    selected = by_descending_idf[-top_n:]

    # Ascending weights paired with descending-IDF terms: this is the
    # intentional rank reversal described in the docstring.
    weights = np.sort(idf[selected])

    return {terms[i]: weight for i, weight in zip(selected, weights)}

In [None]:
# Compute the term -> weight mapping (top 100 terms) separately for the
# positive and the negative review subsets.
positive_frequency_dict = get_terms_frequencies(
    dataset=df_positive_reviews,
    top_n=100,
)
negative_frequency_dict = get_terms_frequencies(
    dataset=df_negative_reviews,
    top_n=100,
)

In [None]:
# Render one 1000x1000 wordcloud (black background, at most 100 words) for
# each sentiment and write it to a .jpg file.
positive_wc = WordCloud(
    background_color="black", width=1000, height=1000, max_words=100
)
positive_wc.generate_from_frequencies(positive_frequency_dict)
positive_wc.to_file("drive/My Drive/Colab Notebooks/TCC/positive_wordcloud.jpg")

negative_wc = WordCloud(
    background_color="black", width=1000, height=1000, max_words=100
)
negative_wc.generate_from_frequencies(negative_frequency_dict)
negative_wc.to_file("drive/My Drive/Colab Notebooks/TCC/negative_wordcloud.jpg")

<wordcloud.wordcloud.WordCloud at 0x7f2306f9e1d0>

In [None]:
# Turn each term -> weight dict into a one-column dataframe: terms become the
# index and the weights are stored under the 'frequency' column.
df_positive_frequencies = pd.DataFrame.from_dict(
    positive_frequency_dict, orient="index", columns=["frequency"]
)
df_negative_frequencies = pd.DataFrame.from_dict(
    negative_frequency_dict, orient="index", columns=["frequency"]
)

In [None]:
# Persist both frequency dataframes as CSV files. Each file receives its own
# dataframe: the negative file was previously written from the positive
# dataframe, so the negative frequencies were never saved.
df_positive_frequencies.to_csv("drive/My Drive/Colab Notebooks/TCC/positive_frequencies.csv")
df_negative_frequencies.to_csv("drive/My Drive/Colab Notebooks/TCC/negative_frequencies.csv")