In [None]:
import pandas as pd
from nltk.util import ngrams
from collections import Counter

In [None]:
# Data source: Excel workbook file and the name of the sheet to read from it
path = "_Data__QuintoAndar_-_Business_Case__1_.xlsx"
sheet_name = "Tenant reviews given after prop"

In [None]:
# Build the DataFrame from the configured workbook sheet, then display a
# random sample of it as a quick sanity check of the load
dfTenant_Reviews = pd.read_excel(path, sheet_name=sheet_name)
dfTenant_Reviews.sample()

In [None]:
# Remove the "Unnamed: 18" / "Unnamed: 19" columns and rename "Others.1" to
# "Comments".
# errors="ignore" keeps this cell safe to re-run: the frame is reassigned to
# itself, so on a second execution the "Unnamed" columns are already gone and
# a plain drop() would raise KeyError. rename() already ignores missing labels.
dfTenant_Reviews = (
    dfTenant_Reviews
    .drop(columns=["Unnamed: 18", "Unnamed: 19"], errors="ignore")
    .rename(columns={"Others.1": "Comments"})
)
dfTenant_Reviews.sample()

In [None]:

# Comment normalization
def preprocess_comments(comment):
    """Lower-case a comment and strip everything but letters, digits and whitespace.

    Args:
        comment: the raw cell value; may be a string or anything else
            (for example a missing value).

    Returns:
        The lower-cased text with every character that is neither
        alphanumeric nor whitespace removed. Non-string input yields ''.
    """
    # Anything that is not text is treated as "no comment".
    if not isinstance(comment, str):
        return ''
    # Removed characters are deleted, not replaced by a space, so words that
    # were separated only by punctuation end up joined together.
    kept_chars = [ch for ch in comment.lower() if ch.isalnum() or ch.isspace()]
    return ''.join(kept_chars)

# Apply the preprocessing to every comment; the "Comments" column is
# overwritten with the normalized text
dfTenant_Reviews["Comments"] = dfTenant_Reviews["Comments"].apply(preprocess_comments)

# Tokenization into groups of 3 consecutive words
def get_trigrams(comment):
    """Return the 3-grams of a comment's whitespace-separated words.

    Args:
        comment: a string; it is split on whitespace with ``str.split()``.

    Returns:
        A list holding every item produced by ``ngrams(words, 3)``, in the
        order the words appear in the comment.
    """
    return list(ngrams(comment.split(), 3))

# Shared pipeline for one group of reviews. The same five steps were
# previously written out three times (ratings 4-5, ratings 1-2, overall);
# keeping them in one helper guarantees the three tables are built identically.
def _summarize_trigrams(reviews, frequency_label, top_n=10):
    """Count the trigrams in ``reviews["Comments"]`` and tabulate the most frequent.

    Args:
        reviews: DataFrame with a "Comments" column of preprocessed strings.
        frequency_label: header used for the count column of the result table.
        top_n: number of most frequent trigrams to keep.

    Returns:
        A 5-tuple ``(per_comment, flat, counts, top, table)`` where
        ``per_comment`` is one trigram list per comment, ``flat`` is those
        lists concatenated in row order, ``counts`` is a ``Counter`` over
        ``flat``, ``top`` is ``counts.most_common(top_n)`` and ``table`` is
        ``top`` as a DataFrame with columns "Trigram" and ``frequency_label``.
    """
    per_comment = reviews["Comments"].apply(get_trigrams).tolist()
    flat = [trigram for sublist in per_comment for trigram in sublist]
    counts = Counter(flat)
    top = counts.most_common(top_n)
    table = pd.DataFrame(top, columns=["Trigram", frequency_label])
    return per_comment, flat, counts, top, table


# Subsets by rating: 4-5 and 1-2. No separate frame is needed for the overall
# analysis, which runs on the full dfTenant_Reviews.
df_reviews_4_5 = dfTenant_Reviews[dfTenant_Reviews["Review"].isin([4, 5])]
df_reviews_1_2 = dfTenant_Reviews[dfTenant_Reviews["Review"].isin([1, 2])]

# Tokenize, count and rank the trigrams of reviews rated 4 or 5
(
    trigram_list_reviews_4_5,
    trigrams_reviews_4_5,
    trigram_counts_reviews_4_5,
    top_10_trigrams_reviews_4_5,
    df_top_10_trigrams_reviews_4_5,
) = _summarize_trigrams(df_reviews_4_5, "Frequency (4 and 5)")

# Tokenize, count and rank the trigrams of reviews rated 1 or 2
(
    trigram_list_reviews_1_2,
    trigrams_reviews_1_2,
    trigram_counts_reviews_1_2,
    top_10_trigrams_reviews_1_2,
    df_top_10_trigrams_reviews_1_2,
) = _summarize_trigrams(df_reviews_1_2, "Frequency (1 and 2)")

# Tokenize, count and rank the trigrams of all reviews.
# The "overral" spelling is kept on purpose: a later cell reads
# df_top_10_trigrams_overral by that name.
(
    trigram_list_overral,
    trigrams_overral,
    trigram_counts_overral,
    top_10_trigrams_overral,
    df_top_10_trigrams_overral,
) = _summarize_trigrams(dfTenant_Reviews, "Frequency (Overall)")


In [None]:
# 10 most frequent trigrams among reviews rated 4 or 5
df_top_10_trigrams_reviews_4_5

In [None]:
# 10 most frequent trigrams among reviews rated 1 or 2
df_top_10_trigrams_reviews_1_2

In [None]:
# 10 most frequent trigrams across all reviews
df_top_10_trigrams_overral