## Remove stopwords e pontuação e converte o texto para minúsculas

In [205]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import string
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import math

In [206]:
# Fetch the NLTK stopword corpus (NLTK reports when it is already present).
nltk.download('stopwords')

# English stopwords held in a set; process_text tests membership against it.
stop_words = {*stopwords.words('english')}

# Built once: the punctuation-deleting table is the same for every review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# A token survives only if it consists solely of lowercase ASCII letters.
_ASCII_WORD = re.compile(r'^[a-z]+$')


def process_text(text):
    """Lowercase a review and reduce it to space-separated content words.

    Each whitespace-separated token is lowercased, has its ASCII punctuation
    removed, and is kept only if the result is made of the letters a-z and is
    not in the module-level ``stop_words`` set.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing cell) is treated as an empty review.

    Returns:
        The surviving words joined by single spaces, or ``''`` when nothing
        survives or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    kept_words = []
    for token in text.lower().split():
        # Check the stopword list while inner apostrophes are still present:
        # NLTK's English list contains contractions such as "don't", which
        # would become "dont" after punctuation removal and slip through.
        # Only surrounding punctuation (quotes, commas, ...) is trimmed here.
        if token.strip(string.punctuation) in stop_words:
            continue

        word = token.translate(_PUNCTUATION_TABLE)
        # The regex also rejects empty strings, digits and non-ASCII letters,
        # so no separate isalpha() check is needed.
        if word not in stop_words and _ASCII_WORD.match(word):
            kept_words.append(word)

    return ' '.join(kept_words)

# Load the raw reviews and replace the free-text column with its cleaned form.
df = pd.read_csv('files/Hotel_Reviews.csv')
df['reviews.text'] = df['reviews.text'].apply(process_text)

# All other columns (ratings included) are written through unchanged.
# The file is saved under files/, next to the input: the split step reloads it
# from 'files/Processed_Hotel_Reviews.csv', so writing to the working directory
# would leave that step reading a missing or stale file.
df.to_csv('files/Processed_Hotel_Reviews.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [210]:
# Load the preprocessed reviews.
file_path = 'files/Processed_Hotel_Reviews.csv'
df = pd.read_csv(file_path)

# Split 60% / 20% / 20% into training, validation and test sets: 40% is held
# out first and then halved. The fixed seed keeps the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Tag every row with its partition. assign() returns a new frame, so the label
# column is never written into an object pandas may treat as a slice of df
# (assigning in place here can raise SettingWithCopyWarning).
train_df = train_df.assign(partition='train')
val_df = val_df.assign(partition='val')
test_df = test_df.assign(partition='test')

In [211]:
# Stack the partitions in train, validation, test order.
combined_df = pd.concat((train_df, val_df, test_df))

# Write the corpus as a tab-separated file with no header row and no index
# column; each line holds the review text, its partition and its rating.
corpus_file_path = 'files/corpus.tsv'
corpus_columns = ['reviews.text', 'partition', 'reviews.rating']
corpus_df = combined_df[corpus_columns]
corpus_df.to_csv(corpus_file_path, sep='\t', header=False, index=False)

In [212]:
# Count how often each word occurs across all processed reviews.
# pandas reads empty CSV fields back as NaN by default, so reviews that were
# reduced to an empty string are NaN here. They are dropped first: astype(str)
# would otherwise turn each one into the literal token 'nan' and count it as
# a real word.
word_counter = Counter()
for review_text in df['reviews.text'].dropna().astype(str):
    word_counter.update(word for word in review_text.split() if word.isalpha())

# The vocabulary keeps only words that occur at least 10 times.
vocabulary = {word for word, count in word_counter.items() if count >= 10}

# One word per line, sorted alphabetically. The encoding is pinned so the file
# does not depend on the platform's default.
vocabulary_file_path = 'files/vocabulary.txt'
with open(vocabulary_file_path, 'w', encoding='utf-8') as vocab_file:
    vocab_file.write('\n'.join(sorted(vocabulary)))

# Build metadata.json, which describes the corpus written to corpus.tsv.
# NOTE(review): the key names look like the OCTIS custom-dataset layout;
# confirm against whatever consumes this file.

# Distinct rating values. Missing ratings are dropped so this list agrees with
# the label count (nunique() ignores NaN) and no NaN is written to the JSON.
rating_labels = sorted(combined_df['reviews.rating'].dropna().unique().tolist())

# corpus.tsv stores the rows as train, then validation, then test, with no
# index column, so partition boundaries must be row positions in that file.
# The frames' index labels are the shuffled row labels of the original df and
# say nothing about where a partition ends.
last_training_doc = len(train_df)
last_validation_doc = last_training_doc + len(val_df)

metadata = {
    "total_documents": len(df),
    "vocabulary_length": len(vocabulary),
    "preprocessing-info": [],
    "labels": rating_labels,
    "total_labels": len(rating_labels),
    "last-training-doc": last_training_doc,
    "last-validation-doc": last_validation_doc,
}

metadata_file_path = 'files/metadata.json'
with open(metadata_file_path, 'w', encoding='utf-8') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)