In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import string
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import math
import numpy as np

In [None]:
# Fetch the NLTK stop-word corpus; NLTK reports "already up-to-date" when it is present locally.
nltk.download('stopwords')

# English stop words held in a set so the membership test in process_text is cheap.
stop_words = {*stopwords.words('english')}

# Maps every ASCII punctuation character to a space. Replacing (rather than
# deleting) punctuation keeps "room.The" as two words and splits contractions
# such as "don't" into "don" + "t", which is the form the NLTK stop-word list
# contains.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# A token is kept only if it consists solely of lowercase ASCII letters.
_ASCII_WORD = re.compile(r'[a-z]+')


def process_text(text):
    """Normalise a raw review into a space-separated string of content words.

    Steps: lowercase, turn punctuation into spaces, split on whitespace, then
    keep only tokens made purely of a-z that are not in the module-level
    ``stop_words`` set.

    Args:
        text: The raw review. Anything that is not a ``str`` (e.g. NaN from
            pandas) is treated as missing.

    Returns:
        The cleaned text, or ``''`` when the input is not a string or no token
        survives the filtering.
    """
    if not isinstance(text, str):
        return ''

    tokens = text.lower().translate(_PUNCT_TO_SPACE).split()
    # The regex check comes first so non-alphabetic tokens are rejected before
    # the stop-word lookup.
    return ' '.join(
        word for word in tokens
        if _ASCII_WORD.fullmatch(word) and word not in stop_words
    )

df = pd.read_csv('Hotel_Reviews.csv')

# Clean every review with process_text (non-string entries become '')
df['reviews.text'] = df['reviews.text'].apply(process_text)

# Drop rows whose cleaned text is empty, then renumber the rows from 0
df = df.loc[df['reviews.text'].str.strip().ne('')].reset_index(drop=True)

# Persist the cleaned dataset for the following cells
df.to_csv('Processed_Hotel_Reviews.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the preprocessed reviews
file_path = 'Processed_Hotel_Reviews.csv'
df = pd.read_csv(file_path)

# Split the dataset into training, validation and test sets (60% / 20% / 20%):
# 40% is held out first, then that hold-out is split in half.
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Tag every row with its partition. assign() returns new frames, so nothing is
# written into the objects returned by train_test_split (assigning a column on
# those directly is what triggers pandas' SettingWithCopyWarning).
train_df = train_df.assign(partition='train')
val_df = val_df.assign(partition='val')
test_df = test_df.assign(partition='test')

In [None]:
# Stack the partitions in train -> val -> test order
combined_df = pd.concat([train_df, val_df, test_df], axis=0)

# Write the .tsv file: one review per line as text, partition, rating
# (tab-separated, no header row and no index column)
corpus_file_path = 'corpus.tsv'
corpus_df = combined_df.loc[:, ['reviews.text', 'partition', 'reviews.rating']]
corpus_df.to_csv(corpus_file_path, sep='\t', header=False, index=False)

In [None]:
# Build the vocabulary: count every alphabetic token across all reviews.
# astype(str) guards against non-string cells (e.g. NaN) in the reloaded CSV.
word_counter = Counter(
    word
    for review_text in df['reviews.text'].astype(str)
    for word in review_text.split()
    if word.isalpha()
)

# Keep only words that occur at least 10 times in the whole corpus
vocabulary = {word for word, count in word_counter.items() if count >= 10}

# One word per line, alphabetically sorted
vocabulary_file_path = 'vocabulary.txt'
with open(vocabulary_file_path, 'w') as vocab_file:
    vocab_file.write('\n'.join(sorted(vocabulary)))

# Drop NaN ratings before collecting the labels; ratings are floored so that
# fractional values collapse onto their integer part.
valid_labels = np.floor(combined_df['reviews.rating'].dropna()).unique().tolist()

# Partition boundaries. combined_df (and therefore corpus.tsv) is ordered
# train -> val -> test, so the boundaries are cumulative row counts. The
# DataFrame index cannot be used here: after the shuffled split it holds the
# original row labels, not positions in the corpus.
last_training_doc = len(train_df)
last_validation_doc = last_training_doc + len(val_df)

# Build metadata.json
metadata = {
    "total_documents": len(df),
    "vocabulary_length": len(vocabulary),
    "preprocessing-info": [],
    "labels": sorted(valid_labels),
    # Count the same (floored) labels that are listed above
    "total_labels": len(valid_labels),
    "last-training-doc": last_training_doc,
    "last-validation-doc": last_validation_doc,
}

metadata_file_path = 'metadata.json'
with open(metadata_file_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)