In [27]:
# importa as libs
from utils.utils import *

import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Baixar recursos NLTK

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wectornanime\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wectornanime\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wectornanime\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Wectornanime\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
# configuração

dataset_uri = './database/DisneylandReviews.csv'
dataset_encoding = 'latin1'

# 'bert-base-uncased' bom para inglês
# 'distilbert-base-uncased' mais leve e mais rápido
pre_trined_model_name = "distilbert-base-uncased"

In [4]:
# importa o dataset
df = pd.read_csv(dataset_uri, encoding=dataset_encoding)

## Análise e Tratamento de Dados (EDA)

In [5]:
# primeiras linhas do dataset
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [6]:
# informações da base
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


In [7]:
# verificação de valores nulos
df.isnull().sum()

Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64

In [8]:
df['Rating'].value_counts()

Rating
5    23146
4    10775
3     5109
2     2127
1     1499
Name: count, dtype: int64

## Preparar o Rótulo (Sentimento)

In [9]:
# cria uma coluna chamada 'Sentiment'
df['Sentiment'] = df['Rating'].apply(get_sentiment)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Sentiment
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,Positivo
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,Positivo
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,Positivo
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,Positivo
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,Positivo


In [10]:
# remove os sentimentos neutros
# para facilitar o classificador
df_final = df[df['Sentiment'] != 'Neutro'].copy()

In [11]:
# separa as colunas que queremos trabalhar
df_final = df_final[['Review_Text', 'Sentiment']]
df_final.head()

Unnamed: 0,Review_Text,Sentiment
0,If you've ever been to Disneyland anywhere you...,Positivo
1,Its been a while since d last time we visit HK...,Positivo
2,Thanks God it wasn t too hot or too humid wh...,Positivo
3,HK Disneyland is a great compact park. Unfortu...,Positivo
4,"the location is not in the city, took around 1...",Positivo


## Pré-processamento de Texto

In [12]:
# Inicializa o lematizador e stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [13]:
# Inicializa a função responsável por fazer o pré-processamento de texto

def preprocess_text(text):
    # Converte para minúsculas
    text = text.lower()
    # Remove tudo que não for letra
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokeniza
    tokens = text.split()
    # tokens = word_tokenize(text) # o recurso punkt esta dando erro
    # Remove stopwords e lemmatiza
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)



In [14]:
# Aplica o pré-processamento na coluna 'Review_Text'
df_final['Processed_Review'] = df_final['Review_Text'].apply(preprocess_text)

df_final[['Review_Text', 'Processed_Review']].head()

Unnamed: 0,Review_Text,Processed_Review
0,If you've ever been to Disneyland anywhere you...,youve ever disneyland anywhere youll find disn...
1,Its been a while since d last time we visit HK...,since last time visit hk disneyland yet time s...
2,Thanks God it wasn t too hot or too humid wh...,thanks god hot humid visiting park otherwise w...
3,HK Disneyland is a great compact park. Unfortu...,hk disneyland great compact park unfortunately...
4,"the location is not in the city, took around 1...",location city took around hour kowlon kid like...


## Extração de Features

In [15]:
# Dividir os dados em treino e teste
X = df_final['Processed_Review']
y = df_final['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
# usando TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Shape X_train_tfidf:", X_train_tfidf.shape)
print("Shape X_test_tfidf:", X_test_tfidf.shape)

Shape X_train_tfidf: (30037, 5000)
Shape X_test_tfidf: (7510, 5000)


## Modelagem e Classificação

In [22]:
# listas com as métricas

accuracy = {}
precision = {}
recall = {}
f1 = {}
confusion = {}

### Modelos Supervisionados Tradicionais (com TF-IDF)

In [23]:
# MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_model.fit(X_train_tfidf, y_train)
y_pred_mlp = mlp_model.predict(X_test_tfidf)

accuracy['mlp'] = accuracy_score(y_test, y_pred_mlp)
precision['mlp'] = precision_score(y_test, y_pred_mlp, average='weighted')
recall['mlp'] = recall_score(y_test, y_pred_mlp, average='weighted')
f1['mlp'] = f1_score(y_test, y_pred_mlp, average='weighted')
confusion['mlp'] = confusion_matrix(y_test, y_pred_mlp)

In [24]:
# SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

accuracy['svm'] = accuracy_score(y_test, y_pred_svm)
precision['svm'] = precision_score(y_test, y_pred_svm, average='weighted')
recall['svm'] = recall_score(y_test, y_pred_svm, average='weighted')
f1['svm'] = f1_score(y_test, y_pred_svm, average='weighted')
confusion['svm'] = confusion_matrix(y_test, y_pred_svm)

### Modelo Pré-treinado (BERT)



In [29]:
# Mapear rótulos para IDs (BERT espera IDs numéricos)
label_map = {'Negativo': 0, 'Positivo': 1}
id_map = {0: 'Negativo', 1: 'Positivo'}
df_final['labels'] = df_final['Sentiment'].map(label_map)

# Carregar Tokenizer e Modelo
tokenizer = AutoTokenizer.from_pretrained(pre_trined_model_name)
model = AutoModelForSequenceClassification.from_pretrained(pre_trined_model_name, num_labels=len(label_map))

# Preparar um dataset no formato da biblioteca `datasets`
# `datasets` é uma biblioteca otimizada para trabalhar com grandes volumes de texto para modelos Transformer
data_dict = {
    'text': df_final['Review_Text'].tolist(),
    'labels': df_final['labels'].tolist()
}
full_dataset = Dataset.from_dict(data_dict)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Dividir o dataset em treino e teste (usando o split do datasets para BERT)
train_test_split = full_dataset.train_test_split(test_size=0.2, stratify_by_column="labels", seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

ValueError: Stratifying by column is only supported for ClassLabel column, and column labels is Value.