In [1]:
# instala dependências extras

!pip install torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
# importa as libs
from utils.utils import *
from utils.bert_review_dataset import *

import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline

In [3]:
# Baixar recursos NLTK

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wectornanimefelipe\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wectornanimefelipe\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wectornanimefelipe\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wectornanimefelipe\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# configuração

dataset_uri = './database/DisneylandReviews.csv'
dataset_encoding = 'latin1'

# 'bert-base-uncased' bom para inglês
# 'distilbert-base-uncased' mais leve e mais rápido
#  'distilbert-base-uncased-finetuned-sst-2-english'
pre_trined_model_name = "distilbert-base-uncased"

In [5]:
# importa o dataset
df = pd.read_csv(dataset_uri, encoding=dataset_encoding)

## Análise e Tratamento de Dados (EDA)

In [None]:
# primeiras linhas do dataset
df.head()

In [None]:
# informações da base
df.info()

In [None]:
# verificação de valores nulos
df.isnull().sum()

In [None]:
df['Rating'].value_counts()

## Preparar o Rótulo (Sentimento)

In [6]:
# cria uma coluna chamada 'Sentiment'
df['Sentiment'] = df['Rating'].apply(get_sentiment)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Sentiment
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,Positivo
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,Positivo
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,Positivo
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,Positivo
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,Positivo


In [32]:
# remove os sentimentos neutros
# para facilitar o classificador
df_final = df[df['Sentiment'] != 'Neutro'].copy()

In [37]:
# separa as colunas que queremos trabalhar
df_final = df_final[['Review_Text', 'Sentiment']]
df_final.head()

Unnamed: 0,Review_Text,Sentiment
0,If you've ever been to Disneyland anywhere you...,Positivo
1,Its been a while since d last time we visit HK...,Positivo
2,Thanks God it wasn t too hot or too humid wh...,Positivo
3,HK Disneyland is a great compact park. Unfortu...,Positivo
4,"the location is not in the city, took around 1...",Positivo


## Pré-processamento de Texto

In [9]:
# Inicializa o lematizador e stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [10]:
# Inicializa a função responsável por fazer o pré-processamento de texto

def preprocess_text(text):
    # Converte para minúsculas
    text = text.lower()
    # Remove tudo que não for letra
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokeniza
    tokens = text.split()
    # tokens = word_tokenize(text) # o recurso punkt esta dando erro
    # Remove stopwords e lemmatiza
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)



In [36]:
# Aplica o pré-processamento na coluna 'Review_Text'
df_final['Processed_Review'] = df_final['Review_Text'].apply(preprocess_text)

df_final[['Review_Text', 'Processed_Review']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Processed_Review'] = df_final['Review_Text'].apply(preprocess_text)


Unnamed: 0,Review_Text,Processed_Review
0,If you've ever been to Disneyland anywhere you...,youve ever disneyland anywhere youll find disn...
1,Its been a while since d last time we visit HK...,since last time visit hk disneyland yet time s...
2,Thanks God it wasn t too hot or too humid wh...,thanks god hot humid visiting park otherwise w...
3,HK Disneyland is a great compact park. Unfortu...,hk disneyland great compact park unfortunately...
4,"the location is not in the city, took around 1...",location city took around hour kowlon kid like...


## Extração de Features

### Modelos Supervisionados Tradicionais (com TF-IDF)

In [None]:
# Dividir os dados em treino e teste
X = df_final['Processed_Review']
y = df_final['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# usando TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Shape X_train_tfidf:", X_train_tfidf.shape)
print("Shape X_test_tfidf:", X_test_tfidf.shape)

### Modelo Pré-treinado (BERT)



In [14]:
# Mapear rótulos para IDs (BERT espera IDs numéricos)
label_map = {'Negativo': 0, 'Positivo': 1}

# Dividir treino e teste
texts = df_final['Processed_Review'].tolist()
labels = df_final['Sentiment'].map(label_map).tolist()

X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

In [15]:
# Carregar Tokenizer e Modelo

tokenizer = AutoTokenizer.from_pretrained(pre_trined_model_name)
model = AutoModelForSequenceClassification.from_pretrained(pre_trined_model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenização

train_encodings = tokenizer(X_train_bert, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test_bert, truncation=True, padding=True, max_length=128)

## Modelagem e Classificação

In [None]:
# listas com as métricas

accuracy = {}
precision = {}
recall = {}
f1 = {}
confusion = {}

### Modelos Supervisionados Tradicionais (com TF-IDF)

In [None]:
# MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(100), max_iter=500, random_state=42)
mlp_model.fit(X_train_tfidf, y_train)
y_pred_mlp = mlp_model.predict(X_test_tfidf)

accuracy['mlp'] = accuracy_score(y_test, y_pred_mlp)
precision['mlp'] = precision_score(y_test, y_pred_mlp, average='weighted')
recall['mlp'] = recall_score(y_test, y_pred_mlp, average='weighted')
f1['mlp'] = f1_score(y_test, y_pred_mlp, average='weighted')
confusion['mlp'] = confusion_matrix(y_test, y_pred_mlp)

In [None]:
# SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

accuracy['svm'] = accuracy_score(y_test, y_pred_svm)
precision['svm'] = precision_score(y_test, y_pred_svm, average='weighted')
recall['svm'] = recall_score(y_test, y_pred_svm, average='weighted')
f1['svm'] = f1_score(y_test, y_pred_svm, average='weighted')
confusion['svm'] = confusion_matrix(y_test, y_pred_svm)

### Modelo Pré-treinado (BERT)



In [40]:
# Criação do dataset PyTorch

train_df = pd.DataFrame({
    'text': X_train_bert,
    'label': y_train_bert
})

train_dataset = Dataset.from_pandas(train_df)
test_dataset = ReviewDataset(test_encodings, y_test_bert)

In [27]:
# Argumentos de treinamento
bert_training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
)

In [28]:
# Treinador
bert_trainer = Trainer(
    model=model,
    args=bert_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [38]:
bert_trainer.train()

ValueError: You have to specify either input_ids or inputs_embeds