# Modelo de Machine Learning para el sistema de analisis de sentimientos

#### Importar todas las librerias a utilizar

In [1]:
import re

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import spacy
from transformers import DistilBertTokenizer,DistilBertForSequenceClassification

import torch
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nlp = spacy.load('en_core_web_sm')





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rhamer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rhamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Carga de datos

In [2]:
# Cargar dataset de reviews
reviews = pd.read_parquet('reviews_final.parquet')
reviews

Unnamed: 0,text,gmap_id,fecha
0,love there korean rice cake,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2014-03-16 05:10:15
1,good very good,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2014-02-21 00:29:23
2,they make korean traditional food very properly,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-01-30 19:38:55
3,short ribs are very delicious,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2019-11-30 22:23:42
4,great food and prices the portions are large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-07-15 13:11:12
...,...,...,...
281037,this place took over osaka ramen we tried the ...,0x808fe955b0beae57:0xb3159fe6572670c3,2017-07-02 19:41:03
281038,delicious ramen clean dinning room and good se...,0x808fe955b0beae57:0xb3159fe6572670c3,2020-11-05 01:30:44
281039,rich broth soft meat and fresh noodles,0x808fe955b0beae57:0xb3159fe6572670c3,2021-09-12 06:52:46
281040,best food ever,0x808fe955b0beae57:0xb3159fe6572670c3,2021-11-17 01:27:55


## preprocesamiento de textos

In [3]:

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Tokenizar las reseñas
# encoding = tokenizer(reviews['text'].tolist(), truncation=True, padding=True, return_tensors='pt', max_length=512)

# input_ids = encoding['input_ids']
# attention_masks = encoding['attention_mask']

# # Crear tensores de datos
# data = TensorDataset(input_ids, attention_masks)

# # Crear DataLoader
# batch_size = 16
# dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

#### Lematizacion y tokenizacion de las reviews para ser analizados por el modelo

<p>Utilizando la libreria nltk, nlp y torch se realiza el preprocesamiento de los datos incluidos en la review</p>

In [4]:
# Lista de abreviaturas específicas a eliminar.
abbreviations = {
    'asap', 'btw', 'brb', 'idk', 'imo', 'imho', 'lol', 'omg', 'tbh', 'ttyl', 'smh', 'np', 'ftw', 'fyi', 'bff',
    'lmao', 'rofl', 'xoxo', 'afaik', 'fml', 'gtg', 'hmu', 'icymi', 'ikr', 'ily', 'jk', 'nvm', 'roflmao', 'smh',
    'stfu', 'tldr', 'wtf', 'yw', 'bday', 'gr8', 'thx', 'pls', 'u', 'ur', 'yolo','xlent', 'xclnt', 'xlnt','xlbs', 'asap', 'btw', 'xd'
}

# Definir las stop_words.
stop_words = set(stopwords.words('english'))
stop_words.update(abbreviations)

# Funcion para remover las stop_words.
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

# funcion para limpiar las reviews.
def clean_review(review):
    # Eliminar stopwords.
    review = remove_stopwords(review)
    # Eliminar palabras con caracteres repetitivos (ej: "aaaa", "ahh").
    review = ' '.join([word for word in review.split() if not re.search(r'(.)\1{2,}', word)])
    return review

# funcion para la lematizacion de las reviews.
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = [token.lemma_ for token in doc]
    return ' '.join(lemmatized_text)



In [5]:
# Aplicar la lematización y la limpieza
reviews['text'] = reviews['text'].apply(lemmatize_text)

In [6]:
reviews['text'] = reviews['text'].apply(clean_review)

In [7]:
# Filtrar reseñas con menos de min_word_count palabras
min_word_count = 5
filtered_reviews = reviews[reviews['text'].apply(lambda x: len(x.split()) >= min_word_count)].reset_index(drop=True)


In [8]:
reviews

Unnamed: 0,text,gmap_id,fecha
0,love korean rice cake,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2014-03-16 05:10:15
1,good good,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2014-02-21 00:29:23
2,make korean traditional food properly,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-01-30 19:38:55
3,short rib delicious,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2019-11-30 22:23:42
4,great food price portion large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-07-15 13:11:12
...,...,...,...
281037,place take osaka raman try black garlic raman ...,0x808fe955b0beae57:0xb3159fe6572670c3,2017-07-02 19:41:03
281038,delicious raman clean din room good service,0x808fe955b0beae57:0xb3159fe6572670c3,2020-11-05 01:30:44
281039,rich broth soft meat fresh noodle,0x808fe955b0beae57:0xb3159fe6572670c3,2021-09-12 06:52:46
281040,good food ever,0x808fe955b0beae57:0xb3159fe6572670c3,2021-11-17 01:27:55


### Crear y cargar el modelo preentrenado de DistilBert para la clasificacion de los reviews

In [9]:
# Descargar y cargar el modelo preentrenado para análisis de sentimientos
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Tokenizar las reseñas
encodings = tokenizer(list(filtered_reviews['text']), truncation=True, padding=True, max_length=128)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [10]:
# Crear el DataLoader
input_ids = torch.tensor(encodings['input_ids'])
attention_mask = torch.tensor(encodings['attention_mask'])
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

# Mover el modelo a la GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Crear funcion y cargar resultado en el dataframe en una nueva columna llamada sentiments

In [11]:
# Funcion para analizar el texto.
def predict_sentiment(dataloader, model, device):
    model.eval()
    sentiments = []

    for batch in dataloader:
        batch_input_ids, batch_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).flatten()
        sentiments.extend(predictions.cpu().numpy())
    
    return sentiments

# Realizar predicciones
sentiments = predict_sentiment(dataloader, model, device)

# Añadir las predicciones al DataFrame
filtered_reviews['sentiment'] = ['positive' if sentiment == 1 else 'negative' for sentiment in sentiments]

# Mostrar las primeras filas del DataFrame con las predicciones
filtered_reviews.head()

Unnamed: 0,text,gmap_id,fecha,sentiment
0,make korean traditional food properly,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-01-30 19:38:55,positive
1,great food price portion large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-07-15 13:11:12,positive
2,chicken sandwich delicious definitely twist fl...,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,2013-12-21 05:26:13,positive
3,love place fry garlic chicken crispy savory al...,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,2022-09-20 07:51:08,positive
4,delicious variety food good place go either qu...,0x80c2d765f8c90a3d:0x16afb75943e7ad50,2013-06-06 18:41:37,positive


### Crear archivo de tipo parquet y crear tabla en base de datos.

In [13]:
filtered_reviews.to_parquet('review_final_final.parquet', index=False)

In [14]:
filtered_reviews

Unnamed: 0,text,gmap_id,fecha,sentiment
0,make korean traditional food properly,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-01-30 19:38:55,positive
1,great food price portion large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,2016-07-15 13:11:12,positive
2,chicken sandwich delicious definitely twist fl...,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,2013-12-21 05:26:13,positive
3,love place fry garlic chicken crispy savory al...,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,2022-09-20 07:51:08,positive
4,delicious variety food good place go either qu...,0x80c2d765f8c90a3d:0x16afb75943e7ad50,2013-06-06 18:41:37,positive
...,...,...,...,...
169904,maybe order delivery noodle hard eat soup room...,0x808fe955b0beae57:0xb3159fe6572670c3,2014-09-04 00:38:44,positive
169905,great food staff kind gentleman help tonight g...,0x808fe955b0beae57:0xb3159fe6572670c3,2018-06-05 03:31:51,positive
169906,place take osaka raman try black garlic raman ...,0x808fe955b0beae57:0xb3159fe6572670c3,2017-07-02 19:41:03,negative
169907,delicious raman clean din room good service,0x808fe955b0beae57:0xb3159fe6572670c3,2020-11-05 01:30:44,positive


In [12]:


# labels = [1, 0]  # 1 = positiva, 0 = negativa


# # Dividir los datos
# train_texts, val_texts, train_labels, val_labels = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# # Crear el dataset
# class ReviewDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item

#     def __len__(self):
#         return len(self.labels)

# train_dataset = ReviewDataset(train_encodings, train_labels)
# val_dataset = ReviewDataset(val_encodings, val_labels)

# # Configurar y entrenar el modelo
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# training_args = TrainingArguments(
#     output_dir='./results',          
#     num_train_epochs=3,              
#     per_device_train_batch_size=16,  
#     per_device_eval_batch_size=16,   
#     warmup_steps=500,                
#     weight_decay=0.01,               
#     logging_dir='./logs',            
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,                         
#     args=training_args,                  
#     train_dataset=train_dataset,         
#     eval_dataset=val_dataset             
# )

# trainer.train()