# Etapa de preprocesado de texto

## 1 Ipmports y librerias

In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## 2. Descarga

In [2]:
data = pd.read_csv("sampled_video_games_reviews.csv")
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentimiento
0,A26OSCHX5T9G0B,B000FKBCX4,S. Daylor,"[3, 6]","I really wanted to buy this game, but since he...",1,DRM is evil,1220918400,"09 9, 2008",Negativa
1,A26Y3GDR2X0V6V,B002SU4QG4,Matthew Eland,"[15, 21]",I have never played a Bethesda Softworks game ...,1,Terrible Customer Support,1288310400,"10 29, 2010",Negativa
2,A1ZNHMSPJ6Y8BQ,B0013B5NCK,Gaming Mama,"[0, 0]",I thought the idea of this game sounded fun......,1,Snooooooze fest!!!!,1344643200,"08 11, 2012",Negativa
3,A2YPWCPDVXQWDU,B00C1ZBFTW,Victor J. Godfrey,"[2, 25]",This is absolutely the worst game I have ever ...,1,What is wrong with the once greatly respected ...,1381363200,"10 10, 2013",Negativa
4,ABX6LJM70TD2E,B001CLYL1K,TELEESA SMITH,"[0, 1]",BOUGHT AS GIFT FOR MY BROTHER HE HAD PREVIOUSL...,1,WOLFENSTEIN FOR COMPUTER,1360627200,"02 12, 2013",Negativa


## 3. Función de preprocesamiento

In [3]:
def preprocess_text(text):
    if pd.isnull(text):
        return ""

    # Eliminar etiquetas HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # Normalizar codificación a UTF-8 y eliminar acentos/dieresis
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Transformar a minúsculas
    text = text.lower()

    # Eliminar caracteres que no sean letras o números
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Tokenizar y eliminar stopwords
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords.words('english')]

    return " ".join(tokens)

## 4. Aplicar preoprocesamiento

In [4]:
data['reviewText_clean'] = data['reviewText'].apply(preprocess_text)

In [5]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentimiento,reviewText_clean
0,A26OSCHX5T9G0B,B000FKBCX4,S. Daylor,"[3, 6]","I really wanted to buy this game, but since he...",1,DRM is evil,1220918400,"09 9, 2008",Negativa,really wanted buy game since hearing evil drm ...
1,A26Y3GDR2X0V6V,B002SU4QG4,Matthew Eland,"[15, 21]",I have never played a Bethesda Softworks game ...,1,Terrible Customer Support,1288310400,"10 29, 2010",Negativa,never played bethesda softworks game result ho...
2,A1ZNHMSPJ6Y8BQ,B0013B5NCK,Gaming Mama,"[0, 0]",I thought the idea of this game sounded fun......,1,Snooooooze fest!!!!,1344643200,"08 11, 2012",Negativa,thought idea game sounded funtaking toy store ...
3,A2YPWCPDVXQWDU,B00C1ZBFTW,Victor J. Godfrey,"[2, 25]",This is absolutely the worst game I have ever ...,1,What is wrong with the once greatly respected ...,1381363200,"10 10, 2013",Negativa,absolutely worst game ever played even called ...
4,ABX6LJM70TD2E,B001CLYL1K,TELEESA SMITH,"[0, 1]",BOUGHT AS GIFT FOR MY BROTHER HE HAD PREVIOUSL...,1,WOLFENSTEIN FOR COMPUTER,1360627200,"02 12, 2013",Negativa,bought gift brother previously one could never...


## 5. Overall a respuesta binaria

In [6]:
def preprocess_target(rating):
    if rating < 4:
        return 0  # Negativa
    else:
        return 1  # Positiva

## 6. Aplicar respuesta binaria

In [7]:
data['target'] = data['overall'].apply(preprocess_target)

In [8]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentimiento,reviewText_clean,target
0,A26OSCHX5T9G0B,B000FKBCX4,S. Daylor,"[3, 6]","I really wanted to buy this game, but since he...",1,DRM is evil,1220918400,"09 9, 2008",Negativa,really wanted buy game since hearing evil drm ...,0
1,A26Y3GDR2X0V6V,B002SU4QG4,Matthew Eland,"[15, 21]",I have never played a Bethesda Softworks game ...,1,Terrible Customer Support,1288310400,"10 29, 2010",Negativa,never played bethesda softworks game result ho...,0
2,A1ZNHMSPJ6Y8BQ,B0013B5NCK,Gaming Mama,"[0, 0]",I thought the idea of this game sounded fun......,1,Snooooooze fest!!!!,1344643200,"08 11, 2012",Negativa,thought idea game sounded funtaking toy store ...,0
3,A2YPWCPDVXQWDU,B00C1ZBFTW,Victor J. Godfrey,"[2, 25]",This is absolutely the worst game I have ever ...,1,What is wrong with the once greatly respected ...,1381363200,"10 10, 2013",Negativa,absolutely worst game ever played even called ...,0
4,ABX6LJM70TD2E,B001CLYL1K,TELEESA SMITH,"[0, 1]",BOUGHT AS GIFT FOR MY BROTHER HE HAD PREVIOUSL...,1,WOLFENSTEIN FOR COMPUTER,1360627200,"02 12, 2013",Negativa,bought gift brother previously one could never...,0


## 7. Guardar prepocesamiento

In [9]:
processed_data = data[['reviewText_clean', 'target']]
processed_data.to_csv('processed_video_games_reviews.csv', index=False)

## 8. Comentarios

Se ha realizado un preprocesamiento exhaustivo del texto para garantizar:
 - Eliminación de ruido HTML
 - Normalización de caracteres
 - Limpieza de puntuación
 - Eliminación de stopwords
 - El target ha sido binarizado para adecuarlo a un problema de clasificación supervisada binaria (positivo/negativo)

El corpus ha quedado preparado para ser vectorizado y utilizado en modelos de clasificación.
Esta limpieza asegura que los algoritmos de Machine Learning trabajen sobre datos estructurados, homogéneos y libres de ruido innecesario.

Si dispusiéramos de más tiempo se me ocurre lo siguiente:
 - Aplicar lematización para mejorar la agrupación semántica de palabras (por ejemplo, "play", "playing", "played").
 - Realizar detección y eliminación de reviews duplicadas o extremadamente cortas.
 - Analizar la conservación selectiva de ciertas stopwords relevantes en el dominio de videojuegos (como "to" en "p2w" u "op" como "over powered").
 - Incorporar detección de expresiones negadas, como "not good", que invierten el sentimiento.
 - Expandir el corpus con datos de otras categorías relacionadas (e.g., accesorios de videojuegos) para mejorar la robustez del modelo.
 - Revisión y corrección de ortografía.
