In [9]:
# Librerías

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import unicodedata

In [10]:
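# Recursos de NLTK que usa este notebook (tokenizador, stopwords y lematizador).
# Descomentar y ejecutar una sola vez en un entorno nuevo:
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')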

# Carga de corpus

In [11]:
# Load the corpus from the CSV file and preview its first rows
df_corpus = pd.read_csv('df_corpus.csv')
df_corpus.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,categoría,sentimiento,tokens,cleaned_text,review_length
0,A2VGK9S4DKTMF,B002V9X78U,BWallace,"[1, 1]",Let's be clear; I love how much I hate this pr...,1,"Lame, even for a white elephant gift",1386720000,2013-12-11,beauty,0,"['lets', 'clear', 'love', 'much', 'hate', 'pro...",lets clear love much hate product bought white...,121
1,A1N2EONG2Y6NUZ,B0001EL5R2,Shana,"[0, 0]","This made my skin so greasy and shiny, in addi...",1,Hello greasy skin!,1373587200,2013-07-12,beauty,0,"['made', 'skin', 'greasy', 'shiny', 'addition'...",made skin greasy shiny addition helping acne d...,23
2,ANWZD7ZYE50UE,B005MZS03C,Gilberto Prieto,"[0, 5]",i receive the perfume today and they are fake ...,1,FAKE!!!,1379894400,2013-09-23,beauty,0,"['receive', 'perfume', 'today', 'fake', 'also'...",receive perfume today fake also bought gucci g...,40
3,A20RM3LL5IW5JO,B003UZ4R24,Oulzo,"[2, 7]",I got excited after seeing the multiple videos...,1,"Do not buy, the mask is full of alcohol!!!!!!!",1389398400,2014-01-11,beauty,0,"['got', 'excited', 'seeing', 'multiple', 'vide...",got excited seeing multiple videos youtube hea...,237
4,A3E3GD3TABXKU1,B0017TZD7S,Loren w Christensen,"[2, 5]","Maybe I don't get the point with this. Okay, t...",2,don't get its purpose,1215734400,2008-07-11,beauty,0,"['maybe', 'dont', 'get', 'point', 'okay', 'use...",maybe dont get point okay use sex sell get sup...,165


In [12]:
# Dimensions of the freshly loaded corpus as (rows, columns)
df_corpus.shape

(30000, 14)

In [13]:
# Column dtypes and non-null counts, used to spot missing values
df_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewerID      30000 non-null  object
 1   asin            30000 non-null  object
 2   reviewerName    29738 non-null  object
 3   helpful         30000 non-null  object
 4   reviewText      29983 non-null  object
 5   overall         30000 non-null  int64 
 6   summary         30000 non-null  object
 7   unixReviewTime  30000 non-null  int64 
 8   reviewTime      30000 non-null  object
 9   categoría       30000 non-null  object
 10  sentimiento     30000 non-null  int64 
 11  tokens          30000 non-null  object
 12  cleaned_text    29983 non-null  object
 13  review_length   30000 non-null  int64 
dtypes: int64(4), object(10)
memory usage: 3.2+ MB


In [14]:
# Vemos que hay valores nulos en "reviewerName" (262), "reviewText" (17) y "cleaned_text" (17), por lo que vamos a tratarlos

# Valores nulos

In [15]:
# Only a few rows lack "reviewText", so they are dropped.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below act on it directly instead of on a possible view of the
# original (which can raise SettingWithCopyWarning on pandas versions without
# copy-on-write).
df_corpus = df_corpus.dropna(subset=['reviewText']).copy()

# Fill the remaining nulls in the other text columns with an empty string
df_corpus['reviewerName'] = df_corpus['reviewerName'].fillna('')
df_corpus['cleaned_text'] = df_corpus['cleaned_text'].fillna('')

# Preprocesado

In [16]:
# Helper that strips accents (diacritical marks)
def remove_accents(text):
    """Return ``text`` with all combining marks removed.

    The string is decomposed with Unicode NFKD normalization and every
    character whose ``unicodedata.combining`` class is non-zero is dropped.
    NFKD is a compatibility decomposition, so compatibility characters
    (e.g. ligatures) are also replaced by their decomposed form.

    Args:
        text: The string to process.

    Returns:
        The decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_chars)

# Helper that drops every character that is not a letter
def remove_non_letters(text):
    """Delete everything except ASCII letters and whitespace from ``text``.

    Removed characters are deleted outright, not replaced by a space, so the
    fragments on either side are joined (for example "don't" becomes "dont").

    Args:
        text: The string to process.

    Returns:
        ``text`` containing only ``a-z``, ``A-Z`` and whitespace characters.
    """
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    return non_letter_pattern.sub('', text)

# Helper that removes English stopwords
# The stopword list is loaded lazily on first use and cached as a frozenset, so
# it is read once per kernel instead of once per token, and membership tests
# are constant-time instead of a linear scan of a list.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Args:
        tokens: Iterable of token strings. Comparison is exact, so tokens are
            expected to be in the same case as the stopword list.

    Returns:
        A new list with the non-stopword tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Deferred until the first call so that merely defining the function
        # does not require the stopwords corpus to be available.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in _ENGLISH_STOPWORDS]

# Helper that lemmatizes a list of tokens
def lemmatize_tokens(tokens):
    """Lemmatize every token with a ``WordNetLemmatizer``.

    Each token is passed to ``lemmatize`` on its own, using the lemmatizer's
    default arguments.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one lemmatized token per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

# Main preprocessing function
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps, in order: lowercase, strip accents, remove URLs, remove digits,
    collapse whitespace, remove text in square brackets, remove non-letter
    characters, tokenize, drop stopwords and lemmatize.

    Args:
        text: The raw review text. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing values, or ``None``) is
            treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives.
    """
    # Missing values reach .apply() as NaN/None; return an empty result instead
    # of failing on .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = remove_accents(text)
    # Remove URLs. 'http\S+' already matches 'https...' and the pattern has no
    # ^/$ anchors, so no separate 'https' alternative or MULTILINE flag is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Collapse whitespace runs (including newlines) before the bracket pattern,
    # whose '.' does not match newlines.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # Remove text between square brackets
    text = remove_non_letters(text)
    tokens = word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_tokens(tokens)

    return ' '.join(tokens)

# Run the preprocessing pipeline on every review and store the result in a new column
df_corpus['cleaned_reviewText'] = df_corpus['reviewText'].apply(preprocess_text)

# Show a few of the preprocessed reviews next to the original text
print(df_corpus[['reviewText', 'cleaned_reviewText']].head())

                                          reviewText  \
0  Let's be clear; I love how much I hate this pr...   
1  This made my skin so greasy and shiny, in addi...   
2  i receive the perfume today and they are fake ...   
3  I got excited after seeing the multiple videos...   
4  Maybe I don't get the point with this. Okay, t...   

                                  cleaned_reviewText  
0  let clear love much hate product bought white ...  
1  made skin greasy shiny addition helping acne d...  
2  receive perfume today fake also bought gucci g...  
3  got excited seeing multiple video youtube heav...  
4  maybe dont get point okay use sex sell get sup...  


In [17]:
# El preprocesamiento parece lógico y se ha llevado bien a cabo

Compruebo de nuevo las dimensiones del corpus para verificar que todo está bien y, a continuación, lo guardo para el siguiente notebook.

In [18]:
# Dimensions after dropping null reviews and adding the cleaned-text column
df_corpus.shape

(29983, 15)

In [19]:
# Preview the first rows, now including "cleaned_reviewText"
df_corpus.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,categoría,sentimiento,tokens,cleaned_text,review_length,cleaned_reviewText
0,A2VGK9S4DKTMF,B002V9X78U,BWallace,"[1, 1]",Let's be clear; I love how much I hate this pr...,1,"Lame, even for a white elephant gift",1386720000,2013-12-11,beauty,0,"['lets', 'clear', 'love', 'much', 'hate', 'pro...",lets clear love much hate product bought white...,121,let clear love much hate product bought white ...
1,A1N2EONG2Y6NUZ,B0001EL5R2,Shana,"[0, 0]","This made my skin so greasy and shiny, in addi...",1,Hello greasy skin!,1373587200,2013-07-12,beauty,0,"['made', 'skin', 'greasy', 'shiny', 'addition'...",made skin greasy shiny addition helping acne d...,23,made skin greasy shiny addition helping acne d...
2,ANWZD7ZYE50UE,B005MZS03C,Gilberto Prieto,"[0, 5]",i receive the perfume today and they are fake ...,1,FAKE!!!,1379894400,2013-09-23,beauty,0,"['receive', 'perfume', 'today', 'fake', 'also'...",receive perfume today fake also bought gucci g...,40,receive perfume today fake also bought gucci g...
3,A20RM3LL5IW5JO,B003UZ4R24,Oulzo,"[2, 7]",I got excited after seeing the multiple videos...,1,"Do not buy, the mask is full of alcohol!!!!!!!",1389398400,2014-01-11,beauty,0,"['got', 'excited', 'seeing', 'multiple', 'vide...",got excited seeing multiple videos youtube hea...,237,got excited seeing multiple video youtube heav...
4,A3E3GD3TABXKU1,B0017TZD7S,Loren w Christensen,"[2, 5]","Maybe I don't get the point with this. Okay, t...",2,don't get its purpose,1215734400,2008-07-11,beauty,0,"['maybe', 'dont', 'get', 'point', 'okay', 'use...",maybe dont get point okay use sex sell get sup...,165,maybe dont get point okay use sex sell get sup...


In [20]:
# Re-check dtypes and non-null counts to confirm no missing values remain
df_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29983 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reviewerID          29983 non-null  object
 1   asin                29983 non-null  object
 2   reviewerName        29983 non-null  object
 3   helpful             29983 non-null  object
 4   reviewText          29983 non-null  object
 5   overall             29983 non-null  int64 
 6   summary             29983 non-null  object
 7   unixReviewTime      29983 non-null  int64 
 8   reviewTime          29983 non-null  object
 9   categoría           29983 non-null  object
 10  sentimiento         29983 non-null  int64 
 11  tokens              29983 non-null  object
 12  cleaned_text        29983 non-null  object
 13  review_length       29983 non-null  int64 
 14  cleaned_reviewText  29983 non-null  object
dtypes: int64(4), object(11)
memory usage: 3.7+ MB


In [21]:
# Write the corpus, now including "cleaned_reviewText", to df_corpus.csv without the index.
# NOTE(review): this is the same path the notebook reads at the start, so the input file
# is overwritten in place — confirm that is intended.
# NOTE(review): rows whose cleaned text is an empty string will presumably be read back as
# NaN by pd.read_csv — verify when loading in the next notebook.
df_corpus.to_csv('df_corpus.csv', index=False)

Todo parece estar listo para pasar al proceso de modelado. Usaremos nuestra columna procesada "cleaned_reviewText" como variable independiente para predecir el sentimiento.