# Etapa de preprocesado de texto

In [1]:
# Import of the libraries
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the sampled reviews into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/sampled_data_file.csv')

In [3]:
# Download the stopwords corpus and build the stopword set
nltk.download('stopwords')
# Read the corpus through `nltk.corpus.stopwords` instead of the imported bare
# name: this cell rebinds `stopwords` to a plain set, so calling
# `stopwords.words(...)` would raise AttributeError if the cell is run twice.
# 'not', 'no', 'nor' and 'but' are taken out of the set so that they are kept
# in the text when stopwords are filtered.
stopwords = set(nltk.corpus.stopwords.words('english')) - {'not', 'no', 'nor', 'but'}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Download the Punkt tokenizer resources used by nltk.word_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are fetched to keep the notebook working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Fetch the WordNet database and the 'omw-1.4' package
for corpus_name in ('wordnet', 'omw-1.4'):
  nltk.download(corpus_name)
# Instantiate the lemmatizer used by the preprocessing function
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
# Create the function to preprocess the text

# Translation table that deletes every character of string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
  """Turn a raw text into a list of cleaned, lemmatized tokens.

  Steps, in order: lowercase the text, delete the characters listed in
  ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop the
  tokens contained in the module-level ``stopwords`` collection and
  lemmatize the remaining tokens with the module-level ``lemmatizer``.

  Args:
    text: The raw text as a ``str``. ``None`` and float NaN (how pandas
      represents an empty CSV field) are treated as missing text.

  Returns:
    A list of lemmatized tokens. The list is empty for missing text.

  Raises:
    AttributeError: If ``text`` is any other non-string value (unchanged
      from the previous behaviour).
  """
  # Missing values would otherwise crash on .lower(); `text != text` is only
  # true for NaN.
  if text is None or (isinstance(text, float) and text != text):
    return []
  # get all the text in lowercase letters and remove punctuation
  cleaned = text.lower().translate(_PUNCTUATION_TABLE)
  # tokenize the text
  tokens = nltk.word_tokenize(cleaned)
  # remove stopwords and lemmatize what is left, in a single pass
  return [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]

In [7]:
# Show the raw text of the review stored at index label 0
df.loc[0, 'reviewText']

'This is a pretty good game! My daughter loves all the different unique features that it offers. I gave it four stars because of the acting feature. This was the main reason why we got it since she loves acting. Well they do not offer much variety on this feature. Pretty much all your doing is posing rather than acting! But other than that, it was okay all around.'

In [8]:
# Run the preprocessing on the review at index label 0 as a quick check
frase = df.loc[0, 'reviewText']
preprocess(frase)

['pretty',
 'good',
 'game',
 'daughter',
 'love',
 'different',
 'unique',
 'feature',
 'offer',
 'gave',
 'four',
 'star',
 'acting',
 'feature',
 'main',
 'reason',
 'got',
 'since',
 'love',
 'acting',
 'well',
 'not',
 'offer',
 'much',
 'variety',
 'feature',
 'pretty',
 'much',
 'posing',
 'rather',
 'acting',
 'but',
 'okay',
 'around']

In [9]:
# Replace every review text with its list of preprocessed tokens.
# NOTE: this overwrites the raw text in place, so the cell is meant to be run
# once per kernel session (preprocess expects strings, not token lists).
df['reviewText'] = df['reviewText'].map(preprocess)

In [10]:
# Keep only the 'overall' and 'reviewText' columns and write them to a new
# CSV file, without the index column
df_preprocessed = df.loc[:, ['overall', 'reviewText']]
df_preprocessed.to_csv('final.csv', index=False)