In [1]:
import pandas as pd

In [2]:
# Load the labelled book-review data set from the local data/ folder.
df_train = pd.read_csv('data/book_review_labelled_data.csv')

In [25]:
# Preview the first five rows to sanity-check the load.
df_train.head(5)

Unnamed: 0,reviewerID,reviewerName,reviewText,overall,summary,reviewTime,rates_count,helpful_count,rating
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...",4,"Quite readable, nicely done","12 6, 2001",40,37,4
1,A1XTKTLNSCRLDS,Ellen Rappaport,Detective Inspector Erlendur Sveinsson is at h...,5,Mesmerizing in depth,"02 23, 2014",0,0,5
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,3,10-minute life lessons for kids,"02 12, 2013",3,0,3
3,AEAF4MRYHJZI,"Angelia Menchan ""acvermen.blogspot.com""",Fierce Angels by Sheri Park reads like a disse...,4,So FIERCE,"03 24, 2010",9,9,4
4,A3B7KU72LGWFER,"Grifel ""Tea Time""",Clearly this author had two goals in mind: 1) ...,1,Drivel!,"06 21, 2003",19,13,1


In [4]:
# Inspect the column dtypes pandas inferred for the loaded frame.
df_train.dtypes

reviewerID       object
reviewerName     object
reviewText       object
overall           int64
summary          object
reviewTime       object
rates_count       int64
helpful_count     int64
rating            int64
dtype: object

### Cleaning Data

In [54]:
import nltk
nltk.download('stopwords')

import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin
import string
import re

[nltk_data] Downloading package stopwords to /home/tiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
def apply_tokenizer(data, tokenizer):
    """
    Tokenize each string in `data` and re-join its tokens with single spaces.

    E.g. with a tokenizer that separates punctuation from words (such as
    nltk's WordPunctTokenizer), the input ["This is a test!", "No, it can't be"]
    becomes ["This is a test !", "No , it can ' t be"].

    Args:
    data - iterable of strings containing the text to tokenize
    tokenizer - object exposing a `tokenize(text)` method (e.g. an nltk tokenizer)

    Returns:
    A new list with one space-joined token string per input string.
    """
    tokenized = []
    for text in data:
        tokens = tokenizer.tokenize(text)
        tokenized.append(" ".join(tokens))
    return tokenized

In [34]:
def apply_lowercase(data):
    """
    Lowercase every string in `data`.

    Args:
    data - iterable of strings to be lowercased

    Returns:
    A new list holding the lowercased version of each input string,
    in the same order.
    """
    lowered = []
    for text in data:
        lowered.append(text.lower())
    return lowered

In [35]:
# English stop words from the NLTK stopwords corpus (downloaded in the imports cell).
stopword_list = stopwords.words('english')
def apply_filter_stopwords(data, stopword_list):
    """
    Returns a list of strings, where the strings do not contain any of
        the stopwords in the given list.

    Each string is split on whitespace, tokens that exactly match a stopword
    are dropped (the comparison is case-sensitive), and the remaining tokens
    are re-joined with single spaces.

    Args:
    data - iterable of strings to filter stopwords from
    stopword_list - collection of stopwords (strings) to filter out

    Returns:
    A new list with one filtered string per input string.
    """
    # Build the lookup set once: testing membership against a list would
    # rescan the whole stopword list for every single token.
    stopword_set = set(stopword_list)

    return [
        " ".join(word for word in text.split() if word not in stopword_set)
        for text in data
    ]


In [36]:
def apply_filter_punct(data):
    """
    Strip punctuation characters from every string in `data`.

    A character is dropped when it appears in `string.punctuation`; nothing
    is inserted in its place, so the characters on either side of a removed
    punctuation mark end up adjacent.

    Args:
    data - iterable of strings from which to remove punctuation

    Returns:
    A new list with one punctuation-free string per input string.
    """
    return [
        "".join(char for char in text if char not in string.punctuation)
        for text in data
    ]

In [37]:
def normalize_whitespace(data):
    """
    Trim and collapse whitespace in every string in `data`.

    Leading and trailing whitespace is removed, and every run of consecutive
    whitespace characters is reduced to its first character (so a run that
    starts with a tab keeps that tab rather than becoming a space).

    Args:
    data - iterable of strings to normalize

    Returns:
    A new list with one normalized string per input string.
    """
    # Alternatives: leading run | trailing run | whitespace preceded by whitespace.
    extra_whitespace = re.compile(r"^\s+|\s+$|(?<=\s)\s*")

    normalized = []
    for text in data:
        normalized.append(extra_whitespace.sub("", text))
    return normalized

In [38]:
def apply_stemmer(data, stemmer, tokenizer=None):
    """
    Returns a list of strings, with stemmed data.

    Each string is tokenized, every token is passed through `stemmer.stem`,
    and the stems are re-joined with single spaces.

    Args:
    data - iterable with text to stem
    stemmer - instance of stemmer to use (must expose a `stem(word)` method)
    tokenizer - optional object with a `tokenize(text)` method used to split
        each string into words; defaults to a WordPunctTokenizer

    Returns:
    A new list with one stemmed string per input string.
    """
    # Create the default tokenizer once per call rather than once per document.
    if tokenizer is None:
        tokenizer = WordPunctTokenizer()

    stem = stemmer.stem
    return [
        " ".join(stem(token) for token in tokenizer.tokenize(text))
        for text in data
    ]

In [39]:
class TextCleanerTransformer(TransformerMixin):
    """
    Configurable text-cleaning pipeline built on the helper functions
    defined in this notebook.

    Steps run in this fixed order, each optional one controlled by a
    constructor argument: tokenize -> lowercase -> remove stopwords ->
    remove punctuation -> normalize whitespace -> stem.

    Args:
    tokenizer - object with a `tokenize(text)` method used for the first step
    lower - if truthy, lowercase the text
    remove_punct - if truthy, strip punctuation characters
    stopwords - collection of stopwords to remove; None or empty disables the step
    stemmer - object with a `stem(word)` method; None disables stemming
    """

    def __init__(self, tokenizer, lower=True, remove_punct=True, stopwords=None, stemmer=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.lower = lower
        self.remove_punct = remove_punct
        # A `[]` default would be a single list shared by every instance;
        # give each instance its own empty list instead.
        self.stopwords = [] if stopwords is None else stopwords

    def clean_sentences(self, data):
        """
        Run the configured cleaning steps over `data`.

        Args:
        data - iterable of strings to clean

        Returns:
        A list with one cleaned string per input string.
        """
        # Split each sentence into space-separated tokens
        cleaned = apply_tokenizer(data, self.tokenizer)

        # Lowercase
        if self.lower:
            cleaned = apply_lowercase(cleaned)

        # Remove stopwords (runs after lowercasing, so when `lower` is set
        # the tokens are compared in lowercase form)
        if self.stopwords:
            cleaned = apply_filter_stopwords(cleaned, self.stopwords)

        # Remove punctuation
        if self.remove_punct:
            cleaned = apply_filter_punct(cleaned)

        # Normalize whitespace (removing tokens/punctuation can leave gaps)
        cleaned = normalize_whitespace(cleaned)

        # Stem words
        if self.stemmer:
            cleaned = apply_stemmer(cleaned, self.stemmer)

        return cleaned

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self so that the
        inherited `fit_transform` can chain into `transform`."""
        return self

    def transform(self, X):
        """Clean `X` by delegating to `clean_sentences`."""
        return self.clean_sentences(X)


In [40]:
# Full cleaning configuration: WordPunct tokenization, lowercasing,
# punctuation removal, NLTK English stop words and English Snowball stemming.
text_cleaner = TextCleanerTransformer(
    WordPunctTokenizer(),
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords.words('english'),
    stemmer=SnowballStemmer("english"),
)

In [45]:
# Clean the full review bodies.
# NOTE(review): assumes 'reviewText' has no missing values — a NaN would reach
# the tokenizer as a float; confirm or fill before cleaning.
X_train_pre = text_cleaner.clean_sentences(df_train['reviewText'])

In [52]:
# Clean the review summary column with the same cleaner configuration.
X_train_pre_summ = text_cleaner.clean_sentences(df_train['summary'])

In [51]:
# Show the first two cleaned reviews to spot-check the cleaning output.
print(X_train_pre[:2])

['jenkin histori professor member parliament well author acclaim bio gladston present fine biographi britain greatest 20th centuri figur experi uniqu qualifi describ churchil polit fortun maneuv although american reader may find teen twenti either slow go suffici illumin britain odd polit system wherein politician regular shop around district repres even defeat anoth fair tradit public polit bio psychoanalysi impli churchil much person life expos move along surpris good clip despit 900 plus page jenkin fulli remind us churchil basic earn live writer contract write schedul royalti care record though polit avoc author write clean engag though seem inordin fond unnecessarili unusu word like quot psepholog quot quot rumbusti quot hand wit dri regular evid u hardcov edit farrar straus amp giroux clean halfway point whereupon one begin encount quot feburari quot 436 quot repli hard everi allow quot 553 quot shore quot 706 quot dimay quot 721 quot opposit could chose relax quot 837 8 similar 

In [53]:
# Show the first two cleaned summaries to spot-check the cleaning output.
print(X_train_pre_summ[:2])

['quit readabl nice done', 'mesmer depth']


In [21]:
# Open the current working directory in the file explorer
# (only works where explorer.exe is available, e.g. Windows or WSL).
!explorer.exe .