# Pré-processamento dos dados textuais

## Transformando os dados em um dataset

In [33]:
import pandas as pd

In [34]:
# Load the full Goodreads books dataset from the JSON file next to the notebook.
df = pd.read_json('goodreads_books.json')
# Work on a small sample for quick iteration. The fixed seed makes the same
# 10 rows come back on every Restart & Run All, so displayed outputs stay valid.
df_mini = df.sample(n=10, random_state=42)
df_mini

Unnamed: 0,book_id,description,genre,language_code
111193,7036848,Kobato's mission to fill up her magic bottle m...,fantasy_paranormal,eng
44939,23167768,Araminta Ross was born a slave in Delaware in ...,comics_graphic,eng
334762,8245230,"Learning that her mother, whom she had thought...",mystery_thriller_crime,eng
21654,3823823,Meet the World's Funniest Kindergartner --Juni...,children,eng
116885,25756018,"Sharakhai, the great city of the desert, cente...",fantasy_paranormal,eng
323533,15808340,"As thirty-fifth in line for the throne, Lady G...",mystery_thriller_crime,eng
465818,25626525,I'm Harlow Evans. Shortly after my parents wer...,romance,en-US
344283,920431,The Gulf War is imminent and there's something...,mystery_thriller_crime,en-GB
616716,28380088,Lia and Rafe have escaped Venda and the path b...,young_adult,eng
293585,22709633,"Gifted with the magical ability of song, young...",history_biography,eng


## Limpeza dos dados

In [35]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install unidecode



You should consider upgrading via the 'c:\users\vitor\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.





In [36]:
import nltk
import unidecode
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

In [37]:
def remove_specials(tokens):
    """Return a new list with every token passed through ``unidecode.unidecode``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: one converted token per input token, in the same order.
    """
    converted = []
    for token in tokens:
        converted.append(unidecode.unidecode(token))
    return converted

In [38]:
def remove_punctuation(tokens):
    """Strip every character listed in ``string.punctuation`` from each token.

    Tokens made up solely of punctuation become empty strings; they are kept,
    so the result always has the same length as the input.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the tokens with punctuation characters deleted.
    """
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    stripped = []
    for token in tokens:
        stripped.append(token.translate(deletions))
    return stripped

In [39]:
def array_lower(tokens):
    """Return a new list containing the lower-cased form of each token.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: lower-cased tokens, in the original order.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [40]:
def remove_no_words(tokens):
    """Keep only the tokens for which ``str.isalpha()`` is true.

    Empty strings and tokens containing digits, spaces or symbols are dropped.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the purely alphabetic tokens, in the original order.
    """
    alphabetic = []
    for token in tokens:
        if not token.isalpha():
            continue
        alphabetic.append(token)
    return alphabetic

In [41]:
# Cache of the English stop-word set, filled on first use so the corpus is not
# reloaded for every row when this function is used with Series.apply.
_STOP_WORDS_CACHE = {}


def remove_stop_words(tokens, stop_words=None):
    """Drop the tokens that are stop words.

    Args:
        tokens: iterable of string tokens.
        stop_words: optional container of words to remove. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        list: the tokens not present in ``stop_words``, in the original order.
    """
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']
    return [w for w in tokens if w not in stop_words]

In [42]:
def text_clean(df_text_column):
    """Run the cleaning pipeline over a column of raw text.

    Each step is applied element-wise with ``.apply``, in the order listed
    below, and the transformed column is returned.

    Args:
        df_text_column: column whose elements are raw text strings.

    Returns:
        The column after all steps, each element being a list of tokens.
    """
    pipeline = (
        word_tokenize,       # split the text into tokens
        array_lower,         # convert tokens to lower case
        remove_punctuation,  # delete punctuation characters
        remove_specials,     # convert special characters
        remove_no_words,     # drop tokens that are not words
        remove_stop_words,   # drop stop-word tokens
    )

    cleaned = df_text_column
    for step in pipeline:
        cleaned = cleaned.apply(step)
    return cleaned


In [43]:
# Replace the raw description strings with their cleaned token lists.
# NOTE(review): this overwrites the column in place, so re-running this cell
# would pass token lists (not strings) back into text_clean — recreate
# df_mini before running it again.
df_mini['description'] = text_clean(df_mini['description'])
df_mini

Unnamed: 0,book_id,description,genre,language_code
111193,7036848,"[kobato, mission, fill, magic, bottle, might, ...",fantasy_paranormal,eng
44939,23167768,"[araminta, ross, born, slave, delaware, early,...",comics_graphic,eng
334762,8245230,"[learning, mother, thought, long, dead, recent...",mystery_thriller_crime,eng
21654,3823823,"[meet, world, funniest, kindergartner, junie, ...",children,eng
116885,25756018,"[sharakhai, great, city, desert, center, comme...",fantasy_paranormal,eng
323533,15808340,"[thirtyfifth, line, throne, lady, georgiana, r...",mystery_thriller_crime,eng
465818,25626525,"[harlow, evans, shortly, parents, murdered, mo...",romance,en-US
344283,920431,"[gulf, war, imminent, something, mighty, stran...",mystery_thriller_crime,en-GB
616716,28380088,"[lia, rafe, escaped, venda, path, winding, dan...",young_adult,eng
293585,22709633,"[gifted, magical, ability, song, young, lunora...",history_biography,eng
