# Pré-processamento dos dados textuais

## Transformando os dados em um dataset

In [1]:
import pandas as pd

In [2]:
# Load the full Goodreads books dataset from the JSON file next to the notebook.
df = pd.read_json('goodreads_books.json')

# Draw a small sample to iterate on quickly. The seed is fixed so that a
# Restart & Run All reproduces the same ten rows (and the same outputs below).
df_mini = df.sample(n=10, random_state=42)
df_mini

Unnamed: 0,book_id,description,genre,language_code
495332,23213961,Spn AU - Cas/Dean\nDean Winchester never wante...,romance,eng
464587,29083954,She was on the run.\nEmily Baxter has been run...,romance,eng
122225,20907840,"An endless love, for an endless price.\nJack's...",fantasy_paranormal,eng
237286,20739906,I was raised within the community of the New Z...,history_biography,eng
629235,16069167,Time is slipping away....\nTella Holloway is l...,young_adult,eng
311289,18822103,"""...well-done and undoubtably enjoyable..."" - ...",mystery_thriller_crime,eng
58271,18395466,This is the first printed collection of the Wo...,comics_graphic,eng
513536,16109004,"SKYE HIGH by LaVerne Thompson\n""Who knew that ...",romance,eng
627863,17155449,Sixteen-year-old Kara Nightingale's ordinary l...,young_adult,eng
265221,6554376,Weaving travel writing and historical research...,history_biography,en-GB


## Limpeza dos dados

In [3]:
%pip install unidecode



You should consider upgrading via the 'c:\users\vitor\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [4]:
import nltk
import unidecode
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

In [5]:
def remove_specials(tokens):
    """Transliterate each token with ``unidecode.unidecode``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list holding the transliterated tokens, in the same order
        and with the same length as the input.
    """
    transliterated = []
    for token in tokens:
        transliterated.append(unidecode.unidecode(token))
    return transliterated

In [6]:
def remove_punctuation(tokens):
    """Delete every ``string.punctuation`` character from each token.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with one entry per input token. Tokens consisting only
        of punctuation become empty strings; they are not dropped here.
    """
    # Mapping a character to None in a translation table deletes it.
    strip_punct = str.maketrans(dict.fromkeys(string.punctuation))
    return list(map(lambda token: token.translate(strip_punct), tokens))

In [7]:
def array_lower(tokens):
    """Lowercase every token.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with each token converted via its ``lower()`` method.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [8]:
def remove_no_words(tokens):
    """Keep only the tokens for which ``str.isalpha()`` is true.

    Empty strings and tokens containing digits, whitespace or symbols are
    discarded.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the alphabetic tokens, in their original order.
    """
    return list(filter(lambda token: token.isalpha(), tokens))

In [9]:
# Cache for the English stop-word set; filled on first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(tokens):
    """Drop the tokens that are English stop words.

    This function is applied once per row of the text column, so the
    stop-word list is cached instead of being rebuilt from the NLTK corpus
    on every call.

    Args:
        tokens: iterable of string tokens. The comparison is an exact match
            against the entries of NLTK's English stop-word list.

    Returns:
        A new list with the non-stop-word tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [w for w in tokens if w not in stop_words]

In [10]:
def text_clean(df_text_column):
    """Run the full cleaning pipeline over a text column.

    Each step is applied element-wise with ``.apply``, in this order:
    tokenize, lowercase, strip punctuation, transliterate special
    characters, drop non-alphabetic tokens, drop stop words.

    Args:
        df_text_column: object supporting ``.apply`` (e.g. the
            ``description`` column of the DataFrame) whose elements are
            passed to ``word_tokenize``.

    Returns:
        The result of the last ``.apply`` call: one list of cleaned tokens
        per input element.
    """
    # Order matters: every step consumes the token lists produced by the
    # previous one.
    pipeline = (
        word_tokenize,       # split text into tokens
        array_lower,         # convert tokens to lowercase
        remove_punctuation,  # strip punctuation characters
        remove_specials,     # transliterate special characters
        remove_no_words,     # drop tokens that are not words
        remove_stop_words,   # drop stop words
    )

    cleaned = df_text_column
    for step in pipeline:
        cleaned = cleaned.apply(step)

    return cleaned


In [11]:
# Replace the raw description strings with their cleaned token lists.
# NOTE(review): this overwrites the column in place, so the cell is not
# idempotent — re-running it would feed token lists, rather than raw text,
# back into text_clean. Re-run the sampling cell first to restore the raw text.
df_mini['description'] = text_clean(df_mini['description'])
df_mini

Unnamed: 0,book_id,description,genre,language_code
495332,23213961,"[spn, au, casdean, dean, winchester, never, wa...",romance,eng
464587,29083954,"[run, emily, baxter, running, stalker, determi...",romance,eng
122225,20907840,"[endless, love, endless, price, jack, immortal...",fantasy_paranormal,eng
237286,20739906,"[raised, within, community, new, zealand, seac...",history_biography,eng
629235,16069167,"[time, slipping, away, tella, holloway, losing...",young_adult,eng
311289,18822103,"[welldone, undoubtably, enjoyable, kirkus, rev...",mystery_thriller_crime,eng
58271,18395466,"[first, printed, collection, worsted, wear, co...",comics_graphic,eng
513536,16109004,"[skye, high, laverne, thompson, knew, travelin...",romance,eng
627863,17155449,"[sixteenyearold, kara, nightingale, ordinary, ...",young_adult,eng
265221,6554376,"[weaving, travel, writing, historical, researc...",history_biography,en-GB
