# Pré-processamento dos dados textuais

## Transformando os dados em um dataset

In [1]:
!pip install pandas
!pip install unidecode
!pip install nltk
!pip install sklearn



In [2]:
import pandas as pd
# Load the full dataset from goodreads_books.json (path is relative to the working directory).
df_base = pd.read_json('goodreads_books.json')
# Work on a 1000-row random sample; random_state=1 fixes the draw so re-runs select the same rows.
df_mini = df_base.sample(n=1000, random_state=1)
# Preview the first rows of the sample.
df_mini.head()

## Limpeza dos dados

In [5]:
import nltk
import unidecode
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

In [12]:
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ looks up the
# 'punkt_tab' resource while older releases look up 'punkt'; fetch both so the
# notebook works with whichever NLTK version was installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/igor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/igor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def remove_specials(tokens):
    """Pass every token through ``unidecode.unidecode`` and return the results.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with one transliterated string per input token, in order.
    """
    transliterated = []
    for token in tokens:
        transliterated.append(unidecode.unidecode(token))
    return transliterated

def remove_punctuation(tokens):
    """Strip every ``string.punctuation`` character from each token.

    Tokens made only of punctuation become empty strings; they are not
    dropped here.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list of the same length with punctuation characters removed.
    """
    punctuation_chars = set(string.punctuation)
    stripped = []
    for token in tokens:
        kept = "".join(ch for ch in token if ch not in punctuation_chars)
        stripped.append(kept)
    return stripped

def array_lower(tokens):
    """Return a new list with every token converted to lower case.

    Args:
        tokens: iterable of string tokens.

    Returns:
        List of lower-cased tokens, in the original order.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def remove_no_words(tokens):
    """Keep only the tokens that consist entirely of alphabetic characters.

    Empty strings, numbers and mixed alphanumeric tokens are discarded
    because ``str.isalpha`` is false for them.

    Args:
        tokens: iterable of string tokens.

    Returns:
        List of the purely alphabetic tokens, in the original order.
    """
    words = []
    for token in tokens:
        if not token.isalpha():
            continue
        words.append(token)
    return words

# Stop-word sets keyed by language, filled lazily on first use so that the
# NLTK corpus is read once instead of once per row when used with .apply().
_STOP_WORDS_CACHE = {}


def remove_stop_words(tokens, language='english'):
    """Drop the tokens that are stop words of ``language``.

    Args:
        tokens: iterable of string tokens (matching is case-sensitive, so
            tokens are expected to be lower-cased already).
        language: name passed to ``stopwords.words``; defaults to
            ``'english'``, the only language the original code supported.

    Returns:
        List of the tokens that are not stop words, in the original order.
    """
    stop_words = _STOP_WORDS_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOP_WORDS_CACHE[language] = stop_words
    return [w for w in tokens if w not in stop_words]

def text_clean(df_text_column, log=False):
    """Run the full cleaning pipeline over a column of texts.

    Each element is transformed, in this order: tokenised with
    ``word_tokenize``, lower-cased, stripped of punctuation, transliterated
    by ``remove_specials``, filtered to alphabetic tokens and finally
    filtered of English stop words.

    Args:
        df_text_column: object exposing ``.apply`` (presumably a pandas
            Series of raw strings - confirm against the caller).
        log: when true, print a progress message after every stage.

    Returns:
        The result of chaining ``.apply`` for every stage; each element is a
        list of cleaned tokens.
    """
    if log:
        print("Starting")

    # (stage function, message printed once the stage is done), in execution order
    stages = (
        (word_tokenize, "Tokens split done"),                     # split text into tokens
        (array_lower, "Upper letters converted to normal"),       # lower-case every token
        (remove_punctuation, "Removed punctiation"),              # strip punctuation
        (remove_specials, "Specials chars removed"),              # remove special characters
        (remove_no_words, "Removed non-words tokens"),            # drop non-word tokens
        (remove_stop_words, "Removed tokens that are stop-words\nFinished"),  # drop stop words
    )

    cleaned = df_text_column
    for stage, done_message in stages:
        cleaned = cleaned.apply(stage)
        if log:
            print(done_message)

    return cleaned


In [13]:
# Auxiliary logging function
def size_df(d):
    """Print the deep memory usage of ``d`` in gigabytes (1 GB = 10**9 bytes).

    Args:
        d: object exposing ``memory_usage(deep=True)`` whose result has a
            ``sum()`` method (e.g. a pandas DataFrame).
    """
    total_bytes = d.memory_usage(deep=True).sum()
    size_in_gb = total_bytes / 10**9
    print("Size=", size_in_gb, "GB")

In [14]:
# The stemming step below expects df_mini['description'] to hold token lists,
# so this cleaning step must actually run (it was previously commented out).
# The raw text is taken from df_base, which is never modified, so re-running
# this cell always cleans the original strings rather than already-tokenised data.
raw_df = df_base.loc[df_mini.index].copy()
df_mini['description'] = text_clean(raw_df['description'])
df_mini.head()

Unnamed: 0,book_id,description,genre,language_code
51285,27833724,"[tang, tu, nigao, bai, shiteshimatsutashan, ti...",comics_graphic,eng
615323,24754076,"[sixteen, yearold, madeline, struggled, epilep...",young_adult,eng
118404,8737174,"[epic, story, thomas, caleintroduced, memorabl...",fantasy_paranormal,eng
574805,15831501,"[father, thought, business, came, first, daugh...",romance,eng
369607,892602,"[last, night, dreamt, went, manderley, novel, ...",mystery_thriller_crime,eng


In [15]:
# df_base.loc[51285, 'description'] # Noisy data

'Tang Tu niGao Bai shiteshimatsutaShan Tian . soreniDui suruuraranoFan Shi haJing kubekimonodatsuta!? uraranoFan Shi woWen itaShan Tian ha, Bi Nu noJi Yi woLi sukotowoJue Yi !! demo, Ji Yi woLi suFang Fa gasatsupariwakannee!! Shan Tian ha, hitomazuXiao Tian Qie noNeng Li woJu u....ga!'

# Stemming

In [16]:
from nltk.stem.porter import PorterStemmer

In [17]:
porter = PorterStemmer()


def stemmized(d):
    """Return the Porter stem of every token in ``d``.

    Args:
        d: iterable of string tokens (one cleaned description).

    Returns:
        List with the stem of each token, in the original order.

    Raises:
        TypeError: if ``d`` is a plain string. Iterating a string would stem
            it character by character, which silently produces garbage when
            the description has not been tokenised yet.
    """
    if isinstance(d, str):
        raise TypeError(
            "stemmized expects a list of tokens, got a raw string; "
            "tokenise the text before stemming"
        )
    return [porter.stem(tk) for tk in d]


df_mini['description'] = df_mini['description'].apply(stemmized)
df_mini.head()

Unnamed: 0,book_id,description,genre,language_code
51285,27833724,"[tang, tu, nigao, bai, shiteshimatsutashan, ti...",comics_graphic,eng
615323,24754076,"[sixteen, yearold, madelin, struggl, epilepsi,...",young_adult,eng
118404,8737174,"[epic, stori, thoma, caleintroduc, memor, left...",fantasy_paranormal,eng
574805,15831501,"[father, thought, busi, came, first, daughter,...",romance,eng
369607,892602,"[last, night, dreamt, went, manderley, novel, ...",mystery_thriller_crime,eng


# TF-IDF


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Using the hand-made preprocessor

In [19]:
def do_nothing(x):
    """Identity function: return ``x`` unchanged.

    Used as both tokenizer and preprocessor so that the vectorizer consumes
    token lists exactly as given.
    """
    return x


# token_pattern=None states explicitly that the default regex is unused when a
# custom tokenizer is supplied, which avoids scikit-learn's unused-parameter warning.
vect_manual = TfidfVectorizer(
    tokenizer=do_nothing,
    preprocessor=do_nothing,
    lowercase=False,
    token_pattern=None,
)

### Using only the class from sklearn

In [34]:
# The default TfidfVectorizer analyzer expects raw strings, but by this point
# df_mini['description'] holds lists of stemmed tokens. Feed the vectorizer the
# untouched text of the same sampled rows, taken from df_base.
raw_descriptions = df_base.loc[df_mini.index, 'description'].tolist()
vect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 1))
freq = vect.fit_transform(raw_descriptions)
# toarray() yields a plain ndarray; todense() returns the legacy np.matrix type.
df_freq = pd.DataFrame(data=freq.toarray(), columns=vect.get_feature_names_out())
df_freq.head()