In [42]:
from get_dataset import dataset
import pandas as pd
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from math import log, sqrt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [2]:
data = dataset()
data

[(' Ele é o único pentacampeão mundial na categoria Laser: ganhou o mundial na Espanha em 1995, na África do Sul em 1996, no Chile em 1997, no México em 2000 e na Irlanda em 2001. Ufa! "Ué, o Scheidt não estudou, não?" Claro que sim! Scheidt soube conciliar o esporte com o estudo e, em 1996, concluiu o curso de administração de empresas pela Universidade Mackenzie, em São Paulo. E os títulos não acabaram! Foi campeão duas vezes nos jogos Pan-Americanos: em 1995 levou a medalha de ouro em Mar del Plata, na Argentina e em 1999 repetiu a dose em Winnipeg, no Canadá. Nas Olimpíadas de Atlanta (1996) levou o ouro e nas Olimpíadas de Sydney (2000) ficou com a prata. Vitórias não faltaram na vida desse campeão, hein? Não podemos dizer que o carnaval seja uma dança, mas sim uma festa. Uma festa em que o país inteiro, pessoas de todas as idades e classes sociais se unem numa só alegria para fazer o quê? Dançar. Por isso não podíamos deixar de falar dessa mania brasileira. O carnaval começou no 

In [3]:
dataset_df = pd.DataFrame(data)
dataset_df.columns = ['texto', 'nivel']
dataset_df.head()

Unnamed: 0,texto,nivel
0,Ele é o único pentacampeão mundial na categor...,1_Ensino_Fundamental_I
1,"Indicativo\nEis as flexões deste modo, em tem...",3_Ensino_Medio
2,"Santos católicos/Biografias\nEm sua vida, for...",3_Ensino_Medio
3,"Em 1950, o papa Pio XII elevou a Igreja de No...",3_Ensino_Medio
4,Logística/Serviço ao cliente/Método dos dois p...,4_Ensino_Superior


In [43]:
def limpar_texto(texto):
    stop_words = stopwords.words('portuguese')
    tokenizer = RegexpTokenizer(r'\b\w\w+\b')
    texto = re.sub(r'([^\s\w]|_)+', ' ', texto.lower())
    tokens = [token for token in tokenizer.tokenize(texto) if token not in stop_words]
    return ' '.join(tokens)

In [44]:
dataset_df['texto_limpo'] = dataset_df['texto'].apply(lambda x: limpar_texto(str(x)))
dataset_df['texto_limpo'].head()

0    único pentacampeão mundial categoria laser gan...
1    indicativo eis flexões deste modo tempos verbo...
2    santos católicos biografias vida relatados mui...
3    1950 papa pio xii elevou igreja senhora auxili...
4    logística serviço cliente método dois pontos m...
Name: texto_limpo, dtype: object

In [45]:
def obter_vocab(corpus):
    vocab = set()
    for texto in corpus:
        vocab = vocab.union(set(texto.split()))
    return sorted(vocab)

In [46]:
c2 = ['It was the best of times', 'it was the worst of times', 'it was the age of wisdom',
      'it was the age of foolishness']
obter_vocab(c2)

['It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst']

In [47]:
obter_vocab(dataset_df['texto_limpo'])

KeyboardInterrupt: 

In [48]:
def bag_of_words(corpus):
    vocab = obter_vocab(corpus)
    bow = []
    for texto in corpus:
        t = set(texto.split())
        v = dict.fromkeys(vocab, 0)
        for palavra in t:
            v[palavra] = texto.split().count(palavra)
        bow.append(v)
    return bow

In [10]:
bow_df = pd.DataFrame(bag_of_words(dataset_df['texto_limpo']))
bow_df.head()

KeyboardInterrupt: 

In [49]:
def euclidean_norm(termo, array):
    norm = sqrt(sum([i**2 for i in array]))
    return termo / norm

In [None]:
euclidean_norm(3, [3, 0, 2.0986])

0.8194115275907136

In [50]:
def tfidf(corpus):
    tfidf_matrix = []
    bow = bag_of_words(corpus)
    vocab = obter_vocab(corpus)
    for texto in corpus:
        t = set(texto.split())
        linha = dict.fromkeys(vocab, 0)
        v = {}
        for palavra in t:
            tf = texto.split().count(palavra)
            n = len(corpus)
            df = sum([1 for linha in bow if linha[palavra] > 0])
            idf = log((n + 1) / (df + 1)) + 1
            tfidf = tf * idf
            v[palavra] = tfidf
        v = {palavra: euclidean_norm(tfidf, v.values()) for (palavra, tfidf) in v.items()}
        linha = linha | v
        tfidf_matrix.append(linha)
    return tfidf_matrix

In [51]:
tfidf_df = pd.DataFrame(tfidf(dataset_df['texto_limpo']))
tfidf_df.head()

Unnamed: 0,00,000,0000,000000,0000000000,000000000000000000000000,0001,00017,00042,001,...,πd,ρ0,ρdh,ρdv,ρdz,ρrt,ρv,ρva,ρψ,ωr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
tfidf_df['ainda'].head()

0    0.000000
1    0.010272
2    0.000000
3    0.000000
4    0.033700
Name: ainda, dtype: float64

In [59]:
tfidf_model = TfidfVectorizer()
tfidf_matrix = tfidf_model.fit_transform(dataset_df['texto_limpo']).todense()
tfidf_2_df = pd.DataFrame(tfidf_matrix)
tfidf_2_df.columns = sorted(tfidf_model.vocabulary_)
tfidf_2_df['ainda'].head()

0    0.000000
1    0.010272
2    0.000000
3    0.000000
4    0.033700
Name: ainda, dtype: float64

In [33]:
def bag_of_words_2(corpus):
    bow_model = CountVectorizer(token_pattern=r'\b\w+\b')
    bow_matrix = bow_model.fit_transform(corpus).todense()
    bow_df = pd.DataFrame(bow_matrix)
    bow_df.columns = sorted(bow_model.vocabulary_)
    return bow_df

In [34]:
print(c2)
bag_of_words_2(c2).to_dict()

['It was the best of times', 'it was the worst of times', 'it was the age of wisdom', 'it was the age of foolishness 0']


{'0': {0: 0, 1: 0, 2: 0, 3: 1},
 'age': {0: 0, 1: 0, 2: 1, 3: 1},
 'best': {0: 1, 1: 0, 2: 0, 3: 0},
 'foolishness': {0: 0, 1: 0, 2: 0, 3: 1},
 'it': {0: 1, 1: 1, 2: 1, 3: 1},
 'of': {0: 1, 1: 1, 2: 1, 3: 1},
 'the': {0: 1, 1: 1, 2: 1, 3: 1},
 'times': {0: 1, 1: 1, 2: 0, 3: 0},
 'was': {0: 1, 1: 1, 2: 1, 3: 1},
 'wisdom': {0: 0, 1: 0, 2: 1, 3: 0},
 'worst': {0: 0, 1: 1, 2: 0, 3: 0}}

In [27]:
bow = bag_of_words_2(dataset_df['texto_limpo']).head()

In [35]:
bow['h']

KeyError: 'h'

In [36]:
def tfidf_2(corpus):
    tfidf_matrix = []
    bow = bag_of_words_2(corpus).to_dict()
    vocab = bow.keys()
    for texto in corpus:
        t = set(texto.lower().split())
        linha = dict.fromkeys(vocab, 0)
        v = {}
        for palavra in t:
            tf = texto.split().count(palavra)
            n = len(corpus)
            df = sum([1 for i in bow[palavra] if bow[palavra][i] > 0])
            idf = log((n + 1) / (df + 1)) + 1
            tfidf = tf * idf
            v[palavra] = tfidf
        v = {palavra: euclidean_norm(tfidf, v.values()) for (palavra, tfidf) in v.items()}
        linha = linha | v
        tfidf_matrix.append(linha)
    return tfidf_matrix

In [37]:
tfidf_2(c2)

[{'0': 0,
  'age': 0,
  'best': 0.6403749295935449,
  'foolishness': 0,
  'it': 0.0,
  'of': 0.3341742038105307,
  'the': 0.3341742038105307,
  'times': 0.5048789499185483,
  'was': 0.3341742038105307,
  'wisdom': 0,
  'worst': 0},
 {'0': 0,
  'age': 0,
  'best': 0,
  'foolishness': 0,
  'it': 0.3169454420370736,
  'of': 0.3169454420370736,
  'the': 0.3169454420370736,
  'times': 0.4788492951654494,
  'was': 0.3169454420370736,
  'wisdom': 0,
  'worst': 0.6073596130854014},
 {'0': 0,
  'age': 0.4788492951654494,
  'best': 0,
  'foolishness': 0,
  'it': 0.3169454420370736,
  'of': 0.3169454420370736,
  'the': 0.3169454420370736,
  'times': 0,
  'was': 0.3169454420370736,
  'wisdom': 0.6073596130854014,
  'worst': 0},
 {'0': 0.5191134919154226,
  'age': 0.40927503962898937,
  'best': 0,
  'foolishness': 0.5191134919154226,
  'it': 0.2708949551760987,
  'of': 0.2708949551760987,
  'the': 0.2708949551760987,
  'times': 0,
  'was': 0.2708949551760987,
  'wisdom': 0,
  'worst': 0}]

In [38]:
tfidf_df = pd.DataFrame(tfidf_2(dataset_df['texto_limpo']))
tfidf_df.head()

Unnamed: 0,0,00,000,0000,000000,0000000000,000000000000000000000000,0001,00017,00042,...,τ,φ,ψ,ω,ωr,ύ,ф,क,म,ष
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
tfidf_df['ainda'].head()

0    0.000000
1    0.010272
2    0.000000
3    0.000000
4    0.033696
Name: ainda, dtype: float64